在多核上应用性能

时间:2018-07-23 09:14:17

标签: r parallel-processing mapply mclapply

我有一个要在大约300万个数据点上运行的函数。我正在尝试在具有8个内核的 Ubuntu 计算机上使用mcmapply并行化该函数。该函数接收长度为300万的list以及长度为300万的另外3个向量和1个常数cutoffyearmon。

该代码在2分钟内在单个内核上可以完美处理100000行数据,并且不会引发任何错误。但是,当我尝试使用mcmapply在计算机的6个内核上并行运行代码时,它将继续运行5个小时以上。

更新:这是我的函数调用的精简版本。我为1个月,2个月和3个月的持续时间创建了9个变量。我只将时间设为6个月和1年。

我正在使用以下函数调用:

# Per-observation feature extraction, dispatched in parallel with mcmapply()
# over 6 forked workers. For each element i the closure receives:
#   abcstrnew    - a packed string (lst[[i]]); split below into 3-char chunks,
#                  so each chunk presumably encodes one monthly value - TODO confirm
#   sd           - the observation's start date (Date)
#   naflag       - TRUE when the packed string is missing
#   empflag      - TRUE when the packed string is empty
#   daysdiff     - days between the cutoff date and sd (difftime)
#   cutoffyearmon- constant zoo::yearmon cutoff, recycled across all elements
# Returns a 10-element named list of summary statistics per observation.
# NOTE(review): the closure is heavy and returns a list per element, so fork /
# serialization overhead may dominate on 3M elements - presumably why 6 cores
# show no speed-up; consider chunking the input rather than per-element dispatch.
abc_xx_last_xxx_days=mcmapply(function(abcstrnew,sd,naflag,empflag,daysdiff,cutoffyearmon){
# Step 1: unpack the string into 3-character slices (one per period) when it is
# neither missing nor empty; "" when flagged empty; NA otherwise.
# NOTE(review): if empflag is NA while naflag is FALSE, (!empflag) is NA and
# this if() errors - confirm inputs guarantee empflag is non-NA when naflag is FALSE.
abcstrnew=if((!naflag) & (!empflag)){
    substring(text = abcstrnew,first = seq(from = 1,to = (nchar(abcstrnew)-2),by = 3),last = seq(from = 3,to = (nchar(abcstrnew)),by = 3))
}else{
    if(!is.na(empflag) & empflag){
        ""
    }else{
        NA_character_
    }
}

# Step 2: coerce the slices to numeric; empty input maps to 0, missing to NA.
abcstrnew=if((!naflag) & (!empflag)){
    as.numeric(abcstrnew)
}else{
    if(!is.na(empflag) & empflag){
        as.numeric(0)
    }else{
        NA_real_
    }
}
# No usable day difference: every output statistic is NA.
if(is.na(daysdiff)){
  return(list(worst_abc_ever=NA_real_,
              times_abc=NA_real_,
              times_abc_last_180_days=NA_real_,
              times_abc_last_365_days=NA_real_,
              times_abc30_last_365_days=NA_real_,
              times_abc30_last_180_days=NA_real_,
              times_abc60_last_365_days=NA_real_,
              times_abc60_last_180_days=NA_real_,
              abc_last_180_days=NA_real_,
              abc_last_365_days=NA_real_
  ))
}else{
  if((!naflag)&(!empflag)){
    # Whole-history statistics over all unpacked values.
    abcstrlen=length(abcstrnew)
    worst_abc_ever=max(abcstrnew)
    times_abc=as.numeric(length(which(abcstrnew>0)))

    # 365-day window: zero everything when sd is already older than a year.
    if(daysdiff>365){
      abc_last_365_days=as.numeric(0)
      times_abc30_last_365_days=as.numeric(0)
      times_abc60_last_365_days=as.numeric(0)
      times_abc_last_365_days=as.numeric(0)
    }else{
      # Approximate months remaining inside the 12-month window.
      # NOTE(review): difftime() on two yearmon objects with a /30 rescale is an
      # approximation - verify the units this actually yields before trusting it.
      abcmonthstwelve=12-round(round(difftime(time1 = cutoffyearmon,time2 = as.yearmon(sd)))/30)

      # Each pair below caps the window at the available history length
      # (equivalent to indexing 1:min(abcstrlen, abcmonthstwelve)).
      if(abcstrlen>=abcmonthstwelve){
        abc_last_365_days=(max(abcstrnew[1:abcmonthstwelve]))
      }else{
        abc_last_365_days=(max(abcstrnew[1:abcstrlen]))
      }


      if(abcstrlen>=abcmonthstwelve){
        times_abc30_last_365_days=as.numeric(length(which(abcstrnew[1:abcmonthstwelve]>=30)))
      }else{
        times_abc30_last_365_days=as.numeric(length(which(abcstrnew[1:abcstrlen]>=30)))
      }


      if(abcstrlen>=abcmonthstwelve){
        times_abc60_last_365_days=as.numeric(length(which(abcstrnew[1:abcmonthstwelve]>=60)))
      }else{
        times_abc60_last_365_days=as.numeric(length(which(abcstrnew[1:abcstrlen]>=60)))
      }


      if(abcstrlen>=abcmonthstwelve){
        times_abc_last_365_days=as.numeric(length(which(abcstrnew[1:abcmonthstwelve]>0)))
      }else{
        times_abc_last_365_days=as.numeric(length(which(abcstrnew[1:abcstrlen]>0)))
      }
    }


    # 180-day window: same structure as above with a 6-month horizon.
    if(daysdiff>180){
      abc_last_180_days=as.numeric(0)
      times_abc30_last_180_days=as.numeric(0)
      times_abc60_last_180_days=as.numeric(0)
      times_abc_last_180_days=as.numeric(0)
    }else{
      abcmonthssix=6-round(round(difftime(time1 = cutoffyearmon,time2 = as.yearmon(sd)))/30)

      if(abcstrlen>=abcmonthssix){
        abc_last_180_days=(max(abcstrnew[1:abcmonthssix]))
      }else{
        abc_last_180_days=(max(abcstrnew[1:abcstrlen]))
      }


      if(abcstrlen>=abcmonthssix){
        times_abc30_last_180_days=as.numeric(length(which(abcstrnew[1:abcmonthssix]>=30)))
      }else{
        times_abc30_last_180_days=as.numeric(length(which(abcstrnew[1:abcstrlen]>=30)))
      }


      if(abcstrlen>=abcmonthssix){
        times_abc60_last_180_days=as.numeric(length(which(abcstrnew[1:abcmonthssix]>=60)))
      }else{
        times_abc60_last_180_days=as.numeric(length(which(abcstrnew[1:abcstrlen]>=60)))
      }


      if(abcstrlen>=abcmonthssix){
        times_abc_last_180_days=as.numeric(length(which(abcstrnew[1:abcmonthssix]>0)))
      }else{
        times_abc_last_180_days=as.numeric(length(which(abcstrnew[1:abcstrlen]>0)))
      }

    }

    return(list(worst_abc_ever=worst_abc_ever,
                times_abc=times_abc,
                times_abc_last_180_days=times_abc_last_180_days,
                times_abc_last_365_days=times_abc_last_365_days,
                times_abc30_last_365_days=times_abc30_last_365_days,
                times_abc30_last_180_days=times_abc30_last_180_days,
                times_abc60_last_365_days=times_abc60_last_365_days,
                times_abc60_last_180_days=times_abc60_last_180_days,
                abc_last_180_days=abc_last_180_days,
                abc_last_365_days=abc_last_365_days
    ))
  }else{
    # Flagged missing or empty: all statistics are NA.
    return(list(worst_abc_ever=NA_real_,
                times_abc=NA_real_,
                times_abc_last_180_days=NA_real_,
                times_abc_last_365_days=NA_real_,
                times_abc30_last_365_days=NA_real_,
                times_abc30_last_180_days=NA_real_,
                times_abc60_last_365_days=NA_real_,
                times_abc60_last_180_days=NA_real_,
                abc_last_180_days=NA_real_,
                abc_last_365_days=NA_real_
    ))
  }
}
# Parallel dispatch: mc.preschedule=TRUE pre-splits the 3M elements into 6
# chunks, one per forked worker.
},lst,sd,naflag,empflag,daysdiff,cutoffyearmon,mc.cores=6, mc.preschedule=TRUE, mc.cleanup=TRUE)

您可以使用以下输入集来运行该函数并检查其输出。

# Minimal reproducible inputs: two packed value strings plus their flags,
# start dates, and the day gap to the 2017-06-30 cutoff.
# (Note: `sd` shadows stats::sd here, as in the original example.)
lst <- list("000050000032", "000000340000000000000")
sd <- c(as.Date("2017-05-22"), as.Date("2017-04-23"))
naflag <- c(FALSE, FALSE)
empflag <- c(FALSE, FALSE)
daysdiff <- difftime(time1 = as.Date("2017-06-30"), time2 = sd)
cutoffyearmon <- as.yearmon("2017-06-30")

我假设代码将通过设置mc.preschedule=TRUE在6个内核之间几乎均等地划分数据。但是我没有看到处理速度方面有任何显著的性能提升。我预计在该计算机的6个内核上运行时,处理将在1.5个小时左右完成。

如果我遗漏了任何东西,欢迎提出任何建议。

将pbmcmapply与mc.cores=6一起使用时,显示的预计完成时间(ETA)为06:01:32:57。

0 个答案:

没有答案