# Count, for each surgery, the visits by the same patient that fall strictly
# after the surgery date and within one year of it.
# The original double loop was O(nrow(surgeries_7) * nrow(visits_1)) with a
# scalar ifelse() per (i, j) pair; replacing the inner j loop with one
# vectorized comparison over all visits makes it a single sum() per surgery.
post_op_counts <- vapply(seq_len(nrow(surgeries_7)), function(i) {
  sum(visits_1$PatientProfileId == surgeries_7$PatientProfileId[i] &
        visits_1$visit_date > surgeries_7$surgery_date[i] &
        visits_1$visit_date <= surgeries_7$one_year_from_surgery[i])
}, numeric(1))
# The original accumulated into a pre-existing post_op_visits column;
# preserve that behavior by adding the new counts onto it.
surgeries_7$post_op_visits <- surgeries_7$post_op_visits + post_op_counts
有两个表:Surgery_7是一个表:它有两列PatientProfileId(unique),每个对应的配置文件ID都有手术日期。
第二个表是“访问量”表,其中有针对不同访问的配置文件ID(存在多个具有相同配置文件ID的条目)。
我们正在尝试计算每个配置文件ID在手术日期(存在于surgeries_7表中)之后、但在手术日期后一年之内的就诊次数。
问题是代码花费的时间太长,无法运行约6k行。有什么方法可以使循环更快?
答案 0 :(得分:0)
我同意Jonathan V. Solórzano的观点,请尝试使用dplyr
包中的函数。
这是您脚本的一些改进。
# Use data.table for lower memory overhead on large tables.
library(data.table)
surgeries_7 <- data.table(surgeries_7)
visits_1 <- data.table(visits_1)
# Pre-allocate the output vector once instead of rewriting a data.frame
# column inside the loop.
post_op_visits <- numeric(nrow(surgeries_7))
for (i in seq_len(nrow(surgeries_7))) {
  # Vectorized comparison over all visits replaces the inner j loop; sum()
  # of the logical mask is the number of qualifying visits. (The original
  # inner loop overwrote post_op_visits[i] with the stale column value plus
  # the current 0/1 flag each iteration, so it never actually accumulated
  # a count.)
  post_op_visits[i] <- sum(
    visits_1$PatientProfileId == surgeries_7$PatientProfileId[i] &
      visits_1$visit_date > surgeries_7$surgery_date[i] &
      visits_1$visit_date <= surgeries_7$one_year_from_surgery[i]
  )
}
# Assign the result as a column once, outside the loop.
surgeries_7$post_op_visits <- post_op_visits
如果您有多核计算机,也可以尝试使用foreach
+ doParallel
进行并行处理嵌套循环
# Parallel variant using foreach + doParallel on a multi-core machine.
library(data.table)
surgeries_7 <- data.table(surgeries_7)
visits_1 <- data.table(visits_1)
library(foreach)
library(doParallel)
cl <- parallel::makeCluster(4)  # adjust to the number of available cores
doParallel::registerDoParallel(cl)
# One parallel task per surgery row; each task returns a single count, so the
# results combine with 'c' into one numeric vector. (The original nested
# foreach returned a length-nrow(visits_1) vector per surgery, rbind-combined
# them into a matrix, and read the stale post_op_visits column instead of
# counting, so its output was not a per-surgery visit count.)
post_op_visits <- foreach(i = seq_len(nrow(surgeries_7)),
                          .combine = "c") %dopar% {
  sum(visits_1$PatientProfileId == surgeries_7$PatientProfileId[i] &
        visits_1$visit_date > surgeries_7$surgery_date[i] &
        visits_1$visit_date <= surgeries_7$one_year_from_surgery[i])
}
# Assign output outside the loop, then release the worker processes.
surgeries_7$post_op_visits <- post_op_visits
parallel::stopCluster(cl)
最良好的祝愿-艾哈迈德·阿伦迪
答案 1 :(得分:0)
请考虑使用基于整表的集合式处理(特别是merge
,subset
和aggregate
)来避免循环处理。以下做法假设每位患者一年内的手术不超过一次;否则就诊次数可能会被重复统计。
# Set-based approach: merge + subset + aggregate instead of loops.
# MERGE: pair every surgery with all visits by the same patient.
merged_df <- merge(surgeries_7, visits_1, by = "PatientProfileId")
# SUBSET: keep visits strictly after the surgery and within one year of it.
sub_df <- subset(merged_df, visit_date > surgery_date &
                   visit_date <= one_year_from_surgery)
# AGGREGATE BY PATIENT AND SURGERY: count qualifying visits per surgery row.
# (To count across ALL of a patient's surgeries instead, drop surgery_date
# from the formula. The original script ran both aggregates back to back, so
# the first result was dead code, immediately overwritten.)
agg_df <- aggregate(cbind(post_op_visits = visit_date) ~
                      PatientProfileId + surgery_date,
                    sub_df, FUN = length)
# Merge the counts back onto the original table. all.x = TRUE keeps surgeries
# with zero qualifying visits (their NA counts become 0 below). The original
# final line misspelled the table name twice ('survery7' / 'surgery7').
surgeries_7 <- merge(surgeries_7, agg_df,
                     by = c("PatientProfileId", "surgery_date"), all.x = TRUE)
surgeries_7$post_op_visits[is.na(surgeries_7$post_op_visits)] <- 0
答案 2 :(得分:0)
在data.table
程序包中使用非等价联接的选项:
# Compute the date one year after each surgery; seq(..., by = "1 year")
# handles month/leap-year arithmetic that a plain +365 would get wrong.
surgery_7[, oneyr := as.IDate(sapply(surgery_date, function(x)
seq(x, by="1 year", length.out=2L)[2L]))]
# Update surgery_7 by reference (no copy) with the per-surgery visit counts.
surgery_7[, post_op_visits :=
# Non-equi join: for each surgery row (.SD), match rows of visits_1 with the
# same patient whose visit_date lies in [surgery_date, oneyr].
# NOTE(review): the question asks for visits strictly AFTER the surgery date
# (visit_date > surgery_date); this join uses >= — confirm which is intended.
visits_1[.SD, on=.(PatientProfileId, visit_date>=surgery_date, visit_date<=oneyr),
# by=.EACHI evaluates .N (matched-row count) separately for each surgery row.
by=.EACHI, .N]$N]
输出surgery_7
:
PatientProfileId surgery_date oneyr post_op_visits
1: 1 2018-01-01 2019-01-01 2
2: 2 2019-01-01 2020-01-01 1
数据:
library(data.table)
# Reproducible example data.
# surgery_7: exactly one row per patient, holding the surgery date.
surgery_7 <- data.table(
  PatientProfileId = c(1, 2),
  surgery_date = as.IDate(c("2018-01-01", "2019-01-01"))
)
#    PatientProfileId surgery_date
# 1:                1   2018-01-01
# 2:                2   2019-01-01

# visits_1: several visit rows per patient (duplicate PatientProfileId values).
visit_days <- c("2018-03-15", "2018-09-15", "2019-02-03",
                "2019-06-30", "2020-01-15")
visits_1 <- data.table(
  PatientProfileId = c(1, 1, 1, 2, 2),
  visit_date = as.IDate(visit_days)
)
#    PatientProfileId visit_date
# 1:                1 2018-03-15
# 2:                1 2018-09-15
# 3:                1 2019-02-03
# 4:                2 2019-06-30
# 5:                2 2020-01-15