我将原始数据框切成5249行的块(没有特殊原因),为了加快处理速度,尝试计算整个500,000多行是花费太长时间。
我创建了一个函数,以便根据已发现的匹配条件计算(缺失数据的插补)$ STUDYID_SUBJID == studyid_subjid,但是结果/返回的数据帧比输入数据帧有更多的数据/行(绝对是我的错误)?
这是我的功能:
specialImpute <- function(inputDF)
{
discoveredDf <- data.frame(STUDYID_SUBJID=character(), stringsAsFactors=FALSE)
dfList <- list()
counter = 1;
Whilecounter = nrow(inputDF)
print(colnames(inputDF))
#for testing just do 10 iterations,i = 10;
while (Whilecounter >0)
{
studyid_subjid=inputDF[Whilecounter,"STUDYID_SUBJID"]
vect = which(discoveredDf$STUDYID_SUBJID == studyid_subjid)
#was discovered and subset before
if (!is.null(vect))
{
#not subset before
if (length(vect)<1)
{
# if the discovered df has STUDYID_SUBJID then do nothing . else subset & add to discovered Df
#vect =which(inputDF$STUDYID_SUBJID == studyid_subjid)
#df <- inputDF[vect, ]
print(paste("studyid_subjid:",studyid_subjid,sep=""))
df <- subset(inputDF, regexpr(studyid_subjid, inputDF$STUDYID_SUBJID) > 0)
#impute using mean for CONTINUOUS variables
df[is.na(df$COVAR_CONTINUOUS_2), "COVAR_CONTINUOUS_2"] = mean(df$COVAR_CONTINUOUS_2, na.rm=TRUE)
df[is.na(df$COVAR_CONTINUOUS_3), "COVAR_CONTINUOUS_3"] = mean(df$COVAR_CONTINUOUS_3, na.rm=TRUE)
df[is.na(df$COVAR_CONTINUOUS_4), "COVAR_CONTINUOUS_4"] = mean(df$COVAR_CONTINUOUS_4, na.rm=TRUE)
df[is.na(df$COVAR_CONTINUOUS_5), "COVAR_CONTINUOUS_5"] = mean(df$COVAR_CONTINUOUS_5, na.rm=TRUE)
df[is.na(df$COVAR_CONTINUOUS_6), "COVAR_CONTINUOUS_6"] = mean(df$COVAR_CONTINUOUS_6, na.rm=TRUE)
df[is.na(df$COVAR_CONTINUOUS_7), "COVAR_CONTINUOUS_7"] = mean(df$COVAR_CONTINUOUS_7, na.rm=TRUE)
df[is.na(df$COVAR_CONTINUOUS_10), "COVAR_CONTINUOUS_10"] = mean(df$COVAR_CONTINUOUS_10, na.rm=TRUE)
df[is.na(df$COVAR_CONTINUOUS_14), "COVAR_CONTINUOUS_14"] = mean(df$COVAR_CONTINUOUS_14, na.rm=TRUE)
df[is.na(df$COVAR_CONTINUOUS_30), "COVAR_CONTINUOUS_30"] = mean(df$COVAR_CONTINUOUS_30, na.rm=TRUE)
#impute using mode ordinal & nominal values
df[is.na(df$COVAR_ORDINAL_1), "COVAR_ORDINAL_1"] = mean(df$COVAR_ORDINAL_1, na.rm=TRUE)
df[is.na(df$COVAR_ORDINAL_2), "COVAR_ORDINAL_2"] = mean(df$COVAR_ORDINAL_2, na.rm=TRUE)
df[is.na(df$COVAR_ORDINAL_3), "COVAR_ORDINAL_3"] = mean(df$COVAR_ORDINAL_3, na.rm=TRUE)
df[is.na(df$COVAR_ORDINAL_4), "COVAR_ORDINAL_4"] = mean(df$COVAR_ORDINAL_4, na.rm=TRUE)
#impute using mode ordinal & nominal values
df[is.na(df$COVAR_NOMINAL_1), "COVAR_NOMINAL_1"] = mean(df$COVAR_NOMINAL_1, na.rm=TRUE)
df[is.na(df$COVAR_NOMINAL_2), "COVAR_NOMINAL_2"] = mean(df$COVAR_NOMINAL_2, na.rm=TRUE)
df[is.na(df$COVAR_NOMINAL_3), "COVAR_NOMINAL_3"] = mean(df$COVAR_NOMINAL_3, na.rm=TRUE)
df[is.na(df$COVAR_NOMINAL_4), "COVAR_NOMINAL_4"] = mean(df$COVAR_NOMINAL_4, na.rm=TRUE)
df[is.na(df$COVAR_NOMINAL_5), "COVAR_NOMINAL_5"] = mean(df$COVAR_NOMINAL_5, na.rm=TRUE)
df[is.na(df$COVAR_NOMINAL_6), "COVAR_NOMINAL_6"] = mean(df$COVAR_NOMINAL_6, na.rm=TRUE)
df[is.na(df$COVAR_NOMINAL_7), "COVAR_NOMINAL_7"] = mean(df$COVAR_NOMINAL_7, na.rm=TRUE)
df[is.na(df$COVAR_NOMINAL_8), "COVAR_NOMINAL_8"] = mean(df$COVAR_NOMINAL_8, na.rm=TRUE)
#logical (T ot F) was transformed T=1 & F=2 , to impute use Mode()
df[is.na(df$COVAR_y1_MISSING), "COVAR_y1_MISSING"] = mean(df$COVAR_y1_MISSING, na.rm=TRUE)
df[is.na(df$COVAR_y2_MISSING), "COVAR_y2_MISSING"] = mean(df$COVAR_y2_MISSING, na.rm=TRUE)
df[is.na(df$COVAR_y3_MISSING), "COVAR_y3_MISSING"] = mean(df$COVAR_y3_MISSING, na.rm=TRUE)
counter = counter +1;
if (counter %% 100 == 0)
{
print(counter)
}
}
}
Whilecounter = Whilecounter -1;
globalTrainingDf <- rbind(globalTrainingDf,df)
}
#end while
#union the data frame
return(globalTrainingDf)
}
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
我回来了:
nrow(training_df_columnn_subset_imputed)
[1] 105892
虽然它应该是5249行???
帮助将不胜感激,谢谢。
答案 0 :(得分:1)
问题在于你的
subset(inputDF, regexpr(studyid_subjid, inputDF$STUDYID_SUBJID) > 0)
语句。 regexpr是贪婪的,它将匹配所有出现的模式,而不仅仅是完全匹配。在没有看到您的数据的情况下,我怀疑存在多个匹配,其中仅预期完全匹配。如下例所示:
list<-c("example1", "example2", "example10")
regexpr("example1", list)>0
#[1] TRUE FALSE TRUE
返回两个TRUE,而不仅仅是预期的第一个TRUE。我建议改写语句,也许使用which
语句:
which("example1"== list)
祝你好运!