有条件地将一组列替换为另一组列

时间:2018-04-20 18:08:47

标签: r data.table

我的data.table

# Reproducible example data: six columns, with NAs injected into col1-col3.
set.seed(12345)
# library() stops with an error when data.table is not installed, whereas
# require() only warns and returns FALSE, so the script would fail later on.
library(data.table)
dt <- data.table(
  col1 = rnorm(n = 10, mean = 20, sd = 5),
  col2 = runif(10, 0, 1),
  col3 = rbinom(10, 3, .5),
  col4 = rnorm(n = 10, mean = 20, sd = 5),
  col5 = runif(10, 0, 1),
  col6 = rbinom(10, 3, .5)
)

# Row positions whose value is among a random sample of that column's values.
# Matching is by value: col3 holds only 0-3 across 10 rows, so it contains
# duplicates and more rows than the sample size can be selected there.
nas1 <- which(dt[, col1] %in% sample(dt[, col1], 3))
nas2 <- which(dt[, col2] %in% sample(dt[, col2], 4))
nas3 <- which(dt[, col3] %in% sample(dt[, col3], 2))

# Blank out the selected rows of each column by reference.
dt[nas1, col1 := NA]
dt[nas2, col2 := NA]
dt[nas3, col3 := NA]

我想将col1、col2和col3中的NA分别替换为来自col4、col5和col6的值。

给定

cols.tochange <- c("col1", "col2", "col3")
.sdcols <- c("col4", "col5", "col6")

我知道我可以像下面这样用.sdcols整体替换cols.tochange:

dt[, (cols.tochange) := .SD, .SDcols = .sdcols]

但是我可以使用类似的语法只替换其中的NA吗?

3 个答案:

答案 0 :(得分:4)

这里使用mapply和向量化的ifelse:

# Fill the NA entries of every column named in `cols.tochange` with the value
# from the same row of the positionally matching column named in `.sdcols`.
# Both are the character vectors of column names given in the question.
dt[,
   (cols.tochange) := Map(
     function(x, y) {
       # Direct masked assignment keeps the column's class and attributes,
       # which ifelse() would strip.
       gaps <- is.na(x)
       x[gaps] <- y[gaps]
       x
     },
     .SD,
     # with = FALSE makes `.sdcols` a character vector of column names.
     dt[, .sdcols, with = FALSE]
   ),
   .SDcols = cols.tochange]

print(dt)
#         col1      col2 col3     col4      col5 col6
#  1: 23.89811 0.7915678    2 23.89811 0.7915678    2
#  2: 23.54733 0.2586843    0 27.27893 0.2586843    2
#  3: 19.45348 0.9859838    1 16.77836 0.9859838    1
#  4: 17.73251 0.7074819    2 12.23431 0.7568737    1
#  5: 12.01145 0.6445426    0 12.01145 0.9797782    0
#  6: 10.91022 0.2189478    2 29.02549 0.2189478    2
#  7: 23.15049 0.6985436    2 17.59176 0.9487072    3
#  8: 18.61908 0.5440579    2 23.10190 0.1494579    2
#  9: 23.06062 0.2264672    2 23.06062 0.6003570    2
# 10: 15.40339 0.4845578    1 19.18845 0.9464308    1

答案 1 :(得分:3)

我们可以利用data.table通过引用更新这一事实,因此您实际上甚至不需要使用dt[...]执行此操作。你可以这样做:

# Walk the (target, source) column-name pairs. `:=` updates `dt` by reference,
# so the work happens as a side effect and each call simply yields NULL.
Map(
  function(target, source) {
    gaps <- is.na(dt[[target]])
    dt[, (target) := replace(dt[[target]], gaps, dt[[source]][gaps])]
    NULL
  },
  cols.tochange,
  .sdcols
)
dt
#        col1      col2 col3     col4      col5 col6
# 1: 23.89811 0.7915678    2 23.89811 0.7915678    2
# 2: 23.54733 0.2586843    0 27.27893 0.2586843    2
# 3: 19.45348 0.9859838    1 16.77836 0.9859838    1
# 4: 17.73251 0.7074819    2 12.23431 0.7568737    1
# 5: 12.01145 0.6445426    0 12.01145 0.9797782    0
# 6: 10.91022 0.2189478    2 29.02549 0.2189478    2
# 7: 23.15049 0.6985436    2 17.59176 0.9487072    3
# 8: 18.61908 0.5440579    2 23.10190 0.1494579    2
# 9: 23.06062 0.2264672    2 23.06062 0.6003570    2
#10: 15.40339 0.4845578    1 19.18845 0.9464308    1

或者按照@Frank的建议,使用for循环配合set()。一种写法是:

# For each (target, source) pair, overwrite only the NA rows of the target
# column with the source column's values from those same rows.
for (k in seq_along(.sdcols)) {
  target <- cols.tochange[k]
  gap_rows <- which(is.na(dt[[target]]))
  set(dt, i = gap_rows, j = target, value = dt[[.sdcols[k]]][gap_rows])
}

我希望这会很快

答案 2 :(得分:0)

将dt转换为data.frame,然后使用ifelse语句:

# Work on a plain data.frame copy of the table.
dt <- data.frame(dt)

# Fill the NAs of columns 1-3 from columns 4-6 respectively. Each column pair
# is handled in one vectorised step instead of cell by cell, which also copes
# with a zero-row table (1:length(...) would iterate over c(1, 0) there).
for (k in 1:3) {
  gaps <- is.na(dt[[k]])
  dt[[k]][gaps] <- dt[[k + 3L]][gaps]
}

结果:

> dt
       col1      col2 col3     col4      col5 col6
1  23.89811 0.7915678    2 23.89811 0.7915678    2
2  23.54733 0.2586843    0 27.27893 0.2586843    2
3  19.45348 0.9859838    1 16.77836 0.9859838    1
4  17.73251 0.7074819    2 12.23431 0.7568737    1
5  12.01145 0.6445426    0 12.01145 0.9797782    0
6  10.91022 0.2189478    2 29.02549 0.2189478    2
7  23.15049 0.6985436    2 17.59176 0.9487072    3
8  18.61908 0.5440579    2 23.10190 0.1494579    2
9  23.06062 0.2264672    2 23.06062 0.6003570    2
10 15.40339 0.4845578    1 19.18845 0.9464308    1