找到缺少的条目 - 通过消除嵌套循环来提高效率

时间:2017-12-08 01:38:20

标签: r data.table

我想消除用于在函数内部m中创建数据的嵌套循环。 missing_entries()函数正在尝试识别并返回col1中的组col2丢失的值。有没有其他方法可以解决这个问题,以提高效率?

missing_entries <- function( data, x, y )
{
  # find missing entries in x for the group in y
  # by comparing other groups in y
  require( 'data.table' )

  # require class of data to be data.table
  stopifnot( "data.table" %in% class(data) ) 

  # the outer loop with i refers to each unique value of column y
  # the inner loop with j refers to all unique values of column y
  # except the value in i under current iteration
  uniq_col2 <- unique( data[, get(y) ] )
  m <- lapply( uniq_col2, function(i){
    lapply( setdiff( uniq_col2, i ), function( j ) {
      z <- setdiff( data[ get(y) == i, get(x)], data[ get(y) == j, get(x)])
      if( length(z) > 0 ){
        return( data.frame( v1 = z, v2 = j,
                            stringsAsFactors = FALSE ) )
      } else{
        return()
      }
    } )
  })

  # row bind
  m <- lapply( m, function(k) {
    if(!is.null(k)) {
      rbindlist(l = k)
    }})

  # collect only not null data
  m <- rbindlist( l = m[lapply(m, nrow) > 0] )
  colnames( m ) <- c( x, y )

  return( m )
}

# testing
test_data <- structure(list(cardnty = c("many", "many", "many", "many", "many", "many",
                                        "many", "many", "many", "many", "many", "many",
                                        "many", "many", "many", "many", "many", "many",
                                        "many", "many", "many", "many", "many", "many"), 
                            col1 = c(2L, 4L, 3L, 13L, 5L, 6L, 7L, 17L, 9L, 4L, 3L, 
                                           2L, 8L, 5L, 6L, 7L, 14L, 17L, 19L, 13L, 
                                           9L, 12L, 11L, 20L), 
                            N = c(599L, 43L, 111L, 12L, 11L, 5L, 4L, 
                                  8L, 2L, 72L, 230L, 617L, 13L, 58L, 19L, 9L, 
                                  5L, 3L, 2L, 1L, 11L, 1L, 1L, 1L), 
                            col2 = c("cat", "cat", "cat", "cat", "cat", 
                                   "cat", "cat", "cat", "cat", "dog",  
                                   "dog", "dog", "dog", 
                                   "dog", "dog", "dog", 
                                   "dog", "dog", "dog", 
                                   "dog", "dog", "dog",
                                   "dog", "dog" )), 
                       .Names = c("cardnty", "col1", "N", "col2"), 
                       row.names = c(NA, -24L),
                       class = "data.frame")

require('data.table')
setDT(test_data)
results <- missing_entries(data = test_data, x = "col1", y = "col2")
setDF(results)

test_results <- structure(list(col1 = c(8L, 14L, 19L, 12L, 11L, 20L), 
                               col2 = c("cat", "cat", "cat", "cat", "cat", "cat")), 
                          .Names = c("col1", "col2"),
                          row.names = c(NA, -6L), 
                          class = "data.frame")

identical( results, test_results)
# TRUE

2 个答案:

答案 0 :(得分:1)

这里有一个建议:使用dcast将数据帧转换为宽,将0填入缺失值,然后转换回long并拉出零。

library(reshape2)
df <- dcast(test_data,col1 ~ col2 ,fill=0,value.var="col2",fun.aggregate=length)
df2 <- melt(df,id.vars="col1")
results <- df2[which(df2$value==0),c("col1","variable")]

时间比较表明这会更快一些。

start_time <- Sys.time()
for (x in c(1:10000)){
  results <- missing_entries(data = test_data, x = "col1", y = "col2")
  setDF(results)
}
end_time <- Sys.time()
timeA <- end_time-start_time
# Time difference of 1.725317 mins


start_time <- Sys.time()
for (x in c(1:10000)){
df <- dcast(test_data,col1 ~ col2 ,fill=0,value.var="col2",fun.aggregate=length)
df2 <- melt(df,id.vars="col1")
results <- df2[which(df2$value==0),c("col1","variable")]
}
end_time <- Sys.time()
timeB <- end_time-start_time
# Time difference of 1.368845 mins

答案 1 :(得分:1)

据我所知,OP正在寻找col1col2test_data的缺失组合。

我们可以使用col1&#39; s col2交叉加入)或{获得data.tableCJ()的所有独特组合从基础R {1}}然后我们可以使用反连接找到缺少的元素,它会删除已存在的组合。

expand.grid()
library(data.table)
setDT(test_data)[, CJ(col1 = col1, col2 = col2, unique = TRUE)][
  !test_data, on = .(col1, col2)]