Question

假设我有一个包含产品ID的列和一个包含其特征的数据框列表：

捆绑数据框

 bundle
1  284993459
2 1048768805
3  511310430
4 1034630958
5 1235581326

d2 list

[[1]]
    id value
1   35   0.2
2 1462   0.2
3 1109   0.2
4  220   0.2
5  211   0.1

[[2]]
list()

[[3]]
    id name value
1  394        0.5
2 1462        0.5

[[4]]
    id name value
1  926        0.3
2 1462        0.3
3  381        0.3
4  930        0.2

[[5]]
    id name value
1  926        0.5
2 1462        0.5

我需要为每个产品创建包含所有特征ID及其值的列。

bundle =  data.frame(bundle =  c(284993459,1048768805,511310430,1034630958,1235581326))
d2<- list(data.frame(id = c(35,1462,1109,220,211), value = c(0.2, 0.2, 0.2,0.2,0.1)), 
                    data.frame(id = NULL, value = NULL), 
                    data.frame(id = c(394,1462), value = c(0.5,0.5)),
                    data.frame(id = c(926,1462,381,930), value = c(0.3,0.3,0.3,0.2)),
                    data.frame(id = c(926,1462), value = c(0.5,0.5))) 

         bundle    35 1462 1109 220 211 394 1462
    1  284993459   0.2  0.2  0.2 0.2 0.1   0    0
    2 1048768805     0    0    0   0   0   0    0
    3  511310430     0    0    0   0   0 0.5  0.5

无法弄清楚如何做到这一点。有想法取消列出这个数据框列表，但没有好处，因为有超过8000个prodict ID：

for (i in seq(d2))
  assign(paste0("df", i), d2[[i]])

如果我们采用与我不同的方法来连接转置特征数据帧，那么这些值将逐行填充。

Answer 1

这是一个tidyverse解决方案。首先，我们向所有data.frames添加一个捆绑列，然后使用purr::map2_dfr将它们拼接在一起，然后使用tidyr::spread格式化为宽。

library(tidyverse)
res <- map2_dfr(bundle$bundle,d2,~mutate(.y,bundle=.x)) %>%
  spread(id,value,)
res[is.na(res)] <- 0
#       bundle  35 211 220 381 394 926 930 1109 1462
# 1  284993459 0.2 0.1 0.2 0.0 0.0 0.0 0.0  0.2  0.2
# 2  511310430 0.0 0.0 0.0 0.0 0.5 0.0 0.0  0.0  0.5
# 3 1034630958 0.0 0.0 0.0 0.3 0.0 0.3 0.2  0.0  0.3
# 4 1235581326 0.0 0.0 0.0 0.0 0.0 0.5 0.0  0.0  0.5

Answer 2

您可以先将捆绑包添加到列表中的每个data.frame，然后使用reshape2::dcast或data.table::dcast将其透视，然后再将NAs更新为0

ans <- data.table::dcast(
        do.call(rbind, Map(function(nm, DF) within(DF, bundle <- nm), bundle$bundle, d2)),
    bundle ~ id)
ans[is.na(ans)] <- 0
ans

#      bundle  35 211 220 381 394 926 930 1109 1462
#1  284993459 0.2 0.1 0.2 0.0 0.0 0.0 0.0  0.2  0.2
#2  511310430 0.0 0.0 0.0 0.0 0.5 0.0 0.0  0.0  0.5
#3 1034630958 0.0 0.0 0.0 0.3 0.0 0.3 0.2  0.0  0.3
#4 1235581326 0.0 0.0 0.0 0.0 0.0 0.5 0.0  0.0  0.5

编辑：在OP的评论后添加更多解释

1）function(nm, DF) within(DF, bundle <- nm)获取输入data.frame DF并添加一个名为bundle的新列，其值等于nm。

2）Map将函数应用于给定向量的相应元素。（请参阅?Map）这意味着Map使用每个bundle值应用上述函数，并将它们添加到d2

中的每个data.frame

Answer 3

另一种方法可能是

library(data.table)
library(tidyverse)

df <- rbindlist(
  lapply(lapply(d2, function(x) if(nrow(x)==0) data.frame(id=NA, value=NA) else x),  #in case there is no dataframe row in a list assign a blank dataframe
         function(y) y %>% spread(id, value)), #convert all dataframes in wide format
  fill = T) %>%                                #rbind all dataframe in a single dataframe
  select(-`<NA>`) %>%
  cbind.data.frame(bundle = bundle$bundle)

输出为：

    35 211 220 1109 1462 394 381 926 930     bundle
1: 0.2 0.1 0.2  0.2  0.2  NA  NA  NA  NA  284993459
2:  NA  NA  NA   NA   NA  NA  NA  NA  NA 1048768805
3:  NA  NA  NA   NA  0.5 0.5  NA  NA  NA  511310430
4:  NA  NA  NA   NA  0.3  NA 0.3 0.3 0.2 1034630958
5:  NA  NA  NA   NA  0.5  NA  NA 0.5  NA 1235581326

示例数据：

bundle <-  data.frame(bundle =  c(284993459,1048768805,511310430,1034630958,1235581326))
d2 <- list(data.frame(id = c(35,1462,1109,220,211), value = c(0.2, 0.2, 0.2,0.2,0.1)), 
           data.frame(id = NULL, value = NULL), 
           data.frame(id = c(394,1462), value = c(0.5,0.5)),
           data.frame(id = c(926,1462,381,930), value = c(0.3,0.3,0.3,0.2)),
           data.frame(id = c(926,1462), value = c(0.5,0.5)))

Answer 4

有两种可能的方法仅在操作顺序上有所不同：

将列表中的所有数据帧分别从长格式和rbind()匹配列重新整形。
rbind()以长格式显示所有数据框，然后重新整理为宽格式。

这两种方法都需要以某种方式包含bundle。

为了完整起见，以下是使用data.table的第二种方法的不同实现。

library(data.table)
library(magrittr)
d2 %>% 
  # bind row-wise into large data.table, create id column
  rbindlist(idcol = "bid") %>% 
  # right join to append bundle column
  setDT(bundle)[, bid := .I][., on = "bid"] %>%
  # reshape from long to wide format
  dcast(., bundle ~ id, fill = 0)

       bundle  35 211 220 381 394 926 930 1109 1462
1:  284993459 0.2 0.1 0.2 0.0 0.0 0.0 0.0  0.2  0.2
2:  511310430 0.0 0.0 0.0 0.0 0.5 0.0 0.0  0.0  0.5
3: 1034630958 0.0 0.0 0.0 0.3 0.0 0.3 0.2  0.0  0.3
4: 1235581326 0.0 0.0 0.0 0.0 0.0 0.5 0.0  0.0  0.5

这里，管道仅用于可视化函数调用的顺序。随着data.table链接，语句变得更加简洁：

library(data.table) # library(magrittr) not required
setDT(bundle)[, bid := .I][
  rbindlist(d2, id = "bid"), on = "bid"][, dcast(.SD, bundle ~ id, fill = 0)]

或

library(data.table) # library(magrittr) not required
dcast(setDT(bundle)[, bid := .I][
  rbindlist(d2, id = "bid"), on = "bid"], bundle ~ id, fill = 0)

另一个变体是在调用rbindlist()之前重命名列表元素，这将使用名称来创建idcol：

library(data.table)
library(magrittr)
d2 %>% 
  # rename list elements
  setNames(bundle$bundle) %>%
  # bind row-wise into large data.table, create id column from element names
  rbindlist(idcol = "bundle") %>% 
  # convert bundle from character to factor to maintain original order
  .[, bundle := forcats::fct_inorder(bundle)] %>%
  # reshape from long to wide format
  dcast(., bundle ~ id, fill = 0)

       bundle  35 211 220 381 394 926 930 1109 1462
1:  284993459 0.2 0.1 0.2 0.0 0.0 0.0 0.0  0.2  0.2
2:  511310430 0.0 0.0 0.0 0.0 0.5 0.0 0.0  0.0  0.5
3: 1034630958 0.0 0.0 0.0 0.3 0.0 0.3 0.2  0.0  0.3
4: 1235581326 0.0 0.0 0.0 0.0 0.0 0.5 0.0  0.0  0.5

请注意，到目前为止提供的变体已跳过属于bundle 1048768805的空数据框（同样是Moody_Mudskipper和chinsoon12的答案）。

为了将空数据帧保留在最终结果中，必须更改连接的顺序，以便保留bundle的所有行：

library(data.table)
dcast(
  rbindlist(d2, id = "bid")[setDT(bundle)[, bid := .I], on = "bid"], 
  bundle ~ id, fill = 0
  )[, "NA" := NULL][]

       bundle  35 211 220 381 394 926 930 1109 1462
1:  284993459 0.2 0.1 0.2 0.0 0.0 0.0 0.0  0.2  0.2
2:  511310430 0.0 0.0 0.0 0.0 0.5 0.0 0.0  0.0  0.5
3: 1034630958 0.0 0.0 0.0 0.3 0.0 0.3 0.2  0.0  0.3
4: 1048768805 0.0 0.0 0.0 0.0 0.0 0.0 0.0  0.0  0.0
5: 1235581326 0.0 0.0 0.0 0.0 0.0 0.5 0.0  0.0  0.5

或者，如果要保持bundle的确切顺序：

library(data.table)
dcast(
  rbindlist(d2, id = "bid")[setDT(bundle)[, bid := .I], on = "bid"], 
  bid + bundle ~ id, fill = 0
)[, c("bid", "NA") := NULL][]

       bundle  35 211 220 381 394 926 930 1109 1462
1:  284993459 0.2 0.1 0.2 0.0 0.0 0.0 0.0  0.2  0.2
2: 1048768805 0.0 0.0 0.0 0.0 0.0 0.0 0.0  0.0  0.0
3:  511310430 0.0 0.0 0.0 0.0 0.5 0.0 0.0  0.0  0.5
4: 1034630958 0.0 0.0 0.0 0.3 0.0 0.3 0.2  0.0  0.3
5: 1235581326 0.0 0.0 0.0 0.0 0.0 0.5 0.0  0.0  0.5

从数据帧列表中创建一个命名表

捆绑数据框

d2 list

4 个答案: