使用dplyr汇总data.frame中的倍数变化

时间:2018-07-07 20:50:47

标签: r dataframe dplyr

我有一个data.frame,其值是从10个group(例如细胞类型)中测得的100个id(例如基因),其中每个group都来自10个family(例如组织),每个id-group-family组合中每个有3个样本,即总共30000行:

# Simulated measurements: 100 ids x 10 groups x 10 families with 3 samples per
# id/group/family cell (30000 rows in total). Seeded for reproducibility.
set.seed(1)
df <- data.frame(
  id = rep(paste0("i", 1:100), times = 300),
  group = rep(rep(paste0("g", 1:10), each = 100), times = 30),
  family = rep(paste0("f", 1:10), each = 3000),
  val = rnorm(30000)
)

我想创建一个data.frame,针对每个family中每个group里的每个id,计算其val的均值与同一family、同一group中所有其他id的val均值之比。

这就是我现在正在做的,但是我正在寻找一种更快的实现,可以通过dplyr来实现:

# Labels enumerating every id, group and family used in df.
ids <- paste0("i", 1:100)
groups <- paste0("g", 1:10)
families <- paste0("f", 1:10)

# Reference (slow) implementation: for every id x family x group cell, divide
# the mean val of that id by the mean val of all other ids in the same group
# and family. Rows are produced in id, then family, then group order.
res.df <- do.call(rbind, lapply(ids, function(i) {
  do.call(rbind, lapply(families, function(f) {
    do.call(rbind, lapply(groups, function(g) {
      own_vals <- dplyr::filter(df, id == i, group == g, family == f)$val
      other_vals <- dplyr::filter(df, id != i, group == g, family == f)$val
      data.frame(
        id = i,
        group = g,
        family = f,
        fc = mean(own_vals) / mean(other_vals)
      )
    }))
  }))
}))

有什么主意吗?

2 个答案:

答案 0 :(得分:1)

简短答案:

library(data.table)
# setDT() converts df to a data.table by reference (df itself is changed).
setDT(df)
# Mean of val per (id, group, family); the unnamed result column is V1.
dfM <- df[, mean(val), by = .(id, group, family)]
# Within each (group, family): every pairwise ratio of the id means, bound
# column-wise to the id pairs that expand.grid() yields for the same groups.
cbind(
  dfM[, outer(V1, V1, "/"), by = .(group, family)],
  dfM[, expand.grid(id, id), by = .(group, family)][, .(Var1, Var2)]
)

说明:

我将以不同的方式解决此任务(无需迭代)。首先,我们必须澄清您的问题:

  1. 为每种id、group、family组合计算val的平均值
  2. 对于每种group、family组合,将每个平均值除以其他各个平均值

要计算均值,我将使用data.table(之后按组进行的计算我也会使用data.table),这样均值只需计算一次,无需多次重复计算。

library(data.table)
# Convert df in place, then compute the mean of val once per
# (id, group, family) combination; the mean is stored in column V1.
setDT(df)
dfM <- df[, .(V1 = mean(val)), by = .(id, group, family)]
# Result
# head(dfM)
#    id group family          V1
# 1: i1    g1     f1 -0.12587944
# 2: i2    g1     f1 -0.20889324
# 3: i3    g1     f1 -0.02890183
# 4: i4    g1     f1  0.77509410
# 5: i5    g1     f1  0.11435116
# 6: i6    g1     f1 -0.59556654

要计算倍数变化(即,将矢量除以矢量),我们可以使用outer函数。在这里,我们在data.table dfM的每个group、family组合内,将向量V1除以向量V1

 # All pairwise ratios of the id means inside every (group, family)
 # combination, returned as one long V1 column.
 foo <- dfM[, outer(X = V1, Y = V1, FUN = "/"), by = .(group, family)]
 # nrow(foo)
 # 1000000
 #    group family         V1
 # 1:    g1     f1  1.0000000
 # 2:    g1     f1  1.6594708
 # 3:    g1     f1  0.2295993
 # 4:    g1     f1 -6.1574322
 # 5:    g1     f1 -0.9084181
 # 6:    g1     f1  4.7312457

outer没有提供有关id的信息,因此我们使用另一个base R函数expand.grid来生成对应的id组合:

# Every (id1, id2) pair per (group, family); only the two id columns are kept
# so they can be bound next to the ratios.
bar <- dfM[
  , expand.grid(id1 = id, id2 = id), by = .(group, family)
][, .(id1, id2)]

对于最终结果,请使用cbind

# Bind the ratios to their id pairs and show the first rows.
# Each V1 is mean(id1) / mean(id2) within the given group and family.
head(cbind(foo, bar))
#    group family         V1 id1 id2
# 1:    g1     f1  1.0000000  i1  i1
# 2:    g1     f1  1.6594708  i2  i1
# 3:    g1     f1  0.2295993  i3  i1
# 4:    g1     f1 -6.1574322  i4  i1
# 5:    g1     f1 -0.9084181  i5  i1
# 6:    g1     f1  4.7312457  i6  i1

对于给定的OP数据,此解决方案只需几秒钟。

数据:

# Same simulated data as in the question: 100 ids x 10 groups x 10 families,
# 3 samples per id/group/family cell, seeded for reproducibility.
set.seed(1)
df <- data.frame(
  id = rep(paste0("i", 1:100), times = 300),
  group = rep(rep(paste0("g", 1:10), each = 100), times = 30),
  family = rep(paste0("f", 1:10), each = 3000),
  val = rnorm(30000)
)

答案 1 :(得分:1)

我同意@PoGibas的观点,您的问题不够明确;这里我只假设您想以高效的方式重现最终数据帧res.df。就这一点而言,我认为@PoGibas的答案无法提供您想要的格式,而且有些人可能会觉得data.table的语法不如dplyr易用(我并不是要进行比较,两种软件包都有各自的优势)。这是一种可能的dplyr解决方案:

library(dplyr)
# assuming that df and res.df are already loaded as given in the question

# Per (id, group, family) cell: number of samples, sum and mean of val.
by_id_group_family <- df %>%
  group_by(id, group, family) %>%
  summarise(
    count = n(),
    total_val = sum(val),
    avg_val = mean(val)
  )

# The same statistics per (group, family), i.e. pooled over every id.
by_group_family <- df %>%
  group_by(group, family) %>%
  summarise(
    count = n(),
    total_val = sum(val),
    avg_val = mean(val)
  )

# Attach the pooled (group, family) count and total to each id-level row by
# key. An explicit join replaces the silent recycling of the short summary
# vector against the long one, which was only correct for one row ordering.
group_totals <- by_group_family %>%
  ungroup() %>%
  select(group, family, group_count = count, group_total = total_val)

with_totals <- by_id_group_family %>%
  ungroup() %>%
  left_join(group_totals, by = c("group", "family"))

# store mean vals for each id samples in each group in each family
mean_ids <- with_totals$avg_val

# mean val of all other ids in the same group and family: take this id's sum
# and sample count out of the pooled totals (no hard-coded sample count)
mean_other_ids <- (with_totals$group_total - with_totals$total_val) /
  (with_totals$group_count - with_totals$count)

# computing the ratio of means
ratio <- mean_ids / mean_other_ids

# combining the ratio with the grouped data
result <- with_totals %>%
  # keep only the identifying columns
  select(id, group, family) %>%
  # add a new column to store ratio
  mutate(fc = ratio)

# note that result has the same info as your res.df but family column is sorted differently
head(result)
# # A tibble: 6 x 4
#   id    group family      fc
#   <fct> <fct> <fct>    <dbl>
# 1 i1    g1    f1        9.48
# 2 i1    g1    f10      -4.86
# 3 i1    g1    f2      -50.4 
# 4 i1    g1    f3       17.2 
# 5 i1    g1    f4      131.  
# 6 i1    g1    f5        4.03

可以通过合并一些步骤并删除额外的计算来使代码更简洁,但我认为这种方式更易于遵循,额外的摘要统计信息有助于我理解数据的本质。