r - 为每个变量分别对多个列求和

时间:2016-08-14 23:17:50

标签: r

我有一个包含52列、约850,000行的数据框。前50列全部编码为是/否,最后2列是数值。我的目标是针对这50个变量中的每一个,分别对第51列和第52列求和。换句话说,先按第1列分组并对第51、52列求和,再按第2列分组并对第51、52列求和,依此类推。想知道最好的做法是什么。

3 个答案:

答案 0 :(得分:2)

以下是一个使用假数据的示例。在下面的数据中,val1和val2类似于您的第51和52列,而X1到X5就像您的50个分组列。为了获得val1和val2的总和,我们将数据融合(melt)为长格式,使X1到X5列被"堆叠"起来。然后,我们就可以轻松地对数据进行分组并生成总和。

library(dplyr)
library(reshape2)

# Fake data
set.seed(5)
dat = data.frame(replicate(5,sample(c("Yes","No"),20,replace=TRUE)), val1=rnorm(20), val2=rnorm(20))

    X1  X2  X3  X4  X5        val1        val2
1  Yes  No  No  No  No  1.46324856 -0.20409732
2   No  No Yes Yes  No  0.18772610 -0.22561419
3   No Yes  No  No Yes  1.02202286  0.34702845
...
18  No  No Yes  No  No -0.30170228 -0.47343201
19  No Yes Yes Yes Yes -1.27238344 -0.07577256
20  No Yes Yes Yes Yes -0.27966611 -0.52184006
# Separately sum val1 and val2 within each level of every grouping column
dat %>%
  # Convert to long format: the grouping columns are stacked into a
  # `cols` (source column name) / `group` (its value) pair, while val1 and
  # val2 are carried along unchanged as id columns.
  melt(id.vars = c("val1", "val2"), variable.name = "cols",
       value.name = "group") %>%
  # One output row per (cols, group) combination; every remaining column
  # (val1 and val2) is summed on its own.
  group_by(cols, group) %>%
  summarise_all(sum)
     cols group       val1       val2
1      X1    No -0.4959896  0.1546875
2      X1   Yes -3.0714078  1.7631670
3      X2    No -0.6323905  1.0422942
4      X2   Yes -2.9350069  0.8755603
5      X3    No  1.7915356  0.9180840
6      X3   Yes -5.3589330  0.9997705
7      X4    No  1.3502926 -1.4184550
8      X4   Yes -4.9176900  3.3363096
9      X5    No  0.7452743 -0.5833465
10     X5   Yes -4.3126717  2.5012010
# Sum of val1 + val2 by group
dat %>%
  # Convert to long format: the grouping columns are stacked into a
  # `cols` (source column name) / `group` (its value) pair, while val1 and
  # val2 are carried along unchanged as id columns.
  melt(id.vars = c("val1", "val2"), variable.name = "cols",
       value.name = "group") %>%
  # One output row per (cols, group) combination, holding the single
  # combined total of val1 + val2.
  group_by(cols, group) %>%
  summarise(sum = sum(val1 + val2))
     cols group        sum
1      X1    No -0.3413021
2      X1   Yes -1.3082407
3      X2    No  0.4099037
4      X2   Yes -2.0594465
5      X3    No  2.7096196
6      X3   Yes -4.3591625
7      X4    No -0.0681624
8      X4   Yes -1.5813804
9      X5    No  0.1619278
10     X5   Yes -1.8114707
{{1}}

答案 1 :(得分:1)

以下是使用apply和tapply的方法:

# Reproducible fake data: five 0/1 grouping columns plus two numeric columns.
set.seed(123)
d <- data.frame(
  replicate(5, sample(0:1, 100, replace = TRUE)),
  replicate(2, rnorm(100))
)
names(d) <- c(paste("col", 1:5), "x", "y")

# For each grouping column, sum x and y within each of its levels.
# apply() yields one result column per grouping column, so t() is used to
# put the grouping columns on the rows of `out`.
out <- t(apply(d[, 1:5], MARGIN = 2, function(z) {
  c(x = tapply(d$x, z, sum), y = tapply(d$y, z, sum))
}))
out
#            x.0       x.1       y.0         y.1
# col 1 2.319715 10.255528 -3.623171  -3.3820568
# col 2 4.385023  8.190221 -9.456567   2.4513395
# col 3 6.576423  5.998820  3.154456 -10.1596830
# col 4 8.063604  4.511640  3.879003 -10.8842309
# col 5 7.140356  5.434888 -6.413942  -0.5912855

答案 2 :(得分:1)

类似的data.table方法:

# Reproducible example: five yes/no grouping columns and two numeric columns.
set.seed(1)
df <- data.frame(
  replicate(5, sample(c("yes", "no"), 20, replace = TRUE)),
  col1 = rnorm(20),
  col2 = rnorm(20)
)

library(data.table)
# Turn df into a data.table by reference, then reshape it from wide to long;
# col1 and col2 stay as id columns, everything else is stacked.
setDT(df)
df1 <- melt(df, id.vars = c("col1", "col2"))
# Total col1 and col2 separately within every (variable, value) group
df2 <- df1[, .(col1 = sum(col1), col2 = sum(col2)), by = c("variable", "value")]
# Spread the totals back out: one row per value, one column per
# (summed column, original grouping column) pair.
dcast(df2, value ~ variable, value.var = c("col1", "col2"))

# value  col1_X1   col1_X2  col1_X3  col1_X4  col1_X5   col2_X1   col2_X2    col2_X3    col2_X4     col2_X5
#1: no  2.130194 -0.936481 4.425493 1.322399 2.942901  2.398278  3.385414 -2.1045187  0.5314497 -1.18833735
#2: yes 3.816474  6.883149 1.521175 4.624269 3.003767 -3.602036 -4.589172  0.9007601 -1.7352083 -0.01542122


# Pool the last 2 columns: within each (variable, value) group, flatten all
# non-grouping columns into one vector and total it. The unnamed result
# column is auto-named V1 by data.table.
df2 <- df1[, sum(unlist(.SD)), by = c("variable", "value")]
# Back to wide: one row per value, one column per original grouping column
dcast(df2, value ~ variable, value.var = "V1")

#   value        X1       X2       X3       X4       X5
#1:    no 4.5284717 2.448933 2.320974 1.853849 1.754564
#2:   yes 0.2144379 2.293977 2.421935 2.889061 2.988346

根据@Frank的建议,也可以直接在dcast中完成聚合:

# Result 1: separate sums of col1 and col2, aggregated directly by dcast

# Reshape wide -> long, keeping col1 and col2 as id columns
df1 <- melt(setDT(df), id.vars = c("col1", "col2"))
# dcast sums each value.var within every (value, variable) cell, so no
# intermediate grouped table is needed.
dcast(df1, value ~ variable, value.var = c("col1", "col2"),
      fun.aggregate = sum)

# Result 2: combined sum of col1 and col2, aggregated directly by dcast

# Reshape wide -> long, keeping col1 and col2 as id columns
df1 <- melt(setDT(df), id.vars = c("col1", "col2"))
# Melt a second time so col1 and col2 are stacked into a single column.
# Explicit output names are given because the defaults ("variable" and
# "value") would collide with the columns df1 already has.
df_long <- melt(df1, id.vars = c("variable", "value"),
                variable.name = "measure", value.name = "amount")
# Summing the stacked amounts per (value, variable) cell pools col1 and col2
dcast(df_long, value ~ variable, value.var = "amount", fun.aggregate = sum)