group_by()和百分比:summarise()删除了我也需要的列-R

时间:2019-02-25 13:21:45

标签: r dataframe group-by dplyr

我有这个df

> df <- data.frame(Adults = sample(0:5, 10, replace = TRUE),
+                  Children = sample(0:2, 10, replace = TRUE),
+                  Teens = sample(1:3, 10, replace = TRUE),
+                  stringsAsFactors = FALSE)
> df
   Adults Children Teens
1       5        0     1
2       5        1     2
3       5        2     3
4       5        2     2
5       0        1     2
6       5        1     3
7       0        2     3
8       4        2     1
9       4        0     1
10      1        2     1

我们可以看到,Children 没有 3、4、5 这几个值,而 Teens 没有 0、4、5 这几个值。但是,我们知道 Adults、Children 和 Teens 的取值都可能在 0 到 5 之间。

当我将 group_by() 与 summarise() 一起使用时,summarise() 会删除我没有用于分组的列。代码:

# Attach, to every row, how many rows share that row's value in each column.
# mutate() keeps all rows, so this only adds per-row counts; it does not
# collapse the data to one row per value.
df %>%
  group_by(Adults) %>%
  mutate(n_Adults = n()) %>%
  group_by(Teens) %>%
  mutate(n_Teens = n()) %>%
  group_by(Children) %>%
  mutate(n_Children = n())

当我按c(0,1,2,3,4,5)分组(以获取所有可能的值)时,出现此错误:

Error in mutate_impl(.data, dots) : Column `c(0, 1, 2, 3, 4, 5)` must be length 10 (the number of rows) or one, not 6 

我正在寻找以下输出:

Values n_Adults n_Children n_Teens p_Adults p_Children p_Teens
     0        2          2       0      0.2        0.2     0
     1        1          3       4      0.1        0.3     0.4
     2        0          5       3      0          0.5     0.3
     3        0          0       3      0          0       0.3
     4        2          0       0      0.2        0       0
     5        5          0       0      0.5        0       0

其中n_是相应列的计数,p_是相应列的百分比。

2 个答案:

答案 0 :(得分:2)

我们可以先用 gather 把数据转换为“长”格式,把 value 转换为 levels 指定为 0:5 的 factor 后用 count 求频数,再用 spread 转回“宽”格式,然后将每列除以该列的 sum 来创建“p”列,并在需要时修改列名(使用 rename_at)。

library(tidyverse)
# Build one row per possible value (0 to 5) with, for every column of `df`,
# the count (n_<column>) and the share of rows (p_<column>) taking that value.
df %>%
  # Long format: one row per (column name, value) pair. pivot_longer() is the
  # successor of the superseded gather().
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  # Fixing the levels to 0:5 and setting .drop = FALSE keeps a row for every
  # value in 0:5, even one that occurs in none of the columns.
  count(key, Values = factor(value, levels = 0:5), .drop = FALSE) %>%
  # Back to wide format, one count column per original column. pivot_wider()
  # is the successor of the superseded spread().
  pivot_wider(names_from = key, values_from = n, values_fill = 0L) %>%
  # Share of rows: each count divided by its column total.
  mutate(across(-Values, ~ .x / sum(.x), .names = "p_{.col}")) %>%
  # Prefix the count columns so the result reads n_Adults, ..., p_Adults, ...
  rename_with(~ paste0("n_", .x), .cols = all_of(names(df)))

数据

# Example data for this answer: 10 rows, three integer columns.
df <- data.frame(
  Adults = c(1L, 1L, 4L, 3L, 3L, 5L, 1L, 4L, 4L, 1L),
  Children = c(1L, 1L, 2L, 2L, 0L, 2L, 0L, 0L, 1L, 0L),
  Teens = c(1L, 2L, 3L, 1L, 1L, 3L, 1L, 2L, 2L, 1L)
)

答案 1 :(得分:0)

library(reprex)
library(tidyverse)
set.seed(20)

# Example data: 10 random draws (with replacement) per column. Children and
# Teens are drawn from narrower ranges than 0:5, so some values never occur.
df <- data.frame(
  Adults = sample(0:5, size = 10, replace = TRUE),
  Children = sample(0:2, size = 10, replace = TRUE),
  Teens = sample(1:3, size = 10, replace = TRUE),
  stringsAsFactors = FALSE
)
df
#>    Adults Children Teens
#> 1       5        2     2
#> 2       4        2     1
#> 3       1        0     2
#> 4       3        2     1
#> 5       5        0     1
#> 6       5        1     1
#> 7       0        0     3
#> 8       0        0     3
#> 9       1        0     1
#> 10      2        2     3

 # Count how often each value occurs in every column. `name` labels the count
# column directly, so no separate rename() step is needed.
df_adults <- count(df, Adults, name = "n_Adults")
df_children <- count(df, Children, name = "n_Children")
df_teens <- count(df, Teens, name = "n_Teens")

# Start from every possible value (0 to 5) so that a value missing from a
# column still gets a row, then attach the three count tables.
df_new <- data.frame(unique_id = 0:5) %>%
  left_join(df_adults, by = c("unique_id" = "Adults")) %>%
  left_join(df_children, by = c("unique_id" = "Children")) %>%
  left_join(df_teens, by = c("unique_id" = "Teens")) %>%
  # A value that never occurs in a column comes back from the join as NA;
  # that is a count of zero (0L keeps the count columns integer).
  replace_na(list(n_Adults = 0L, n_Children = 0L, n_Teens = 0L))

# Share of rows per value: each count divided by its column total.
df_new %>%
  mutate(
    p_Adults = n_Adults / sum(n_Adults),
    p_Children = n_Children / sum(n_Children),
    p_Teens = n_Teens / sum(n_Teens)
  )
#>   unique_id n_Adults n_Children n_Teens p_Adults p_Children p_Teens
#> 1         0        2          5       0      0.2        0.5     0.0
#> 2         1        2          1       5      0.2        0.1     0.5
#> 3         2        1          4       2      0.1        0.4     0.2
#> 4         3        1          0       3      0.1        0.0     0.3
#> 5         4        1          0       0      0.1        0.0     0.0
#> 6         5        3          0       0      0.3        0.0     0.0

由 reprex 包(v0.2.1)于 2019-02-25 创建