我从主要的每个子组有一个df的营业额。现在我想得到营业额最高的两个主要群体。
df <- data.frame(
grp = gl(5, 5, labels = c("A", "B", "C", "D", "E")),
sub_grp = gl(5, 1),
turnover = rnorm(25, mean = 100, sd = 15))
> df
grp sub_grp turnover
1 A 1 98.14430
2 A 2 107.90811
3 A 3 103.93973
4 A 4 95.78222
5 A 5 63.19635
6 B 1 97.85688
7 B 2 92.65572
8 B 3 86.02872
9 B 4 101.88177
10 B 5 120.66959
11 C 1 125.93533
12 C 2 98.49771
13 C 3 77.28770
14 C 4 101.44822
15 C 5 107.08171
16 D 1 77.73252
17 D 2 107.49374
18 D 3 87.46436
19 D 4 101.49984
20 D 5 99.13047
21 E 1 91.48636
22 E 2 115.63716
23 E 3 99.34567
24 E 4 104.65408
25 E 5 121.41820
我知道如何获得营业额最高的两个主要群体,但不知道如何保持我的小组和营业额仍然分组。
df %>%
group_by(grp) %>%
summarise(total.turnover = sum(turnover)) %>%
top_n(n = 2)
grp total.turnover
(fctr) (dbl)
1 C 510.2507
2 E 532.5415
我想从这个例子中得到结果。
grp sub_grp turnover
1 C 1 125.93533
2 C 2 98.49771
3 C 3 77.28770
4 C 4 101.44822
5 C 5 107.08171
6 E 1 91.48636
7 E 2 115.63716
8 E 3 99.34567
9 E 4 104.65408
10 E 5 121.41820
答案 0 :(得分:3)
以下是dplyr
的两种不同方法。
df %>%
group_by(grp) %>%
summarise(total.turnover = sum(turnover)) %>%
top_n(n = 2) %>%
inner_join(df, by = "grp") %>%
select(grp, sub_grp, turnover)
# # A tibble: 10 × 3
# grp sub_grp turnover
# <fctr> <fctr> <dbl>
# 1 A 1 91.59287
# 2 A 2 96.54734
# 3 A 3 123.38062
# 4 A 4 101.05763
# 5 A 5 101.93932
# 6 C 1 118.36123
# 7 C 2 105.39721
# 8 C 3 106.01157
# 9 C 4 101.66024
# 10 C 5 91.66238
dense_rank
)df %>%
group_by(grp) %>%
mutate(total.turnover = sum(turnover)) %>%
ungroup() %>%
filter(dense_rank(desc(total.turnover)) < 3) %>%
select(grp, sub_grp, turnover)
# # A tibble: 10 × 3
# grp sub_grp turnover
# <fctr> <fctr> <dbl>
# 1 A 1 91.59287
# 2 A 2 96.54734
# 3 A 3 123.38062
# 4 A 4 101.05763
# 5 A 5 101.93932
# 6 C 1 118.36123
# 7 C 2 105.39721
# 8 C 3 106.01157
# 9 C 4 101.66024
# 10 C 5 91.66238
data.table
(类似于dplyr
窗口函数方法)library(data.table)
dt <- data.table(df)
dt[,total.turnover := sum(turnover), by = .(grp)
][,rank := frank(-total.turnover, ties.method = "dense")
][rank < 3, .(grp, sub_grp, turnover)]
# grp sub_grp turnover
# 1: A 1 91.59287
# 2: A 2 96.54734
# 3: A 3 123.38062
# 4: A 4 101.05763
# 5: A 5 101.93932
# 6: C 1 118.36123
# 7: C 2 105.39721
# 8: C 3 106.01157
# 9: C 4 101.66024
# 10: C 5 91.66238
library(dplyr)
set.seed(123)
df <- data.frame(
grp = gl(5, 5, labels = c("A", "B", "C", "D", "E")),
sub_grp = gl(5, 1),
turnover = rnorm(25, mean = 100, sd = 15)
)
答案 1 :(得分:1)
一个选项是 dplyr ,我们在汇总的输出对象上使用filter
df %>%
filter(grp %in% df1$grp)
其中'df1'是汇总输出对象
或者如果我们想要在同一个链中
df %>%
group_by(grp) %>%
summarise(val = sum(turnover)) %>%
top_n(2) %>%
semi_join(df, .)
# grp sub_grp turnover
#1 C 1 125.93533
#2 C 2 98.49771
#3 C 3 77.28770
#4 C 4 101.44822
#5 C 5 107.08171
#6 E 1 91.48636
#7 E 2 115.63716
#8 E 3 99.34567
#9 E 4 104.65408
#10 E 5 121.41820
或另一个单行选项是 data.table
library(data.table)
setDT(df)[grp %in% df[, sum(turnover), grp][order(-V1), head(grp, 2)]]
# grp sub_grp turnover
# 1: C 1 125.93533
# 2: C 2 98.49771
# 3: C 3 77.28770
# 4: C 4 101.44822
# 5: C 5 107.08171
# 6: E 1 91.48636
# 7: E 2 115.63716
# 8: E 3 99.34567
# 9: E 4 104.65408
#10: E 5 121.41820
或者我们可以使用基础R
轻松完成此操作subset(df, grp %in% names(tail(sort(xtabs(turnover~grp , df)),2)))
# grp sub_grp turnover
#11 C 1 125.93533
#12 C 2 98.49771
#13 C 3 77.28770
#14 C 4 101.44822
#15 C 5 107.08171
#21 E 1 91.48636
#22 E 2 115.63716
#23 E 3 99.34567
#24 E 4 104.65408
#25 E 5 121.41820
df <- structure(list(grp = c("A", "A", "A", "A", "A", "B", "B", "B",
"B", "B", "C", "C", "C", "C", "C", "D", "D", "D", "D", "D", "E",
"E", "E", "E", "E"), sub_grp = c(1L, 2L, 3L, 4L, 5L, 1L, 2L,
3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
4L, 5L), turnover = c(98.1443, 107.90811, 103.93973, 95.78222,
63.19635, 97.85688, 92.65572, 86.02872, 101.88177, 120.66959,
125.93533, 98.49771, 77.2877, 101.44822, 107.08171, 77.73252,
107.49374, 87.46436, 101.49984, 99.13047, 91.48636, 115.63716,
99.34567, 104.65408, 121.4182)), .Names = c("grp", "sub_grp",
"turnover"),
class = "data.frame", row.names = c(NA, -25L),
index = structure(integer(0), "`__grp`" = integer(0)))