我有以下代码:
library(tidyverse)
# Collapse to one row per distinct (name, nationality, total_hrs_sum)
# combination: summarise() with no arguments keeps only the grouping columns.
astronauts %>%
  group_by(name, nationality, total_hrs_sum) %>%
  summarise()
输出:
name nationality total_hrs_sum
<chr> <chr> <dbl>
Acaba, Joseph M. U.S. 7272.23
Acton, Loren Wilbur U.S. 190.94
Adamson, James C. U.S. 334.00
Afanasyev, Viktor Mikhaylovich U.S.S.R/Russia 13338.55
Aidyn (Aydyn) Akanovich Aimbetov Kazakhstan 236.23
Akers, Thomas D. U.S. 814.00
Akiyama, Toyohiro Japan 189.90
Aksyonov, Vladimir U.S.S.R/Russia 284.18
Al Mansoori, Hazzaa UAE 189.00
Al-saud, Sultan bin Salman Saudi Arabia 170.00
我的问题
我想进一步筛选这个数据框,使每个国籍只保留 1 个姓名,即该国籍中 total_hrs_sum 列取值最高的那一个。我更倾向于 dplyr 的解决方案,但也欢迎其他可行的方案。
样本数据:
structure(list(name = c("Acaba, Joseph M.", "Acton, Loren Wilbur",
"Adamson, James C.", "Afanasyev, Viktor Mikhaylovich", "Aidyn (Aydyn) Akanovich Aimbetov",
"Akers, Thomas D.", "Akiyama, Toyohiro", "Aksyonov, Vladimir",
"Al Mansoori, Hazzaa", "Al-saud, Sultan bin Salman", "Aldrin, Edwin Eugene, Jr.",
"Aleksandrov, Aleksandr", "Aleksandrov, Aleksandr", "Allen, Andrew M.",
"Allen, Joseph P.", "Altman, Scott D.", "Anders, William Alison",
"Anderson, Clayton C.", "Anderson, Michael P.", "André-Deshays, Claudie (Haigneré)",
"Ansari, Anousheh", "Antonelli, Dominic A.", "Apt, Jerome", "Archambault, Lee J.",
"Armstrong, Neil A.", "Arnaldo Tamayo Mendez", "Arnold, Richard R., II",
"Artemyev, Oleg", "Artsebarsky, Anatoly", "Artyukhin, Yuri",
"Ashby, Jeffrey S.", "Atkov, Oleg", "Aubakirov, Toktar", "Auñón-Chancellor, Serena",
"Avdeyev, Sergei", "Bagian, James P.", "Baker, Ellen S.", "Baker, Michael A.",
"Balandin, Aleksandr", "Barratt, Michael R.", "Barry, Daniel T.",
"Bartoe, John-David Francis", "Baturin, Yuri", "Baudry, Patrick",
"Bean, Alan Lavern", "Behnken, Robert L.", "Bella, Ivan", "Belyayev, Pavel",
"Beregovoi, Georgi", "Berezovoy, Anatoly"), nationality = c("U.S.",
"U.S.", "U.S.", "U.S.S.R/Russia", "Kazakhstan", "U.S.", "Japan",
"U.S.S.R/Russia", "UAE", "Saudi Arabia", "U.S.", "Bulgaria",
"U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.",
"France", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "Cuba", "U.S.",
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.",
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", "U.S.S.R/Russia",
"U.S.", "U.S.", "U.S.", "U.S.S.R/Russia", "U.S.", "U.S.", "U.S.",
"U.S.S.R/Russia", "France", "U.S.", "U.S.", "Slovakia", "U.S.S.R/Russia",
"U.S.S.R/Russia", "U.S.S.R/Russia"), total_hrs_sum = c(7272.23,
190.94, 334, 13338.55, 236.23, 814, 189.9, 284.18, 189, 170,
289, 47, 7434.03, 904, 314, 1224, 147, 4046, 593, 614.37, 261.525,
579, 847, 639.5, 206, 188.71, 307, 8784, 3471.35, 377.5, 664,
5686.82, 190.2, 4722, 17942.23, 338, 686, 965, 4297.28, 5085,
734, 190.94, 473.75, 169.63, 1671.75, 708, 190, 26.03, 94.83,
5073.07)), row.names = c(NA, -50L), groups = structure(list(name = c("Acaba, Joseph M.",
"Acton, Loren Wilbur", "Adamson, James C.", "Afanasyev, Viktor Mikhaylovich",
"Aidyn (Aydyn) Akanovich Aimbetov", "Akers, Thomas D.", "Akiyama, Toyohiro",
"Aksyonov, Vladimir", "Al Mansoori, Hazzaa", "Al-saud, Sultan bin Salman",
"Aldrin, Edwin Eugene, Jr.", "Aleksandrov, Aleksandr", "Aleksandrov, Aleksandr",
"Allen, Andrew M.", "Allen, Joseph P.", "Altman, Scott D.", "Anders, William Alison",
"Anderson, Clayton C.", "Anderson, Michael P.", "André-Deshays, Claudie (Haigneré)",
"Ansari, Anousheh", "Antonelli, Dominic A.", "Apt, Jerome", "Archambault, Lee J.",
"Armstrong, Neil A.", "Arnaldo Tamayo Mendez", "Arnold, Richard R., II",
"Artemyev, Oleg", "Artsebarsky, Anatoly", "Artyukhin, Yuri",
"Ashby, Jeffrey S.", "Atkov, Oleg", "Aubakirov, Toktar", "Auñón-Chancellor, Serena",
"Avdeyev, Sergei", "Bagian, James P.", "Baker, Ellen S.", "Baker, Michael A.",
"Balandin, Aleksandr", "Barratt, Michael R.", "Barry, Daniel T.",
"Bartoe, John-David Francis", "Baturin, Yuri", "Baudry, Patrick",
"Bean, Alan Lavern", "Behnken, Robert L.", "Bella, Ivan", "Belyayev, Pavel",
"Beregovoi, Georgi", "Berezovoy, Anatoly"), nationality = c("U.S.",
"U.S.", "U.S.", "U.S.S.R/Russia", "Kazakhstan", "U.S.", "Japan",
"U.S.S.R/Russia", "UAE", "Saudi Arabia", "U.S.", "Bulgaria",
"U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.",
"France", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "Cuba", "U.S.",
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.",
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", "U.S.S.R/Russia",
"U.S.", "U.S.", "U.S.", "U.S.S.R/Russia", "U.S.", "U.S.", "U.S.",
"U.S.S.R/Russia", "France", "U.S.", "U.S.", "Slovakia", "U.S.S.R/Russia",
"U.S.S.R/Russia", "U.S.S.R/Russia"), .rows = structure(list(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L,
39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 50L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
答案 0 :(得分:1)
尝试一下:
library(dplyr)
# Sort every row by hours (largest first), then keep only the first row
# encountered for each nationality. The result stays grouped by nationality
# and keeps the global descending-hours order.
df %>%
  arrange(desc(total_hrs_sum)) %>%
  group_by(nationality) %>%
  filter(row_number() == 1)
# A tibble: 10 x 3
# Groups: nationality [10]
name nationality total_hrs_sum
<chr> <chr> <dbl>
1 Avdeyev, Sergei U.S.S.R/Russia 17942.
2 Acaba, Joseph M. U.S. 7272.
3 André-Deshays, Claudie (Haigneré) France 614.
4 Aidyn (Aydyn) Akanovich Aimbetov Kazakhstan 236.
5 Bella, Ivan Slovakia 190
6 Akiyama, Toyohiro Japan 190.
7 Al Mansoori, Hazzaa UAE 189
8 Arnaldo Tamayo Mendez Cuba 189.
9 Al-saud, Sultan bin Salman Saudi Arabia 170
10 Aleksandrov, Aleksandr Bulgaria 47
答案 1 :(得分:1)
您可以这样过滤:
# Keep, within each nationality, the rows whose hours equal that group's
# maximum. max() is evaluated per group because the data is grouped first.
# Note: if several rows tie for the maximum, all of them are kept.
astronauts %>%
  group_by(nationality) %>%
  filter(total_hrs_sum == max(total_hrs_sum))
请注意,在经过 group_by 分组的数据框中,max 计算的是每个组内的最大值。
这是输出:
# A tibble: 10 x 3
# Groups: nationality [10]
name nationality total_hrs_sum
<chr> <chr> <dbl>
1 Acaba, Joseph M. U.S. 7272.
2 Aidyn (Aydyn) Akanovich Aimbetov Kazakhstan 236.
3 Akiyama, Toyohiro Japan 190.
4 Al Mansoori, Hazzaa UAE 189
5 Al-saud, Sultan bin Salman Saudi Arabia 170
6 Aleksandrov, Aleksandr Bulgaria 47
7 André-Deshays, Claudie (Haigneré) France 614.
8 Arnaldo Tamayo Mendez Cuba 189.
9 Avdeyev, Sergei U.S.S.R/Russia 17942.
10 Bella, Ivan Slovakia 190
答案 2 :(得分:1)
先用 arrange 按“nationality”以及“total_hrs_sum”的降序排序,再按“nationality”分组,之后就可以用 slice 取出每组的第一行
library(dplyr)
# Order rows by nationality and, within each nationality, by hours from
# largest to smallest; the first row of every nationality group is then the
# one with the most hours.
df %>%
  arrange(nationality, desc(total_hrs_sum)) %>%
  group_by(nationality) %>%
  slice(1L)
或使用top_n
# Per nationality, keep the row(s) with the largest total_hrs_sum.
# NOTE(review): newer dplyr releases document top_n() as superseded by
# slice_max() - confirm before reusing this in new code.
df %>%
  group_by(nationality) %>%
  top_n(n = 1, wt = total_hrs_sum)
# A tibble: 10 x 3
# Groups: nationality [10]
# name nationality total_hrs_sum
# <chr> <chr> <dbl>
# 1 Acaba, Joseph M. U.S. 7272.
# 2 Aidyn (Aydyn) Akanovich Aimbetov Kazakhstan 236.
# 3 Akiyama, Toyohiro Japan 190.
# 4 Al Mansoori, Hazzaa UAE 189
# 5 Al-saud, Sultan bin Salman Saudi Arabia 170
# 6 Aleksandrov, Aleksandr Bulgaria 47
# 7 André-Deshays, Claudie (Haigneré) France 614.
# 8 Arnaldo Tamayo Mendez Cuba 189.
# 9 Avdeyev, Sergei U.S.S.R/Russia 17942.
#10 Bella, Ivan Slovakia 190