如何在数据框中对重复的分组仅保留数值最高的那一行?

时间:2020-07-13 19:54:12

标签: r dataframe dplyr tidyverse

我有以下代码:

library(tidyverse)
# Grouping by all three columns and calling summarise() without any
# summary expressions yields one row per distinct
# (name, nationality, total_hrs_sum) combination.
astronauts %>%
  group_by(name, nationality, total_hrs_sum) %>%
  summarise()

输出:

name                                 nationality         total_hrs_sum
<chr>                                <chr>               <dbl>
Acaba, Joseph M.                     U.S.                7272.23        
Acton, Loren Wilbur                  U.S.                190.94     
Adamson, James C.                    U.S.                334.00     
Afanasyev, Viktor Mikhaylovich       U.S.S.R/Russia      13338.55       
Aidyn (Aydyn) Akanovich Aimbetov     Kazakhstan          236.23     
Akers, Thomas D.                     U.S.                814.00     
Akiyama, Toyohiro                    Japan               189.90     
Aksyonov, Vladimir                   U.S.S.R/Russia      284.18     
Al Mansoori, Hazzaa                  UAE                 189.00     
Al-saud, Sultan bin Salman           Saudi Arabia        170.00

我的问题

我想进一步过滤此数据框,使每个国籍只保留 1 个姓名,即该国籍中 total_hrs_sum 列的值最高的那一个。我最希望得到 dplyr 的解决方案,但也欢迎其他可行的方案。

样本数据:

structure(list(name = c("Acaba, Joseph M.", "Acton, Loren Wilbur", 
"Adamson, James C.", "Afanasyev, Viktor Mikhaylovich", "Aidyn (Aydyn) Akanovich Aimbetov", 
"Akers, Thomas D.", "Akiyama, Toyohiro", "Aksyonov, Vladimir", 
"Al Mansoori, Hazzaa", "Al-saud, Sultan bin Salman", "Aldrin, Edwin Eugene, Jr.", 
"Aleksandrov, Aleksandr", "Aleksandrov, Aleksandr", "Allen, Andrew M.", 
"Allen, Joseph P.", "Altman, Scott D.", "Anders, William Alison", 
"Anderson, Clayton C.", "Anderson, Michael P.", "André-Deshays, Claudie (Haigneré)", 
"Ansari, Anousheh", "Antonelli, Dominic A.", "Apt, Jerome", "Archambault, Lee J.", 
"Armstrong, Neil A.", "Arnaldo Tamayo Mendez", "Arnold, Richard R., II", 
"Artemyev, Oleg", "Artsebarsky, Anatoly", "Artyukhin, Yuri", 
"Ashby, Jeffrey S.", "Atkov, Oleg", "Aubakirov, Toktar", "Auñón-Chancellor, Serena", 
"Avdeyev, Sergei", "Bagian, James P.", "Baker, Ellen S.", "Baker, Michael A.", 
"Balandin, Aleksandr", "Barratt, Michael R.", "Barry, Daniel T.", 
"Bartoe, John-David Francis", "Baturin, Yuri", "Baudry, Patrick", 
"Bean, Alan Lavern", "Behnken, Robert L.", "Bella, Ivan", "Belyayev, Pavel", 
"Beregovoi, Georgi", "Berezovoy, Anatoly"), nationality = c("U.S.", 
"U.S.", "U.S.", "U.S.S.R/Russia", "Kazakhstan", "U.S.", "Japan", 
"U.S.S.R/Russia", "UAE", "Saudi Arabia", "U.S.", "Bulgaria", 
"U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", 
"France", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "Cuba", "U.S.", 
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", 
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", "U.S.S.R/Russia", 
"U.S.", "U.S.", "U.S.", "U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", 
"U.S.S.R/Russia", "France", "U.S.", "U.S.", "Slovakia", "U.S.S.R/Russia", 
"U.S.S.R/Russia", "U.S.S.R/Russia"), total_hrs_sum = c(7272.23, 
190.94, 334, 13338.55, 236.23, 814, 189.9, 284.18, 189, 170, 
289, 47, 7434.03, 904, 314, 1224, 147, 4046, 593, 614.37, 261.525, 
579, 847, 639.5, 206, 188.71, 307, 8784, 3471.35, 377.5, 664, 
5686.82, 190.2, 4722, 17942.23, 338, 686, 965, 4297.28, 5085, 
734, 190.94, 473.75, 169.63, 1671.75, 708, 190, 26.03, 94.83, 
5073.07)), row.names = c(NA, -50L), groups = structure(list(name = c("Acaba, Joseph M.", 
"Acton, Loren Wilbur", "Adamson, James C.", "Afanasyev, Viktor Mikhaylovich", 
"Aidyn (Aydyn) Akanovich Aimbetov", "Akers, Thomas D.", "Akiyama, Toyohiro", 
"Aksyonov, Vladimir", "Al Mansoori, Hazzaa", "Al-saud, Sultan bin Salman", 
"Aldrin, Edwin Eugene, Jr.", "Aleksandrov, Aleksandr", "Aleksandrov, Aleksandr", 
"Allen, Andrew M.", "Allen, Joseph P.", "Altman, Scott D.", "Anders, William Alison", 
"Anderson, Clayton C.", "Anderson, Michael P.", "André-Deshays, Claudie (Haigneré)", 
"Ansari, Anousheh", "Antonelli, Dominic A.", "Apt, Jerome", "Archambault, Lee J.", 
"Armstrong, Neil A.", "Arnaldo Tamayo Mendez", "Arnold, Richard R., II", 
"Artemyev, Oleg", "Artsebarsky, Anatoly", "Artyukhin, Yuri", 
"Ashby, Jeffrey S.", "Atkov, Oleg", "Aubakirov, Toktar", "Auñón-Chancellor, Serena", 
"Avdeyev, Sergei", "Bagian, James P.", "Baker, Ellen S.", "Baker, Michael A.", 
"Balandin, Aleksandr", "Barratt, Michael R.", "Barry, Daniel T.", 
"Bartoe, John-David Francis", "Baturin, Yuri", "Baudry, Patrick", 
"Bean, Alan Lavern", "Behnken, Robert L.", "Bella, Ivan", "Belyayev, Pavel", 
"Beregovoi, Georgi", "Berezovoy, Anatoly"), nationality = c("U.S.", 
"U.S.", "U.S.", "U.S.S.R/Russia", "Kazakhstan", "U.S.", "Japan", 
"U.S.S.R/Russia", "UAE", "Saudi Arabia", "U.S.", "Bulgaria", 
"U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", 
"France", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "Cuba", "U.S.", 
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", 
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", "U.S.S.R/Russia", 
"U.S.", "U.S.", "U.S.", "U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", 
"U.S.S.R/Russia", "France", "U.S.", "U.S.", "Slovakia", "U.S.S.R/Russia", 
"U.S.S.R/Russia", "U.S.S.R/Russia"), .rows = structure(list(1L, 
    2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
    15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 
    27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 
    39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, 50L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))
    

3 个答案:

答案 0 :(得分:1)

尝试一下:

library(dplyr)

# Sort the rows by total_hrs_sum, largest first, then keep only the first
# row encountered for each nationality (the one with the most hours).
df %>%
  group_by(nationality) %>%
  arrange(desc(total_hrs_sum)) %>%
  filter(!duplicated(nationality))

# A tibble: 10 x 3
# Groups:   nationality [10]
   name                              nationality    total_hrs_sum
   <chr>                             <chr>                  <dbl>
 1 Avdeyev, Sergei                   U.S.S.R/Russia        17942.
 2 Acaba, Joseph M.                  U.S.                   7272.
 3 André-Deshays, Claudie (Haigneré) France                  614.
 4 Aidyn (Aydyn) Akanovich Aimbetov  Kazakhstan              236.
 5 Bella, Ivan                       Slovakia                190 
 6 Akiyama, Toyohiro                 Japan                   190.
 7 Al Mansoori, Hazzaa               UAE                     189 
 8 Arnaldo Tamayo Mendez             Cuba                    189.
 9 Al-saud, Sultan bin Salman        Saudi Arabia            170 
10 Aleksandrov, Aleksandr            Bulgaria                 47 

答案 1 :(得分:1)

您可以这样过滤:

# Keep, for every nationality, the row(s) whose total_hrs_sum equals that
# nationality's maximum. Because the data is grouped, max() is evaluated
# once per group rather than over the whole column.
# NOTE: if several rows of one nationality tie for the maximum, all of
# them are kept.
astronauts %>%
  group_by(nationality) %>%
  filter(total_hrs_sum == max(total_hrs_sum))

请注意,在经过 group_by 分组的数据框中,max 计算的是每个组内的最大值。

这是输出:

# A tibble: 10 x 3
# Groups:   nationality [10]
   name                              nationality    total_hrs_sum
   <chr>                             <chr>                  <dbl>
 1 Acaba, Joseph M.                  U.S.                   7272.
 2 Aidyn (Aydyn) Akanovich Aimbetov  Kazakhstan              236.
 3 Akiyama, Toyohiro                 Japan                   190.
 4 Al Mansoori, Hazzaa               UAE                     189 
 5 Al-saud, Sultan bin Salman        Saudi Arabia            170 
 6 Aleksandrov, Aleksandr            Bulgaria                 47 
 7 André-Deshays, Claudie (Haigneré) France                  614.
 8 Arnaldo Tamayo Mendez             Cuba                    189.
 9 Avdeyev, Sergei                   U.S.S.R/Russia        17942.
10 Bella, Ivan                       Slovakia                190 

答案 2 :(得分:1)

先用 arrange 按 nationality 以及 total_hrs_sum 的降序排列,再按 nationality 分组,然后用 slice 取出每组的第一行

library(dplyr)
# Order the rows so that, within each nationality, the largest
# total_hrs_sum comes first; then take the first row of every
# nationality group.
df %>%
  arrange(nationality, desc(total_hrs_sum)) %>%
  group_by(nationality) %>%
  slice(1L)

或使用top_n

# Keep exactly one row per nationality: the one with the largest
# total_hrs_sum. slice_max() replaces the superseded top_n();
# with_ties = FALSE guarantees a single row even when several rows of the
# same nationality share the maximum. Rows come back group by group, so
# their order may differ from the printout below.
df %>%
  group_by(nationality) %>%
  slice_max(total_hrs_sum, n = 1, with_ties = FALSE)
# A tibble: 10 x 3
# Groups:   nationality [10]
#   name                              nationality    total_hrs_sum
#   <chr>                             <chr>                  <dbl>
# 1 Acaba, Joseph M.                  U.S.                   7272.
# 2 Aidyn (Aydyn) Akanovich Aimbetov  Kazakhstan              236.
# 3 Akiyama, Toyohiro                 Japan                   190.
# 4 Al Mansoori, Hazzaa               UAE                     189 
# 5 Al-saud, Sultan bin Salman        Saudi Arabia            170 
# 6 Aleksandrov, Aleksandr            Bulgaria                 47 
# 7 André-Deshays, Claudie (Haigneré) France                  614.
# 8 Arnaldo Tamayo Mendez             Cuba                    189.
# 9 Avdeyev, Sergei                   U.S.S.R/Russia        17942.
#10 Bella, Ivan                       Slovakia                190