Как сохранить только самое высокое дублированное значение во фрейме данных? - PullRequest
2 голоса
/ 13 июля 2020

У меня такой код:

library(tidyverse)
astronauts %>% 
  group_by(name, nationality, total_hrs_sum) %>% 
  summarise() 

Результат:

name                                 nationality         total_hrs_sum
<chr>                                <chr>               <dbl>
Acaba, Joseph M.                     U.S.                7272.23        
Acton, Loren Wilbur                  U.S.                190.94     
Adamson, James C.                    U.S.                334.00     
Afanasyev, Viktor Mikhaylovich       U.S.S.R/Russia      13338.55       
Aidyn (Aydyn) Akanovich Aimbetov     Kazakhstan          236.23     
Akers, Thomas D.                     U.S.                814.00     
Akiyama, Toyohiro                    Japan               189.90     
Aksyonov, Vladimir                   U.S.S.R/Russia      284.18     
Al Mansoori, Hazzaa                  UAE                 189.00     
Al-saud, Sultan bin Salman           Saudi Arabia        170.00

Мой вопрос:

Я хотел бы дополнительно отфильтровать этот фрейм данных так, чтобы для каждой национальности оставалось только одно имя — то, у которого значение в столбце total_hrs_sum наибольшее. Мне удобнее всего решения на dplyr, но я открыт и для других вариантов.

Примеры данных:

structure(list(name = c("Acaba, Joseph M.", "Acton, Loren Wilbur", 
"Adamson, James C.", "Afanasyev, Viktor Mikhaylovich", "Aidyn (Aydyn) Akanovich Aimbetov", 
"Akers, Thomas D.", "Akiyama, Toyohiro", "Aksyonov, Vladimir", 
"Al Mansoori, Hazzaa", "Al-saud, Sultan bin Salman", "Aldrin, Edwin Eugene, Jr.", 
"Aleksandrov, Aleksandr", "Aleksandrov, Aleksandr", "Allen, Andrew M.", 
"Allen, Joseph P.", "Altman, Scott D.", "Anders, William Alison", 
"Anderson, Clayton C.", "Anderson, Michael P.", "André-Deshays, Claudie (Haigneré)", 
"Ansari, Anousheh", "Antonelli, Dominic A.", "Apt, Jerome", "Archambault, Lee J.", 
"Armstrong, Neil A.", "Arnaldo Tamayo Mendez", "Arnold, Richard R., II", 
"Artemyev, Oleg", "Artsebarsky, Anatoly", "Artyukhin, Yuri", 
"Ashby, Jeffrey S.", "Atkov, Oleg", "Aubakirov, Toktar", "Auñón-Chancellor, Serena", 
"Avdeyev, Sergei", "Bagian, James P.", "Baker, Ellen S.", "Baker, Michael A.", 
"Balandin, Aleksandr", "Barratt, Michael R.", "Barry, Daniel T.", 
"Bartoe, John-David Francis", "Baturin, Yuri", "Baudry, Patrick", 
"Bean, Alan Lavern", "Behnken, Robert L.", "Bella, Ivan", "Belyayev, Pavel", 
"Beregovoi, Georgi", "Berezovoy, Anatoly"), nationality = c("U.S.", 
"U.S.", "U.S.", "U.S.S.R/Russia", "Kazakhstan", "U.S.", "Japan", 
"U.S.S.R/Russia", "UAE", "Saudi Arabia", "U.S.", "Bulgaria", 
"U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", 
"France", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "Cuba", "U.S.", 
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", 
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", "U.S.S.R/Russia", 
"U.S.", "U.S.", "U.S.", "U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", 
"U.S.S.R/Russia", "France", "U.S.", "U.S.", "Slovakia", "U.S.S.R/Russia", 
"U.S.S.R/Russia", "U.S.S.R/Russia"), total_hrs_sum = c(7272.23, 
190.94, 334, 13338.55, 236.23, 814, 189.9, 284.18, 189, 170, 
289, 47, 7434.03, 904, 314, 1224, 147, 4046, 593, 614.37, 261.525, 
579, 847, 639.5, 206, 188.71, 307, 8784, 3471.35, 377.5, 664, 
5686.82, 190.2, 4722, 17942.23, 338, 686, 965, 4297.28, 5085, 
734, 190.94, 473.75, 169.63, 1671.75, 708, 190, 26.03, 94.83, 
5073.07)), row.names = c(NA, -50L), groups = structure(list(name = c("Acaba, Joseph M.", 
"Acton, Loren Wilbur", "Adamson, James C.", "Afanasyev, Viktor Mikhaylovich", 
"Aidyn (Aydyn) Akanovich Aimbetov", "Akers, Thomas D.", "Akiyama, Toyohiro", 
"Aksyonov, Vladimir", "Al Mansoori, Hazzaa", "Al-saud, Sultan bin Salman", 
"Aldrin, Edwin Eugene, Jr.", "Aleksandrov, Aleksandr", "Aleksandrov, Aleksandr", 
"Allen, Andrew M.", "Allen, Joseph P.", "Altman, Scott D.", "Anders, William Alison", 
"Anderson, Clayton C.", "Anderson, Michael P.", "André-Deshays, Claudie (Haigneré)", 
"Ansari, Anousheh", "Antonelli, Dominic A.", "Apt, Jerome", "Archambault, Lee J.", 
"Armstrong, Neil A.", "Arnaldo Tamayo Mendez", "Arnold, Richard R., II", 
"Artemyev, Oleg", "Artsebarsky, Anatoly", "Artyukhin, Yuri", 
"Ashby, Jeffrey S.", "Atkov, Oleg", "Aubakirov, Toktar", "Auñón-Chancellor, Serena", 
"Avdeyev, Sergei", "Bagian, James P.", "Baker, Ellen S.", "Baker, Michael A.", 
"Balandin, Aleksandr", "Barratt, Michael R.", "Barry, Daniel T.", 
"Bartoe, John-David Francis", "Baturin, Yuri", "Baudry, Patrick", 
"Bean, Alan Lavern", "Behnken, Robert L.", "Bella, Ivan", "Belyayev, Pavel", 
"Beregovoi, Georgi", "Berezovoy, Anatoly"), nationality = c("U.S.", 
"U.S.", "U.S.", "U.S.S.R/Russia", "Kazakhstan", "U.S.", "Japan", 
"U.S.S.R/Russia", "UAE", "Saudi Arabia", "U.S.", "Bulgaria", 
"U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", 
"France", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "Cuba", "U.S.", 
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", 
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", "U.S.S.R/Russia", 
"U.S.", "U.S.", "U.S.", "U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", 
"U.S.S.R/Russia", "France", "U.S.", "U.S.", "Slovakia", "U.S.S.R/Russia", 
"U.S.S.R/Russia", "U.S.S.R/Russia"), .rows = structure(list(1L, 
    2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
    15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 
    27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 
    39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, 50L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))
    

Ответы [ 3 ]

1 голос
/ 13 июля 2020

Вы можете фильтровать следующим образом:

astronauts %>%
  group_by(nationality) %>%
  filter(total_hrs_sum == max(total_hrs_sum))

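Обратите внимание: для фрейма данных, сгруппированного с помощью group_by, функция max вычисляется отдельно для каждой группы. Если в группе несколько строк с одинаковым максимальным значением, в результате останутся все такие строки.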

Это результат:

# A tibble: 10 x 3
# Groups:   nationality [10]
   name                              nationality    total_hrs_sum
   <chr>                             <chr>                  <dbl>
 1 Acaba, Joseph M.                  U.S.                   7272.
 2 Aidyn (Aydyn) Akanovich Aimbetov  Kazakhstan              236.
 3 Akiyama, Toyohiro                 Japan                   190.
 4 Al Mansoori, Hazzaa               UAE                     189 
 5 Al-saud, Sultan bin Salman        Saudi Arabia            170 
 6 Aleksandrov, Aleksandr            Bulgaria                 47 
 7 André-Deshays, Claudie (Haigneré) France                  614.
 8 Arnaldo Tamayo Mendez             Cuba                    189.
 9 Avdeyev, Sergei                   U.S.S.R/Russia        17942.
10 Bella, Ivan                       Slovakia                190 
1 голос
/ 13 июля 2020

Можно отсортировать данные с помощью arrange по столбцу 'nationality' и по убыванию 'total_hrs_sum', затем сгруппировать по 'nationality' и взять первую строку каждой группы с помощью slice

library(dplyr)
df %>%        
    arrange(nationality, desc(total_hrs_sum)) %>%
    group_by(nationality) %>%
    slice(1)

Или используйте top_n (начиная с dplyr 1.0.0 эта функция считается устаревшей, вместо неё рекомендуется slice_max)

df %>%
    group_by(nationality) %>%
    top_n(n=1, total_hrs_sum)
# A tibble: 10 x 3
# Groups:   nationality [10]
#   name                              nationality    total_hrs_sum
#   <chr>                             <chr>                  <dbl>
# 1 Acaba, Joseph M.                  U.S.                   7272.
# 2 Aidyn (Aydyn) Akanovich Aimbetov  Kazakhstan              236.
# 3 Akiyama, Toyohiro                 Japan                   190.
# 4 Al Mansoori, Hazzaa               UAE                     189 
# 5 Al-saud, Sultan bin Salman        Saudi Arabia            170 
# 6 Aleksandrov, Aleksandr            Bulgaria                 47 
# 7 André-Deshays, Claudie (Haigneré) France                  614.
# 8 Arnaldo Tamayo Mendez             Cuba                    189.
# 9 Avdeyev, Sergei                   U.S.S.R/Russia        17942.
#10 Bella, Ivan                       Slovakia                190 
1 голос
/ 13 июля 2020

Попробуйте следующее:

library(dplyr)

df %>% group_by(nationality) %>% arrange(desc(total_hrs_sum)) %>% filter(!duplicated(nationality))

# A tibble: 10 x 3
# Groups:   nationality [10]
   name                              nationality    total_hrs_sum
   <chr>                             <chr>                  <dbl>
 1 Avdeyev, Sergei                   U.S.S.R/Russia        17942.
 2 Acaba, Joseph M.                  U.S.                   7272.
 3 André-Deshays, Claudie (Haigneré) France                  614.
 4 Aidyn (Aydyn) Akanovich Aimbetov  Kazakhstan              236.
 5 Bella, Ivan                       Slovakia                190 
 6 Akiyama, Toyohiro                 Japan                   190.
 7 Al Mansoori, Hazzaa               UAE                     189 
 8 Arnaldo Tamayo Mendez             Cuba                    189.
 9 Al-saud, Sultan bin Salman        Saudi Arabia            170 
10 Aleksandrov, Aleksandr            Bulgaria                 47 
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...