full_join добавляет лишние строки и NA в R - PullRequest
0 голосов
/ 05 августа 2020

У меня есть два фрейма данных, которые я пытаюсь объединить с помощью full_join. Вот часть моих данных:

# df1: plain data.frame with 56 rows (dput() output).
#   Team         - factor; most levels carry a trailing space (e.g. "Bucks "),
#                  and a few exist both with and without it ("76ers" / "76ers ").
#                  The levels also include "Bull " and "Net " next to
#                  "Bulls " and "Nets ".
#   Injury.Count - integer.
#   HomevsAway   - factor with levels "0" and "1".
df1 <- structure(list(Team = structure(c(4L, 3L, 5L, 6L, 7L, 7L, 8L, 
8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L, 14L, 13L, 15L, 15L, 
16L, 16L, 17L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 22L, 23L, 
23L, 24L, 24L, 25L, 25L, 28L, 28L, 29L, 29L, 30L, 30L, 31L, 31L, 
32L, 32L, 33L, 33L, 34L, 34L, 2L, 1L, 26L, 27L), .Label = c("76ers", 
"76ers ", "Bucks", "Bucks ", "Bull ", "Bulls ", "Cavaliers ", 
"Celtics ", "Clippers ", "Grizzlies ", "Hawks ", "Heat ", "Hornets", 
"Hornets ", "Jazz ", "Kings ", "Knicks ", "Lakers ", "Magic ", 
"Mavericks ", "Net ", "Nets ", "Nuggets ", "Pacers ", "Pelicans ", 
"Pistons", "Pistons ", "Raptors ", "Rockets ", "Spurs ", "Thunder ", 
"Timberwolves ", "Warriors ", "Wizards "), class = "factor"), 
    Injury.Count = c(3L, 3L, 1L, 1L, 1L, 2L, 0L, 2L, 1L, 1L, 
    0L, 2L, 1L, 0L, 5L, 4L, 3L, 2L, 3L, 0L, 3L, 3L, 4L, 6L, 5L, 
    0L, 2L, 2L, 1L, 2L, 0L, 1L, 3L, 4L, 2L, 6L, 2L, 1L, 1L, 1L, 
    3L, 3L, 4L, 5L, 1L, 6L, 4L, 2L, 0L, 2L, 2L, 1L, 5L, 6L, 1L, 
    1L), HomevsAway = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L), .Label = c("0", "1"), class = "factor")), row.names = c(NA, 
-56L), class = "data.frame")

# df2: grouped tibble (class "grouped_df") with 56 rows, grouped by Team
# (dput() output; the `groups` attribute lists 28 Team groups).
#   Team       - factor; levels have no trailing spaces (e.g. "Bucks").
#   HomevsAway - factor with levels "0" and "1".
#   t_1, t_3   - numeric.
# NOTE(review): one t_3 value is 4761, far larger than every other entry in
# the column - possibly a data-entry error; confirm against the data source.
df2 <- structure(list(Team = structure(c(1L, 1L, 2L, 2L, 3L, 4L, 4L, 
5L, 6L, 7L, 8L, 9L, 9L, 10L, 10L, 11L, 12L, 12L, 13L, 13L, 14L, 
15L, 15L, 16L, 16L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 21L, 
22L, 22L, 23L, 23L, 24L, 24L, 25L, 25L, 26L, 26L, 27L, 28L, 28L, 
3L, 5L, 6L, 7L, 8L, 11L, 14L, 17L, 27L), .Label = c("76ers", 
"Bucks", "Bulls", "Cavaliers", "Celtics", "Clippers", "Grizzlies", 
"Hawks", "Heat", "Hornets", "Jazz", "Kings", "Knicks", "Lakers", 
"Magic", "Mavericks", "Nets", "Nuggets", "Pacers", "Pelicans", 
"Pistons", "Raptors", "Rockets", "Spurs", "Thunder", "Timberwolves", 
"Warriors", "Wizards"), class = "factor"), HomevsAway = structure(c(1L, 
2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 
2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 
1L, 1L, 2L, 2L, 2L, 1L, 1L), .Label = c("0", "1"), class = "factor"), 
    t_1 = c(55.883, 140.1, 32.2, 37.967, 29.85, 24.317, 57.316, 
    17.967, 19.05, 36.95, 16.167, 95.317, 86.533, 21.334, 52.567, 
    40.75, 28.3, 68.15, 97.067, 102.233, 26.866, 71.033, 34.467, 
    24.233, 42.033, 22.433, 59.033, 41.516, 12.7, 107.996, 6.5, 
    32.783, 0, 23.217, 13.93, 0, 54.88, 23.617, 83.834, 106.794, 
    17.56, 27.76, 85.83, 0.017, 35.183, 22.467, 25.033, 0, 0, 
    0, 0, 0, 0, 0, 0, 0), t_3 = c(197.3164, 388.6827, 126.2663, 
    111.916, 61.95, 91.55, 167.067, 104.083, 71.067, 135.383, 
    45.633, 261.317, 267.399, 114.6997, 159.2, 152.034, 84.8337, 
    204.3003, 351.449, 376.317, 86.333, 213.9, 99.767, 65.1, 
    131.767, 73.317, 126.416, 129.066, 73.383, 347.0994, 4761, 
    113.367, 0, 89.933, 59.8, 0, 188.983, 124.384, 215.666, 289.9667, 
    92, 144.2497, 254.083, 32.0333, 122.1837, 102.533, 82.817, 
    0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -56L), groups = structure(list(
    Team = structure(1:28, .Label = c("76ers", "Bucks", "Bulls", 
    "Cavaliers", "Celtics", "Clippers", "Grizzlies", "Hawks", 
    "Heat", "Hornets", "Jazz", "Kings", "Knicks", "Lakers", "Magic", 
    "Mavericks", "Nets", "Nuggets", "Pacers", "Pelicans", "Pistons", 
    "Raptors", "Rockets", "Spurs", "Thunder", "Timberwolves", 
    "Warriors", "Wizards"), class = "factor"), .rows = structure(list(
        1:2, 3:4, c(5L, 48L), 6:7, c(8L, 49L), c(9L, 50L), c(10L, 
        51L), c(11L, 52L), 12:13, 14:15, c(16L, 53L), 17:18, 
        19:20, c(21L, 54L), 22:23, 24:25, c(26L, 55L), 27:28, 
        29:30, 31:32, 33:34, 35:36, 37:38, 39:40, 41:42, 43:44, 
        c(45L, 56L), 46:47), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, 28L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

Я попытался соединить данные с помощью full_join:

# Full join of the two tables on both key columns: rows without a partner in
# the other table are kept and their missing columns are filled with NA.
LR <- full_join(
  x = df1,
  y = df2,
  by = c("HomevsAway", "Team")
)

В полученном результате LR появляются лишние строки с NA в разных местах. Ожидаемый результат — таблица из 56 строк и 5 столбцов.

1 Ответ

1 голос
/ 05 августа 2020

Проблема в том, что в вашем первом data.frame названия команд содержат завершающий пробел. Это означает, что вместо строки "Bucks" у вас есть строка "Bucks ". Для full_join это две разные строки, поэтому такие ключи не совпадают при объединении.

Вот как вы можете исправить свои данные. Сначала удалите завершающие пробелы с помощью sub, предварительно преобразовав столбец Team в символьный вектор. После этого full_join работает, как и планировалось:

# Normalise the join key in both tables before joining: the Team factors are
# converted to plain character vectors so the keys are compared as strings.
df1_new <- df1 %>%
  as_tibble() %>%
  # Anchor the pattern with `$` so that only trailing spaces are stripped;
  # an unanchored " +" deletes the first run of spaces anywhere in the name
  # (e.g. it would turn "Trail Blazers " into "TrailBlazers ").
  mutate(Team = sub(" +$", "", as.character(Team)))
df2_new <- df2 %>%
  mutate(Team = as.character(Team))

# With matching keys the join lines up; rows that still contain NA come from
# misspelled team names in df1 ("Bull", "Net").
df1_new %>%
  full_join(df2_new, by = c("Team", "HomevsAway"))

# A tibble: 58 x 5
   Team      Injury.Count HomevsAway   t_1   t_3
   <chr>            <int> <fct>      <dbl> <dbl>
 1 Bucks                3 0           32.2 126. 
 2 Bucks                3 1           38.0 112. 
 3 Bull                 1 0           NA    NA  
 4 Bulls                1 1            0     0  
 5 Cavaliers            1 0           24.3  91.6
 6 Cavaliers            2 1           57.3 167. 
 7 Celtics              0 0            0     0  
 8 Celtics              2 1           18.0 104. 
 9 Clippers             1 0            0     0  
10 Clippers             1 1           19.0  71.1

Обратите внимание, что всё ещё остаётся несколько NA (и 58 строк вместо 56). Это связано с опечатками в названиях команд: Bull вместо Bulls и Net вместо Nets.

...