Объединение фреймов данных различной длины, но с общим именем столбца - PullRequest
0 голосов
/ 30 ноября 2018

Пример данных

df2 = structure(list(Country = structure(c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L,
34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 25L, 42L, 43L, 44L, 45L,
46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L,
59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L,
72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L,
85L, 86L, 87L, 88L, 89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L,
98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 106L, 107L, 108L,
109L, 110L, 111L, 112L, 113L, 114L, 115L, 116L, 117L, 118L, 119L,
120L, 121L, 122L, 123L, 124L, 125L, 126L, 127L, 128L, 129L, 130L,
131L, 132L, 133L, 134L, 135L, 136L, 137L, 138L, 139L, 140L, 141L,
142L, 143L, 144L, 145L, 146L, 147L, 148L, 149L, 150L, 151L, 152L,
153L, 154L, 155L, 156L, 157L, 158L, 159L, 160L, 161L, 162L, 163L,
164L, 165L, 166L), .Label = c(" Afghanistan", " Albania", " Algeria",
" Angola", " Argentina", " Armenia", " Australia", " Austria",
" Azerbaijan", " Bahamas", " Bangladesh", " Barbados", " Belarus",
" Belgium", " Belize", " Benin", " Bhutan", " Bolivia (Plurinational State of)",
" Bosnia and Herzegovina", " Botswana", " Brazil", " Bulgaria",
" Burkina Faso", " Burundi", " C\xaate d'Ivoire", " Cabo Verde",
" Cambodia", " Cameroon", " Canada", " Central African Republic",
" Chad", " Chile", " China", " Colombia", " Comoros", " Congo",
" Congo (Democratic Republic of the)", " Costa Rica", " Croatia",
" Cyprus", " Czechia", " Denmark", " Djibouti", " Dominican Republic",
" Ecuador", " Egypt", " El Salvador", " Estonia", " Eswatini (Kingdom of)",
" Ethiopia", " Fiji", " Finland", " France", " Gabon", " Gambia",
" Georgia", " Germany", " Ghana", " Greece", " Guatemala", " Guinea",
" Guinea-Bissau", " Guyana", " Haiti", " Honduras", " Hong Kong, China (SAR)",
" Hungary", " Iceland", " India", " Indonesia", " Iran (Islamic Republic of)",
" Iraq", " Ireland", " Israel", " Italy", " Jamaica", " Japan",
" Jordan", " Kazakhstan", " Kenya", " Kiribati", " Korea (Republic of)",
" Kyrgyzstan", " Lao People's Democratic Republic", " Latvia",
" Lebanon", " Lesotho", " Liberia", " Lithuania", " Luxembourg",
" Madagascar", " Malawi", " Malaysia", " Maldives", " Mali",
" Malta", " Mauritania", " Mauritius", " Mexico", " Micronesia (Federated States of)",
" Moldova (Republic of)", " Mongolia", " Montenegro", " Morocco",
" Mozambique", " Myanmar", " Namibia", " Nepal", " Netherlands",
" New Zealand", " Nicaragua", " Niger", " Nigeria", " Norway",
" Pakistan", " Palau", " Palestine, State of", " Panama", " Paraguay",
" Peru", " Philippines", " Poland", " Portugal", " Romania",
" Russian Federation", " Rwanda", " Saint Lucia", " Sao Tome and Principe",
" Senegal", " Serbia", " Sierra Leone", " Singapore", " Slovakia",
" Slovenia", " Solomon Islands", " South Africa", " South Sudan",
" Spain", " Sri Lanka", " Sudan", " Suriname", " Sweden", " Switzerland",
" Syrian Arab Republic", " Tajikistan", " Tanzania (United Republic of)",
" Thailand", " The former Yugoslav Republic of Macedonia", " Timor-Leste",
" Togo", " Trinidad and Tobago", " Tunisia", " Turkey", " Turkmenistan",
" Uganda", " Ukraine", " United Kingdom", " United States", " Uruguay",
" Uzbekistan", " Vanuatu", " Venezuela (Bolivarian Republic of)",
" Viet Nam", " Yemen", " Zambia", " Zimbabwe"), class = "factor"),
    ineq.HDI = c(0.463, 0.741, 0.729, 0.52, 0.813, 0.728, 0.923,
    0.895, 0.74, 0.789, 0.545, 0.782, 0.792, 0.903, 0.699, 0.473,
    0.566, 0.649, 0.713, 0.66, 0.727, 0.779, 0.375, 0.395, 0.629,
    0.537, 0.506, 0.902, 0.351, 0.371, 0.808, 0.706, 0.719, 0.482,
    0.557, 0.407, 0.754, 0.808, 0.85, 0.862, 0.442, 0.91, 0.449,
    0.703, 0.715, 0.665, 0.671, 0.845, 0.538, 0.412, 0.711, 0.903,
    0.882, 0.665, 0.441, 0.735, 0.921, 0.554, 0.856, 0.611, 0.404,
    0.426, 0.63, 0.47, 0.596, 0.901, 0.823, 0.891, 0.581, 0.661,
    0.755, 0.649, 0.909, 0.887, 0.87, 0.712, 0.885, 0.728, 0.765,
    0.543, 0.59, 0.884, 0.636, 0.546, 0.816, 0.758, 0.493, 0.407,
    0.824, 0.889, 0.504, 0.441, 0.772, 0.671, 0.403, 0.843, 0.487,
    0.749, 0.743, 0.608, 0.67, 0.697, 0.793, 0.616, 0.403, 0.53,
    0.594, 0.529, 0.91, 0.899, 0.621, 0.318, 0.484, 0.942, 0.526,
    0.769, 0.672, 0.758, 0.675, 0.717, 0.665, 0.835, 0.822, 0.797,
    0.78, 0.485, 0.731, 0.542, 0.456, 0.759, 0.385, 0.909, 0.829,
    0.882, 0.507, 0.649, 0.413, 0.865, 0.745, 0.47, 0.703, 0.905,
    0.932, 0.644, 0.634, 0.493, 0.724, 0.735, 0.619, 0.456, 0.775,
    0.716, 0.734, 0.673, 0.486, 0.733, 0.905, 0.914, 0.773, 0.666,
    0.591, 0.759, 0.654, 0.498, 0.544, 0.467), ineq.income. = c(NA,
    14.4, NA, 36.4, 34.4, 10.8, 16.6, 15.1, 4.4, 24.5, 14.8,
    16.1, 11.1, 15.1, 48.5, 19.2, NA, 54.2, 31.1, NA, 37.6, 16.1,
    25.3, 24.9, NA, 21.4, 19.9, 17.1, 28.1, 20.8, 34.1, 29.5,
    43.6, 54, 22, 22.1, 33.7, 27.8, 13.8, 12.2, 20.5, 11.3, 21.3,
    32.6, 23.4, 15.9, 32.7, 17.7, 32.3, 20.8, NA, 13.4, 13.9,
    22.1, 33.3, 25.9, 16.7, 25.4, 19.9, 42.5, 26.8, 32.5, 20.3,
    47.9, 39.7, NA, 14.7, 13.4, 14.7, 14.8, NA, NA, 18.8, 23.7,
    19.4, 24.1, NA, 18.7, 17.6, 28.8, NA, 18.4, 12.2, 15.5, 20.5,
    NA, 39.5, 19, 20.6, 15.7, 19.3, 19.7, 28.7, 19.5, 25.4, NA,
    21.5, NA, 31.6, 63.1, 19.4, 16.4, 12.6, 20.7, 58.1, NA, 68.3,
    26.4, 15.3, NA, 23.8, 21.1, 25.1, 13.1, 10.6, NA, NA, 52.6,
    33.4, 42.7, 28, 18.4, 23.9, 12.2, 11.9, 31.5, NA, NA, 21.1,
    12.2, 22.2, NA, 11.7, 12.2, NA, 40.9, NA, 18.5, 20.8, NA,
    34.9, 13, 14.3, 18.3, 15.3, 17.6, 34, 21.8, 19.2, 20, 21.9,
    21.8, 26.5, 38.7, 26.4, 10.4, 21, 23.5, 26.3, 17.9, NA, 32,
    18.2, 17.6, 20.8, 34.5), ineq.edu = c(39.3, 12.7, NA, 26.2,
    12.1, 6.5, 1.7, 2.4, 12, 7.9, 44.8, NA, 8, 5.2, 19.8, 44.1,
    NA, 28.7, 19.4, NA, 25.7, 8.1, 37.3, 36.3, 30.7, 31.1, 35.3,
    3.2, 45.9, 37.8, 13.3, 23.2, 23.9, 47.4, 30, 29.1, 17.7,
    NA, 15.7, 1.3, 44.8, 3, 47, 22.2, 21.8, 43.6, 29.2, 3.1,
    38.3, 38.2, 11, 4.7, 9.1, 7.3, 44.7, 4.9, 2.3, 7.5, 5.8,
    36.1, 42.6, 40.3, 9.6, 40.7, 31, NA, 4.1, 2.6, 40.6, 21.4,
    NA, NA, 3.2, 7.9, 11.8, 8.3, NA, 25.1, 5.3, 29.2, NA, NA,
    11.1, 30.5, 3.3, NA, 24.9, 46.4, 4.3, 6.2, 30.8, 34.7, NA,
    11.5, 36.9, NA, 43.2, NA, 17.9, 22.4, 7.5, 5.8, 9.6, 42.7,
    28.2, NA, 27.8, 43.3, 4, NA, 33.3, 31.3, 46, 2.4, 46.4, NA,
    NA, NA, 19.8, 30.2, 12.9, 7.1, NA, 13.1, 11.2, 30.7, NA,
    NA, 47.3, 11.1, 48.2, NA, 1.7, 4, NA, 20.8, NA, 5.7, 17.9,
    NA, 20.1, 3.6, 2, 31.5, 9.4, 28.7, 18, 17.5, 44.3, 5.1, 6.6,
    38.7, 27.4, NA, 28.2, 2.8, 2.1, 3.2, 10.8, 1.4, NA, 17, 17.1,
    49.8, 24.2, 20.1), ineq.adj.income = c(NA, 0.593, NA, 0.384,
    0.515, 0.573, 0.755, 0.78, 0.726, 0.616, 0.421, 0.638, 0.681,
    0.776, 0.331, 0.353, NA, 0.273, 0.478, NA, 0.467, 0.635,
    0.297, 0.23, NA, 0.378, 0.407, 0.75, 0.237, 0.35, 0.517,
    0.485, 0.396, 0.184, 0.433, 0.217, 0.484, 0.576, 0.755, 0.739,
    0.391, 0.818, 0.388, 0.476, 0.523, 0.581, 0.434, 0.668, 0.433,
    0.284, NA, 0.785, 0.771, 0.58, 0.279, 0.468, 0.758, 0.384,
    0.682, 0.363, 0.297, 0.269, 0.491, 0.213, 0.331, NA, 0.69,
    0.757, 0.486, 0.567, NA, NA, 0.726, 0.654, 0.717, 0.498,
    NA, 0.558, 0.646, 0.345, NA, 0.705, 0.432, 0.47, 0.627, NA,
    0.31, 0.223, 0.639, 0.819, 0.319, 0.281, 0.573, 0.569, 0.326,
    NA, 0.414, NA, 0.523, 0.197, 0.456, 0.538, 0.652, 0.497,
    0.138, NA, 0.211, 0.333, 0.784, NA, 0.425, 0.249, 0.439,
    0.846, 0.512, NA, NA, 0.355, 0.424, 0.392, 0.458, 0.659,
    0.641, 0.683, 0.72, 0.27, NA, NA, 0.366, 0.639, 0.293, NA,
    0.734, 0.749, NA, 0.425, NA, 0.71, 0.531, NA, 0.485, 0.801,
    0.826, 0.507, 0.421, 0.377, 0.485, 0.557, 0.535, 0.284, 0.672,
    0.544, 0.575, 0.418, 0.3, 0.588, 0.705, 0.719, 0.568, 0.468,
    NA, 0.524, 0.463, 0.466, 0.409, 0.251)), class = "data.frame", row.names = c(NA, -166L))

df1 = structure(list(Country = structure(1:97, .Label = c("Afghanistan",
"Albania", "Armenia", "Australia", "Austria", "Bahamas", "Barbados",
"Belarus", "Belgium", "Belize", "Bermuda", "Bolivia (Plurinational State of)",
"Bosnia and Herzegovina", "Brazil", "Brunei Darussalam", "Bulgaria",
"Cambodia", "Canada", "Chile", "Colombia", "Costa Rica", "Croatia",
"Cyprus", "Czechia", "Denmark", "Dominican Republic", "Ecuador",
"Egypt", "El Salvador", "Estonia", "Finland", "France", "Georgia",
"Germany", "Greece", "Grenada", "Guatemala", "Honduras", "Hong Kong SAR",
"Hungary", "Iceland", "Indonesia", "Iran (Islamic Republic of)",
"Ireland", "Israel", "Italy", "Jamaica", "Japan", "Kuwait", "Lao People's Democratic Republic",
"Latvia", "Lithuania", "Luxembourg", "Malaysia", "Maldives",
"Malta", "Mexico", "Moldova (Republic of)", "Montenegro", "Myanmar",
"Netherlands", "New Zealand", "Nicaragua", "Nigeria", "Norway",
"Pakistan", "Panama", "Paraguay", "Peru", "Philippines", "Poland",
"Portugal", "Puerto Rico", "Romania", "Russian Federation", "Saudi Arabia",
"Serbia", "Slovakia", "Slovenia", "South Africa", "South Korea",
"Spain", "Suriname", "Sweden", "Switzerland", "Taiwan", "Thailand",
"The former Yugoslav Republic of Macedonia", "Trinidad and Tobago",
"Turkey", "Ukraine", "United Kingdom (Scotland)", "United States of America",
"Uruguay", "Venezuela (Bolivarian Republic of)", "Viet Nam",
"Zambia"), class = "factor"), Amph.Average = c(0.5, 0.1, 0.04,
2.1, 0.5, 0.43, 0.2, 0.35, 0.9, 1.35, 0.1, 0.5, 0.5, 0.3, 0.25,
0.65, 0.6, 0.5, 0.27, 0.06, 0.05, 0.7, 0.3, 0.9, 0.7, 0.95, 0.2,
0.48, 0.38, 1, 0.8, 0.2, 0.56, 0.7, 0.05, 0.7, 0.91, 0.84, 0.35,
0.5, 0.93, 0.18, 0.7, 0.4, 1.01, 0.32, 0.81, 0.3, 0.27, 1.39,
0.3, 0.2, 0.73, 0.55, 0.1, 0.93, 0.12, 0.03, 0.72, 0.22, 0.4,
1.6, 0.76, 1.4, 0.7, 0.08, 1.2, 0.51, 0.2, 1.1, 1.9, 0.05, 0.43,
0.03, 0.39, 0.4, 0.1, 0.12, 0.72, 1.02, 0.12, 0.6, 0.5, 0.8,
0.63, 0.6, 1.4, 0.33, 0.7, 0.1, 0.72, 1.2, 1.8, 0.01, 0.47, 0.22,
0.1)), class = "data.frame", row.names = c(NA, -97L))

попытка объединения = Объединенный фрейм

Здравствуйте!Я пытаюсь объединить два фрейма данных, которые имеют разную длину, но с соответствующим столбцом, где один может рассматриваться как подмножество большего.Мой фрейм данных о неравенстве состоит из 166 стран (строк), а другой фрейм данных состоит из 97 стран (строк).Я пытаюсь объединить эти два вместе в один фрейм данных, оставив NA в 69 странах, которые не совпадают.

Я пробовал testmerge = merge.data.frame(df2, df1, by.x = "Country", by.y = "Country", all.x = "TRUE") и многие варианты этого, однако этовыводит все данные из большего фрейма данных, создавая все строки Amph.Average NA.

Как мне объединить эти два фрейма данных, где у меня есть все столбцы больших данныхфрейм и столбец Amph.Average меньшего фрейма данных в одном фрейме данных, оставляя NA там, где нет совпадения?

Я также пробовал dplyr::left_join(df2, df1) и plyr::join(df2, df1, type="full), которые дают в основном то же самоевыход.

1 Ответ

0 голосов
/ 30 ноября 2018

Проблема связана с дополнительным начальным пробелом в df2$Country (которого нет в df1$Country).

Мы можем использовать trimws для обрезки пробелов перед объединением:

merge(df1, transform(df2, Country = trimws(Country)), by = "Country", all.y = T)
#                                      Country Amph.Average ineq.HDI
#1                                 Afghanistan         0.50    0.463
#2                                     Albania         0.10    0.741
#3                                     Armenia         0.04    0.728
#4                                   Australia         2.10    0.923
#5                                     Austria         0.50    0.895
#6                                     Bahamas         0.43    0.789
#7                                    Barbados         0.20    0.782
#8                                     Belarus         0.35    0.792
#9                                     Belgium         0.90    0.903
#10                                     Belize         1.35    0.699
#...

Или в tidyverse

library(tidyverse)
full_join(df2, df1 %>% mutate(Country = trimws(Country)))
...