Объединение фреймов данных с пропущенными значениями - PullRequest
0 голосов
/ 16 мая 2018

У меня есть несколько фреймов данных с данными одного и того же опроса. Я хочу объединить их для анализа. Фреймы данных содержат как уникальные переменные, так и две переменные (ID и Contest_no), общие для всех фреймов данных; эти две переменные содержат идентификатор респондента и номер конкурса (1, 2, 3, поскольку респондентам задавали одни и те же вопросы три раза).

Сложность состоит в том, что во фреймах данных есть пропуски: в некоторых из них отсутствуют строки для отдельных сочетаний ID и Contest_no:

# Three toy survey tables. ID and Contest_no are the shared key columns; every
# other column is unique to its table. All columns are factors.

# DF1 covers all four (ID, Contest_no) combinations.
DF1 <- data.frame(
  V1 = c("A", "B", "C", "D"),
  V2 = c("A", "B", "C", "D"),
  ID = c("x1", "x1", "y2", "y2"),
  Contest_no = c("1", "2", "1", "2"),
  stringsAsFactors = TRUE
)

# DF2 has no row for ID "x1", contest "2".
DF2 <- data.frame(
  V3 = c("A", "C", "D"),
  V4 = c("A", "C", "D"),
  ID = c("x1", "y2", "y2"),
  Contest_no = c("1", "1", "2"),
  stringsAsFactors = TRUE
)

# DF3 has no row for ID "y2", contest "2".
DF3 <- data.frame(
  V5 = c("A", "B", "C"),
  V6 = c("A", "B", "C"),
  ID = c("x1", "x1", "y2"),
  Contest_no = c("1", "2", "1"),
  stringsAsFactors = TRUE
)

В результате идентификаторы респондентов и номера конкурсов в разных фреймах данных не совпадают построчно. Я хочу сопоставить данные по ID респондента и номеру конкурса, чтобы объединенный фрейм данных выглядел следующим образом:

# Desired result: one row per (ID, Contest_no) combination, with NA where a
# source table has no row for that combination. All columns are factors.
DF_merged <- data.frame(
  V1 = c("A", "B", "C", "D"),
  V2 = c("A", "B", "C", "D"),
  V3 = c("A", NA, "C", "D"),
  V4 = c("A", NA, "C", "D"),
  V5 = c("A", "B", "C", NA),
  V6 = c("A", "B", "C", NA),
  ID = c("x1", "x1", "y2", "y2"),
  Contest_no = c("1", "2", "1", "2"),
  stringsAsFactors = TRUE
)

Я думал, что full_join справится, но DF_merged <- full_join(DF1, DF2, DF3, by="ID") даст мне бессмысленные результаты.

Как можно объединить такие разнородные данные?

Новый обновленный пример (для решения проблемы дублирования строк). В этом примере вообще нет пропущенных значений, и оба кадра данных имеют одинаковое количество строк, но при объединении строки размножаются. Сначала два кадра данных, которые нужно объединить:

df1:

# dput() output for df1 (assign this expression to `df1` to reproduce): 12 rows.
# Every (ID, Contest_no) pair occurs twice, once per Option level, so ID and
# Contest_no alone do not identify a single row.
structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"), 
    Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L, 
    3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option = c(0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
    0L, 1L), Combination = structure(c(5L, 5L, 6L, 6L, 4L, 4L, 
    2L, 2L, 1L, 1L, 3L, 3L), .Label = c("V133", "V181", "V234", 
    "V252", "V32", "V67"), class = "factor"), Attribute1 = structure(c(1L, 
    1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2 = structure(c(1L, 
    2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3 = structure(c(2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 
    2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4 = structure(c(2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 
    1L, 2L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor")), .Names = c("ID", "Contest_no", "Option", 
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3", 
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))

df2:

# dput() output for df2 (assign this expression to `df2` to reproduce): 12 rows
# with the same ID, Contest_no and Option values, in the same order, as df1.
# The remaining columns share df1's names but hold different values.
structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 
3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"), 
    Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L, 
    3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 
    0L, 1L), Combination = structure(c(6L, 6L, 4L, 4L, 1L, 1L, 
    3L, 3L, 5L, 5L, 2L, 2L), .Label = c("V150", "V249", "V252", 
    "V29", "V56", "V77"), class = "factor"), Attribute1 = structure(c(2L, 
    2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2 = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3 = structure(c(2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 
    2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4 = structure(c(2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 
    1L, 1L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor")), .Names = c("ID", "Contest_no", "Option", 
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3", 
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))

и теперь неудачная попытка объединить два кадра данных:

# The failing attempt, kept as asked: the key is only ID + Contest_no, which
# matches two rows on each side per pair, so every pair yields 2 x 2 rows.
df_merge_attempt <- dplyr::full_join(
  df1,
  df2,
  by = c("ID", "Contest_no")
)

Результат:

# dput() output of the failed join: 24 rows instead of the expected 12.
# Each of the 6 (ID, Contest_no) pairs has 2 rows in df1 and 2 rows in df2, so
# the join emits 2 x 2 = 4 rows per pair; Option.x and Option.y disagree on
# half of them. Same-named non-key columns carry .x (df1) / .y (df2) suffixes.
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L
), .Label = c("EE1", "EE101", "EE102"), class = "factor"), Contest_no = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 
2L, 2L, 2L, 3L, 3L, 3L, 3L), Option.x = structure(c(1L, 1L, 2L, 
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 
2L, 1L, 1L, 2L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option.x = c(0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 
    1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L), 
    Combination.x = structure(c(5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 
    4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 
    3L), .Label = c("V133", "V181", "V234", "V252", "V32", "V67"
    ), class = "factor"), Attribute1.x = structure(c(1L, 1L, 
    1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 
    2L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2.x = structure(c(1L, 
    1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 
    2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 
    1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
    2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
    2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
    2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor"), Option.y = structure(c(1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option.y = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 
    1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L), 
    Combination.y = structure(c(6L, 6L, 6L, 6L, 4L, 4L, 4L, 4L, 
    1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 2L, 2L, 2L, 
    2L), .Label = c("V150", "V249", "V252", "V29", "V56", "V77"
    ), class = "factor"), Attribute1.y = structure(c(2L, 2L, 
    2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2.y = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3.y = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 
    2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4.y = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
    1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
    2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor")), class = "data.frame", row.names = c(NA, 
-24L), .Names = c("ID", "Contest_no", "Option.x", "Chosen_option.x", 
"Combination.x", "Attribute1.x", "Attribute2.x", "Attribute3.x", 
"Attribute4.x", "Option.y", "Chosen_option.y", "Combination.y", 
"Attribute1.y", "Attribute2.y", "Attribute3.y", "Attribute4.y"
))

1 Ответ

0 голосов
/ 16 мая 2018

Вы можете попробовать dplyr::full_join с аргументом by=c("ID","Contest_no") как:

library(dplyr)
# Chain two full joins, each keyed on both shared columns, so rows are matched
# by respondent AND contest; combinations missing from a table become NA.
df1 <- DF1 %>%
  full_join(DF2, by = c("ID", "Contest_no")) %>%
  full_join(DF3, by = c("ID", "Contest_no"))
df1
#  V1 V2   V3   V4   V5   V6 ID Contest_no
#1  A  A    A    A    A    A x1          1
#2  B  B <NA> <NA>    B    B x1          2
#3  C  C    C    C    C    C y2          1
#4  D  D    D    D <NA> <NA> y2          2

Обновлено: ответ изменён — в full_join в качестве ключа добавлен ещё один столбец, Option:

# Join the updated example's data frames (df1, df2) on all three shared key
# columns. Each (ID, Contest_no) pair has one row per Option, so Option must be
# part of the key; without it every pair fans out into 2 x 2 rows.
# The earlier toy frames DF1/DF2 have no Option column, so they cannot be used
# with this key. The result gets its own name so the input df1 is not
# overwritten. Same-named non-key columns receive .x / .y suffixes.
df_merged <- full_join(df1, df2, by = c("ID", "Contest_no", "Option"))

Примечание: мне пришлось изменить вызов dplyr в ответе в соответствии с тем, что предложил @Gregor, чтобы получить ожидаемый результат.

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...