При использовании merge.data.table()
я получаю ужасные сообщения об ошибках по поводу дублированных имен столбцов — как ни странно, только если существует более трех таблиц data.table. Напротив, стандартный merge()
с data.frame
версиями работает нормально.
Я использую этот код, чтобы достичь полного объединения, избегая разбитых по времени столбцов с merge.data.table()
.
Итак, в base R
Reduce(function(...) merge(..., all=TRUE), L)
работает нормально, тогда как в data.table
Reduce(function(...)
merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt, names))), L.dt)
выдает ошибку:
Error in merge.data.table(..., all = TRUE, by = Reduce(intersect, lapply(L.dt, :
x has some duplicated column name(s): p.l01.x,p.l02.x,p.l03.x,p.l05.x,p.l06.x,p.l07.x,
p.l08.x,p.l10.x,p.l11.x,p.l12.x,
p.l01.y,p.l02.y,p.l03.y,p.l05.y,p.l06.y,p.l07.y,
p.l08.y,p.l10.y,p.l11.y,p.l12.y. Please remove
or rename the duplicate(s) and try again.
In addition: Warning message:
In merge.data.table(..., all = TRUE, by = Reduce(intersect, lapply(L.dt, :
Error in merge.data.table(..., all = TRUE, by = Reduce(intersect, lapply(L.dt, :
x has some duplicated column name(s): p.l01.x,p.l02.x,p.l03.x,p.l05.x,p.l06.x,p.l07.x,
p.l08.x,p.l10.x,p.l11.x,p.l12.x,
p.l01.y,p.l02.y,p.l03.y,p.l05.y,p.l06.y,p.l07.y,p.l08.y,p.l10.y,p.l11.y,p.l12.y. Please remove
or rename the duplicate(s) and try again.
Обратите внимание, что проблема, похоже, накапливается...
Хотя
Reduce(function(...)
merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt[1:2], names))), L.dt[1:2])
Reduce(function(...)
merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt[1:3], names))), L.dt[1:3])
Reduce(function(...)
merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt[1:4], names))), L.dt[1:4])
все еще работает нормально,
Reduce(function(...)
merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt[1:5], names))), L.dt[1:5])
выдает предупреждение, и отсюда
Reduce(function(...)
merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt[1:6], names))), L.dt[1:6])
, наконец, начинается ошибка.
Таким образом, merge.data.table()
, кажется, работает нормально только до тех пор, пока число таблиц data.table не превышает 3? Есть идеи, как мне решить эту проблему?
Примечание: с простыми примерами данных ошибка не появляется. Так что это может быть вызвано моими данными. Но я понятия не имею, почему — стандартный merge()
работает нормально! Мои реальные данные имеют примерно dim = 15.000 * 500, некоторые имена переменных совпадают по всему списку, другие индивидуальны. Этот MCVE, похоже, достаточно хорошо воспроизводит проблему.
Данные и код для воспроизведения
# Column-name templates mimicking the real data: `nm` is a list of 18 character
# vectors of variable names. Some names recur across several vectors, others
# occur in only a few of them. The script below uses only nm[[1]]..nm[[6]],
# indexing positions 3:22 or 4:23, to rename the simulated columns.
nm <- list(c("p.d17m", "p.d17", "p.d29", "p.d31", "p.n03", "p.n04",
"p.n05", "p.d36", "p.d40", "p.d41", "p.d45", "p.d46", "p.d50",
"p.d51", "p.d55", "p.d56", "p.d60", "p.d61", "p.d65", "p.d66",
"p.d70", "p.d71", "p.n08"), c("p.d17m", "p.d17", "p.c44", "p.l01",
"p.l02", "p.l03", "p.l05", "p.l06", "p.l07", "p.l08", "p.l10",
"p.l11", "p.l12", "p.l13", "p.l15", "p.l16", "p.l17", "p.l18",
"p.l20", "p.l21", "p.l22", "p.l23", "p.l25"), c("p.d17m", "p.d17",
"p.c44", "p.l01", "p.l02", "p.l03", "p.l05", "p.l06", "p.l07",
"p.l08", "p.l10", "p.l11", "p.l12", "p.l13", "p.l15", "p.l16",
"p.l17", "p.l18", "p.l20", "p.l21", "p.l22", "p.l23", "p.l25"
), c("p.d17m", "p.d17", "p.c44", "p.l01", "p.l01r", "p.l02",
"p.l03", "p.l05", "p.l06", "p.l07", "p.l08", "p.l10", "p.l11",
"p.l12", "p.l13", "p.l15", "p.l16", "p.l17", "p.l18", "p.l20",
"p.l21", "p.l22", "p.l23"), c("p.d17m", "p.d17", "p.c44", "p.l01",
"p.l01r", "p.l02", "p.l03", "p.l05", "p.l06", "p.l07", "p.l08",
"p.l10", "p.l11", "p.l12", "p.l13", "p.l15", "p.l16", "p.l17",
"p.l18", "p.l20", "p.l21", "p.l22", "p.l23"), c("pdate.", "p.d17m",
"p.d17", "p.d160", "p.d161", "p.d162", "p.d163", "p.d164", "p.d165",
"p.d166", "p.d167", "p.c44", "p.l01", "p.l01r", "p.l02", "p.l03",
"p.l05", "p.l06", "p.l07", "p.l08", "p.l10", "p.l11", "p.l12"
), c("pdate.", "p.d17m", "p.d17", "p.d160", "p.d161", "p.d162",
"p.d163", "p.d164", "p.d165", "p.d166", "p.d167", "p.c44", "p.l01",
"p.l01r", "p.l02", "p.l03", "p.l05", "p.l06", "p.l07", "p.l08",
"p.l10", "p.l11", "p.l12"), c("pdate.", "p.d200", "p.d201", "p.d17m",
"p.d17", "p.d160", "p.d161", "p.d162", "p.d163", "p.d164", "p.d165",
"p.d166", "p.d167", "p.c44", "p.c47", "p.c48", "p.c49", "p.c50",
"p.l01", "p.l01r", "p.l02", "p.l03", "p.l05"), c("pdate.", "p.d17m",
"p.d17", "p.d160", "p.d161", "p.d162", "p.d163", "p.d164", "p.d165",
"p.d166", "p.d167", "p.c44", "p.c47", "p.c48", "p.c49", "p.c50",
"p.l01", "p.l01r", "p.l02", "p.l03", "p.l05", "p.l06", "p.l07"
), c("pdate.", "p.d17m", "p.d17", "p.d160", "p.d161", "p.d162",
"p.d163", "p.d164", "p.d165", "p.d166", "p.d167", "p.c44", "p.c47",
"p.c48", "p.c49", "p.c50", "p.l01", "p.l01r", "p.l02", "p.l03",
"p.l05", "p.l06", "p.l07"), c("plingu.", "pdate.", "p.d17m",
"p.d17", "p.d160", "p.d161", "p.d162", "p.d163", "p.d164", "p.d165",
"p.d166", "p.d167", "p.c44", "p.c47", "p.c48", "p.c49", "p.c50",
"p.c60", "p.c61", "p.c62", "p.c63", "p.c64", "p.c65"), c("hab_ch.",
"plingu.", "pdate.", "p.d17m", "p.d17", "p.d160", "p.d161", "p.d162",
"p.d163", "p.d164", "p.d165", "p.d166", "p.d167", "p.c44", "p.c47",
"p.c48", "p.c49", "p.c50", "p.c60", "p.c61", "p.c62", "p.c63",
"p.c64"), c("hab_ch.", "plingu.", "pdate.", "p.d17m", "p.d17",
"p.d160", "p.d161", "p.d162", "p.d163", "p.d164", "p.d165", "p.d166",
"p.d167", "p.c44", "p.c47", "p.c48", "p.c49", "p.c50", "p.c60",
"p.c61", "p.c62", "p.c63", "p.c64"), c("hab_ch.", "plingu.",
"pdate.", "p.d17m", "p.d17", "p.d160", "p.d161", "p.d162", "p.d163",
"p.d164", "p.d165", "p.d166", "p.d167", "p.c44", "p.c100", "p.c101",
"p.c102", "p.c103", "p.c47", "p.c48", "p.c49", "p.c50", "p.l01"
), c("hab_ch.", "plingu.", "pdate.", "p.d17m", "p.d17", "p.d110a",
"p.d110b", "p.d160", "p.d161", "p.d162", "p.d163", "p.d164",
"p.d165", "p.d166", "p.d167", "p.c44", "p.c47", "p.c48", "p.c49",
"p.c50", "p.l01", "p.l01r", "p.l02"), c("hab_ch.", "plingu.",
"pdate.", "p.d17m", "p.d17", "p.d110a", "p.d110b", "p.d160",
"p.d161", "p.d162", "p.d163", "p.d164", "p.d165", "p.d166", "p.d167",
"p.c44", "p.c47", "p.c48", "p.c49", "p.c50", "p.l01", "p.l01r",
"p.l02"), c("hab_ch.", "plingu.", "pdate.", "p.d17m", "p.d17",
"p.d110a", "p.d110b", "p.d160", "p.d161", "p.d162", "p.d163",
"p.d164", "p.d165", "p.d166", "p.d167", "p.c44", "p.c100", "p.c101",
"p.c102", "p.c103", "p.c47", "p.c48", "p.c49"), c("hab_ch.",
"plingu.", "pdate.", "p.d17m", "p.d17", "p.d110a", "p.d110b",
"p.d160", "p.d161", "p.d162", "p.d163", "p.d164", "p.d165", "p.d166",
"p.d167", "p.c44", "p.c47", "p.c48", "p.c49", "p.c50", "p.l01",
"p.l01r", "p.l02"))
n <- 10 # default number of rows for every simulated data frame

# Simulate one data frame of `n_rows` rows and 22 columns.
#
# Arguments:
#   x       Scalar used both as the RNG seed and as the constant value of the
#           `yr` column.
#   n_rows  Number of rows; defaults to the global `n`, so existing calls of
#           the form fun(x) behave exactly as before.
#
# Returns a data.frame with columns `id` (1..n_rows), `yr` (== x) and 20
# columns of standard-normal draws. The draws are reproducible for a given `x`
# because the seed is reset on every call.
#
# Side effect: calls set.seed(x), i.e. it resets the global RNG state.
fun <- function(x, n_rows = n) {
  set.seed(x)
  # Unnamed matrix columns receive automatic names when converted to a
  # data.frame; the script renames them afterwards.
  draws <- matrix(rnorm(n_rows * 20), nrow = n_rows)
  data.frame(cbind(id = seq_len(n_rows), yr = x, draws))
}
# Simulate six data frames (x = 1..6) and store them in a list named df1..df6.
tmp <- setNames(lapply(1:6, fun), paste0("df", 1:6))
list2env(tmp, globalenv()) # copy df1..df6 into the global environment
# Insert an extra column `foo` between `id` and the remaining columns of
# df3..df6. sample() continues the RNG stream left by the last set.seed() call
# inside fun(), so the order of these four statements determines the values.
df3 <- cbind(id = df3[, 1], foo = sample(1e3:2e3, nrow(df3), replace = TRUE), df3[, -1])
df4 <- cbind(id = df4[, 1], foo = sample(1e3:2e3, nrow(df4), replace = TRUE), df4[, -1])
df5 <- cbind(id = df5[, 1], foo = sample(1e3:2e3, nrow(df5), replace = TRUE), df5[, -1])
# Fixed: the row count used to be taken from df5; it must come from df6 itself.
# Both have the same number of rows here, so the generated data is unchanged.
df6 <- cbind(id = df6[, 1], foo = sample(1e3:2e3, nrow(df6), replace = TRUE), df6[, -1])
# Give the 20 simulated columns their names from `nm`. The index range is
# shifted by one for df3 onwards because of the inserted `foo` column.
names(df1)[3:22] <- nm[[1]][3:22]
names(df2)[3:22] <- nm[[2]][3:22]
names(df3)[4:23] <- nm[[3]][4:23]
names(df4)[4:23] <- nm[[4]][4:23]
names(df5)[4:23] <- nm[[5]][4:23]
# NOTE(review): this renames df5 a second time (now with nm[[6]]) and leaves
# df6 with its auto-generated column names -- it looks like a copy-paste slip
# for names(df6). Left unchanged on purpose: the duplicated names listed in the
# error message quoted earlier in this post (p.l01 ... p.l12) appear to match
# the code exactly as written, so changing it would alter the reproduced
# output. Confirm with the author before fixing.
names(df5)[4:23] <- nm[[6]][4:23]
# list of data.frames:
L <- list(df1, df2, df3, df4, df5, df6)
# list of data.tables, each keyed by id and yr:
library(data.table)
L.dt <- lapply(L, function(x) setkeyv(data.table(x), c("id", "yr")))
Также связанные: 1 , 2