Я пытаюсь создать 2 столбца в моём основном фрейме данных на основе дат и логики из моего фрейма данных задач (где номер заказа совпадает). Приведённые ниже 2 решения работают: в них я присоединяю фрейм данных задач к основному
и разворачиваю его в широкий формат с помощью spread; и хотя приведённые ниже примеры выполняются довольно быстро, в моём реальном коде это занимает много памяти и требует много времени на вычисления (основной фрейм данных содержит 800 000 строк, а фрейм данных задач — около 20 задач и, вероятно, 1,5 млн строк).
Есть ли лучшее решение для получения столбцов G1 и G2 (пример 1) и Gate1_End и Gate2_End (пример 2) без необходимости присоединять фрейм данных задач к моему основному фрейму данных?
Пример 1
# Example 1 main table: ten orders, each with a location and an animal.
maindf <- data.frame(
  Order    = c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
  Location = c(
    "US", "US", "Canada", "US", "France",
    "US", "Mexico", "Mexico", "UK", "UK"
  ),
  Animal   = c(
    "Cow", "Dog", "Cat", "Dog", "Cat",
    "Cow", "Horse", "Dog", "Dog", "Cat"
  ),
  stringsAsFactors = FALSE
)
# Example 1 start tasks in long form: one row per (order, start task), with
# the task date stored as "m/d/yyyy" text.
Tasksdf_Start <- data.frame(
  # Task codes; the common "_Start" suffix is appended once for all rows.
  Tasks_Start = paste0(
    c(
      "RTQT", "RTQR", "QUOT", "QUOG",  # order 145
      "RTQT", "RTQR",                  # order 158
      "QUOT",                          # order 165
      "QUOG",                          # order 148
      "RTQT", "QUOT",                  # order 568
      "RTQT", "RTQR", "QUOT", "QUOG",  # order 465
      "RTQT", "RTQR",                  # order 248
      "QUOT",                          # order 693
      "QUOG",                          # order 357
      "RTQT", "QUOT"                   # order 482
    ),
    "_Start"
  ),
  Dates = c(
    "1/1/2020", "1/2/2020", "1/6/2020", "1/20/2020",   # order 145
    "1/6/2020", "1/9/2020",                            # order 158
    "1/14/2020",                                       # order 165
    "1/17/2020",                                       # order 148
    "1/19/2020", "1/8/2020",                           # order 568
    "1/15/2020", "1/3/2020", "1/6/2020", "1/19/2020",  # order 465
    "1/22/2020", "1/25/2020",                          # order 248
    "1/1/2020",                                        # order 693
    "1/6/2020",                                        # order 357
    "1/24/2020", "1/14/2020"                           # order 482
  ),
  # Each order number is repeated once per task row listed above.
  Order = rep(
    c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
    times = c(4, 2, 1, 1, 2, 4, 2, 1, 1, 2)
  ),
  stringsAsFactors = FALSE
)
# Example 1 end tasks in long form: one row per (order, end task), with the
# task date stored as "m/d/yyyy" text.
Tasksdf_End <- data.frame(
  # Task codes; the common "_End" suffix is appended once for all rows.
  Tasks_End = paste0(
    c(
      "CRDT", "CUST", "VEND",  # order 145
      "CUST",                  # order 158
      "CRDT",                  # order 165
      "CUST",                  # order 148
      "CRDT", "CUST",          # order 568
      "VEND",                  # order 465
      "CRDT",                  # order 248
      "VEND",                  # order 693
      "VEND",                  # order 357
      "CRDT"                   # order 482
    ),
    "_End"
  ),
  Dates = c(
    "1/22/2020", "1/18/2020", "1/5/2020",  # order 145
    "1/15/2020",                           # order 158
    "1/16/2020",                           # order 165
    "1/18/2020",                           # order 148
    "1/7/2020", "1/24/2020",               # order 568
    "1/1/2020",                            # order 465
    "1/18/2020",                           # order 248
    "1/8/2020",                            # order 693
    "1/4/2020",                            # order 357
    "1/6/2020"                             # order 482
  ),
  # Each order number is repeated once per task row listed above.
  Order = rep(
    c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
    times = c(3, 1, 1, 1, 2, 1, 1, 1, 1, 1)
  ),
  stringsAsFactors = FALSE
)
# Build the wide per-order task table and derive the two summary dates.
#
# Reads:  maindf, Tasksdf_Start, Tasksdf_End (all keyed by `Order`).
# Writes: dfS - maindf plus one Date column per start task,
#         dfE - one row per order with one Date column per end task,
#         df  - dfS joined with dfE, plus
#               G1 = earliest start-task date of the order,
#               G2 = latest end-task date of the order.
#
# `Dates` holds "m/d/yyyy" text. It is parsed to Date before any comparison:
# pmin()/pmax() on character vectors compare strings, so "1/5/2020" would
# rank above "1/22/2020".
#
# The task tables are reshaped on their own and joined to maindf afterwards,
# so the maindf columns are not repeated for every task row during the
# reshape, and the end-task columns no longer have to be picked by position.
dfS <- left_join(
  maindf,
  Tasksdf_Start %>%
    mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
    pivot_wider(
      names_from = Tasks_Start,
      values_from = Dates,
      names_sort = TRUE  # task columns in alphabetical order
    ),
  by = "Order"
) %>%
  arrange(Order)  # rows sorted by order number

dfE <- Tasksdf_End %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  pivot_wider(
    names_from = Tasks_End,
    values_from = Dates,
    names_sort = TRUE
  )

df <- left_join(dfS, dfE, by = "Order")

# na.rm = TRUE: not every order has every task, so missing tasks are skipped;
# a row with no task dates at all yields NA.
df$G1 <- pmin(
  df$QUOG_Start, df$QUOT_Start, df$RTQR_Start, df$RTQT_Start,
  na.rm = TRUE
)
df$G2 <- pmax(df$CRDT_End, df$CUST_End, df$VEND_End, na.rm = TRUE)
Пример 2 (очень похож на пример 1, но с более сложной логикой)
# Example 2 main table: ten orders, each with its current gate and an animal.
maindf <- data.frame(
  Order  = c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
  Gates  = c(
    "Gate1", "Gate2", "Gate2", "Gate3", "Gate2",
    "Gate4", "Gate1", "Gate1", "Gate2", "Gate3"
  ),
  Animal = c(
    "Cow", "Dog", "Cat", "Dog", "Cat",
    "Cow", "Horse", "Dog", "Dog", "Cat"
  ),
  stringsAsFactors = FALSE
)
# Example 2 start tasks in long form: one row per (order, start task), with
# the task date stored as "m/d/yyyy" text.
Tasksdf_Start <- data.frame(
  # Task codes; the common "_Start" suffix is appended once for all rows.
  Tasks_Start = paste0(
    c(
      "RTQT", "RTQR", "QUOT", "QUOG", "CRDT", "CUST", "VEND",  # order 145
      "RTQT", "RTQR", "CUST", "CRDT",                          # order 158
      "QUOT", "CRDT",                                          # order 165
      "QUOG", "CUST", "CRDT",                                  # order 148
      "RTQT", "QUOT", "CRDT", "CUST",                          # order 568
      "RTQT", "RTQR", "QUOT", "QUOG", "VEND",                  # order 465
      "RTQT", "RTQR", "CRDT",                                  # order 248
      "QUOT", "VEND", "CRDT",                                  # order 693
      "QUOG", "VEND",                                          # order 357
      "RTQT", "QUOT", "CRDT"                                   # order 482
    ),
    "_Start"
  ),
  Dates = c(
    # order 145
    "1/1/2020", "1/2/2020", "1/6/2020", "1/20/2020",
    "1/8/2020", "1/19/2020", "1/8/2020",
    # order 158
    "1/6/2020", "1/9/2020", "1/1/2020", "1/9/2020",
    # order 165
    "1/14/2020", "1/1/2020",
    # order 148
    "1/17/2020", "1/1/2020", "1/1/2020",
    # order 568
    "1/19/2020", "1/8/2020", "1/1/2020", "1/1/2020",
    # order 465
    "1/15/2020", "1/3/2020", "1/6/2020", "1/19/2020", "1/1/2020",
    # order 248
    "1/22/2020", "1/25/2020", "1/1/2020",
    # order 693
    "1/1/2020", "1/1/2020", "1/1/2020",
    # order 357
    "1/6/2020", "1/1/2020",
    # order 482
    "1/24/2020", "1/14/2020", "1/1/2020"
  ),
  # Each order number is repeated once per task row listed above.
  Order = rep(
    c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
    times = c(7, 4, 2, 3, 4, 5, 3, 3, 2, 3)
  ),
  stringsAsFactors = FALSE
)
# Example 2 end tasks in long form: one row per (order, end task), with the
# task date stored as "m/d/yyyy" text.
Tasksdf_End <- data.frame(
  # Task codes; the common "_End" suffix is appended once for all rows.
  Tasks_End = paste0(
    c(
      "CRDT", "CUST", "VEND",  # order 145
      "CUST",                  # order 158
      "CRDT",                  # order 165
      "CUST",                  # order 148
      "CRDT", "CUST",          # order 568
      "VEND",                  # order 465
      "CRDT",                  # order 248
      "VEND",                  # order 693
      "VEND",                  # order 357
      "CRDT"                   # order 482
    ),
    "_End"
  ),
  Dates = c(
    "1/22/2020", "1/18/2020", "1/5/2020",  # order 145
    "1/15/2020",                           # order 158
    "1/16/2020",                           # order 165
    "1/18/2020",                           # order 148
    "1/7/2020", "1/24/2020",               # order 568
    "1/1/2020",                            # order 465
    "1/18/2020",                           # order 248
    "1/8/2020",                            # order 693
    "1/4/2020",                            # order 357
    "1/6/2020"                             # order 482
  ),
  # Each order number is repeated once per task row listed above.
  Order = rep(
    c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
    times = c(3, 1, 1, 1, 2, 1, 1, 1, 1, 1)
  ),
  stringsAsFactors = FALSE
)
# Build the wide per-order task table and derive the gate end dates.
#
# Reads:  maindf, Tasksdf_Start, Tasksdf_End (all keyed by `Order`).
# Writes: dfS - maindf plus one Date column per start task,
#         dfE - one row per order with one Date column per end task,
#         df  - dfS joined with dfE, plus Gate1_End, Gate2Open, Gate2Close
#               and Gate2_End.
#
# `Dates` holds "m/d/yyyy" text. It is parsed to Date before any comparison:
# pmin()/pmax() on character vectors compare strings, so "1/5/2020" would
# rank above "1/22/2020".
#
# The task tables are reshaped on their own and joined to maindf afterwards,
# so the maindf columns are not repeated for every task row during the
# reshape, and the end-task columns no longer have to be picked by position.
dfS <- left_join(
  maindf,
  Tasksdf_Start %>%
    mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
    pivot_wider(
      names_from = Tasks_Start,
      values_from = Dates,
      names_sort = TRUE  # task columns in alphabetical order
    ),
  by = "Order"
) %>%
  arrange(Order)  # rows sorted by order number

dfE <- Tasksdf_End %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  pivot_wider(
    names_from = Tasks_End,
    values_from = Dates,
    names_sort = TRUE
  )

df <- left_join(dfS, dfE, by = "Order")

# Gate1_End: NA while the order is still at Gate0/Gate1, otherwise the
# earliest of the four quote-related start dates (missing tasks are skipped).
# The NA placeholder must be a Date because if_else() requires both branches
# to share a type.
df$Gate1_End <- if_else(
  df$Gates == "Gate1" | df$Gates == "Gate0",
  as.Date(NA),
  pmin(
    df$QUOG_Start, df$QUOT_Start, df$RTQR_Start, df$RTQT_Start,
    na.rm = TRUE
  )
)

# Per order, count how many of the CRDT/CUST/VEND tasks have a start date
# (Gate2Open) and how many have an end date (Gate2Close).
df <- df %>%
  mutate(
    Gate2Open = rowSums(
      !is.na(select(., any_of(c("CRDT_Start", "CUST_Start", "VEND_Start"))))
    ),
    Gate2Close = rowSums(
      !is.na(select(., any_of(c("CRDT_End", "CUST_End", "VEND_End"))))
    )
  )

# Gate2_End: NA while the order is at Gate0/Gate1/Gate2. Past Gate2 it is the
# latest end date, but only when the number of ended tasks equals the number
# of started tasks; otherwise NA.
df$Gate2_End <- if_else(
  df$Gates == "Gate2" | df$Gates == "Gate1" | df$Gates == "Gate0",
  as.Date(NA),
  if_else(
    df$Gate2Close == df$Gate2Open,
    pmax(df$CRDT_End, df$CUST_End, df$VEND_End, na.rm = TRUE),
    as.Date(NA)
  )
)