R: более простой способ, чем объединение двух фреймов данных - PullRequest
0 голосов
/ 04 февраля 2020

Я пытаюсь создать 2 столбца в моем основном фрейме данных на основе дат и логики из моего фрейма данных задач (где совпадает номер заказа, Order). Оба приведенных ниже решения работают: я преобразую фрейм данных задач в широкий формат с помощью spread и присоединяю его к основному. Хотя приведенные ниже примеры выполняются довольно быстро, в моем реальном коде это занимает много памяти и требует много времени на вычисления (основной фрейм данных содержит 800 000 строк, а фрейм данных задач — около 20 задач и, вероятно, 1,5 млн строк).

Есть ли лучшее решение для получения столбцов G1 и G2 (пример 1) и Gate1_End и Gate2_End (пример 2) без необходимости присоединять фрейм данных задач к моему основному фрейму данных?

Пример 1

# Main table for example 1: one row per order, keyed by Order, with the
# order's location and animal kept as plain character columns.
maindf <- data.frame(
  Order = c(
    145, 158, 165, 148, 568,
    465, 248, 693, 357, 482
  ),
  Location = c(
    "US", "US", "Canada", "US", "France",
    "US", "Mexico", "Mexico", "UK", "UK"
  ),
  Animal = c(
    "Cow", "Dog", "Cat", "Dog", "Cat",
    "Cow", "Horse", "Dog", "Dog", "Cat"
  ),
  stringsAsFactors = FALSE
)

# Start-task log for example 1 in long format: one row per (order, start task),
# with the task date stored as an "m/d/yyyy" string. Each line of the task and
# date vectors holds the rows of a single order (order number in the trailing
# comment); Order repeats each key once per task row.
Tasksdf_Start <- data.frame(
  Tasks_Start = paste0(
    c(
      "RTQT", "RTQR", "QUOT", "QUOG", # 145
      "RTQT", "RTQR",                 # 158
      "QUOT",                         # 165
      "QUOG",                         # 148
      "RTQT", "QUOT",                 # 568
      "RTQT", "RTQR", "QUOT", "QUOG", # 465
      "RTQT", "RTQR",                 # 248
      "QUOT",                         # 693
      "QUOG",                         # 357
      "RTQT", "QUOT"                  # 482
    ),
    "_Start"
  ),
  Dates = c(
    "1/1/2020", "1/2/2020", "1/6/2020", "1/20/2020",  # 145
    "1/6/2020", "1/9/2020",                           # 158
    "1/14/2020",                                      # 165
    "1/17/2020",                                      # 148
    "1/19/2020", "1/8/2020",                          # 568
    "1/15/2020", "1/3/2020", "1/6/2020", "1/19/2020", # 465
    "1/22/2020", "1/25/2020",                         # 248
    "1/1/2020",                                       # 693
    "1/6/2020",                                       # 357
    "1/24/2020", "1/14/2020"                          # 482
  ),
  Order = rep(
    c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
    times = c(4, 2, 1, 1, 2, 4, 2, 1, 1, 2)
  ),
  stringsAsFactors = FALSE
)

# End-task log for example 1 in long format: one row per (order, end task),
# with the task date stored as an "m/d/yyyy" string. Each line of the task and
# date vectors holds the rows of a single order (order number in the trailing
# comment); Order repeats each key once per task row.
Tasksdf_End <- data.frame(
  Tasks_End = paste0(
    c(
      "CRDT", "CUST", "VEND", # 145
      "CUST",                 # 158
      "CRDT",                 # 165
      "CUST",                 # 148
      "CRDT", "CUST",         # 568
      "VEND",                 # 465
      "CRDT",                 # 248
      "VEND",                 # 693
      "VEND",                 # 357
      "CRDT"                  # 482
    ),
    "_End"
  ),
  Dates = c(
    "1/22/2020", "1/18/2020", "1/5/2020", # 145
    "1/15/2020",                          # 158
    "1/16/2020",                          # 165
    "1/18/2020",                          # 148
    "1/7/2020", "1/24/2020",              # 568
    "1/1/2020",                           # 465
    "1/18/2020",                          # 248
    "1/8/2020",                           # 693
    "1/4/2020",                           # 357
    "1/6/2020"                            # 482
  ),
  Order = rep(
    c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
    times = c(3, 1, 1, 1, 2, 1, 1, 1, 1, 1)
  ),
  stringsAsFactors = FALSE
)


# Build one wide table per task type (one column per task name), attach both
# to maindf, then derive per order:
#   G1 - earliest date among the start tasks
#   G2 - latest date among the end tasks
dfS <- left_join(maindf, Tasksdf_Start, by = "Order") %>%
  spread(Tasks_Start, Dates)

# Keep only the key and the task columns, so the final join does not duplicate
# Location/Animal as .x/.y pairs. Dropping by name instead of by position keeps
# this correct if maindf gains or reorders columns.
dfE <- left_join(maindf, Tasksdf_End, by = "Order") %>%
  spread(Tasks_End, Dates) %>%
  select(-Location, -Animal)

df <- left_join(dfS, dfE, by = "Order")

# The task dates arrive as "m/d/Y" strings. pmin()/pmax() on strings compare
# lexicographically (e.g. "1/5/2020" sorts after "1/22/2020"), which yields
# wrong earliest/latest dates, so parse them to Date before comparing.
# As a consequence the task columns, G1 and G2 are Date, not character.
start_cols <- c("QUOG_Start", "QUOT_Start", "RTQR_Start", "RTQT_Start")
end_cols <- c("CRDT_End", "CUST_End", "VEND_End")
task_cols <- c(start_cols, end_cols)
df[task_cols] <- lapply(df[task_cols], as.Date, format = "%m/%d/%Y")

# na.rm = TRUE: an order without a given task has NA in that column and must
# not poison the row-wise minimum/maximum.
df$G1 <- pmin(
  df$QUOG_Start, df$QUOT_Start, df$RTQR_Start, df$RTQT_Start,
  na.rm = TRUE
)

df$G2 <- pmax(df$CRDT_End, df$CUST_End, df$VEND_End, na.rm = TRUE)

Пример 2 (очень похож на пример 1, но с более сложной логикой)

# Main table for example 2: one row per order, keyed by Order, with the gate
# label the order is assigned to and its animal, both as character columns.
maindf <- data.frame(
  Order = c(
    145, 158, 165, 148, 568,
    465, 248, 693, 357, 482
  ),
  Gates = c(
    "Gate1", "Gate2", "Gate2", "Gate3", "Gate2",
    "Gate4", "Gate1", "Gate1", "Gate2", "Gate3"
  ),
  Animal = c(
    "Cow", "Dog", "Cat", "Dog", "Cat",
    "Cow", "Horse", "Dog", "Dog", "Cat"
  ),
  stringsAsFactors = FALSE
)

# Start-task log for example 2 in long format: one row per (order, start task),
# with the task date stored as an "m/d/yyyy" string. Each line of the task and
# date vectors holds the rows of a single order (order number in the trailing
# comment); Order repeats each key once per task row.
Tasksdf_Start <- data.frame(
  Tasks_Start = paste0(
    c(
      "RTQT", "RTQR", "QUOT", "QUOG", "CRDT", "CUST", "VEND", # 145
      "RTQT", "RTQR", "CUST", "CRDT",                         # 158
      "QUOT", "CRDT",                                         # 165
      "QUOG", "CUST", "CRDT",                                 # 148
      "RTQT", "QUOT", "CRDT", "CUST",                         # 568
      "RTQT", "RTQR", "QUOT", "QUOG", "VEND",                 # 465
      "RTQT", "RTQR", "CRDT",                                 # 248
      "QUOT", "VEND", "CRDT",                                 # 693
      "QUOG", "VEND",                                         # 357
      "RTQT", "QUOT", "CRDT"                                  # 482
    ),
    "_Start"
  ),
  Dates = c(
    "1/1/2020", "1/2/2020", "1/6/2020", "1/20/2020",
    "1/8/2020", "1/19/2020", "1/8/2020",                          # 145
    "1/6/2020", "1/9/2020", "1/1/2020", "1/9/2020",               # 158
    "1/14/2020", "1/1/2020",                                      # 165
    "1/17/2020", "1/1/2020", "1/1/2020",                          # 148
    "1/19/2020", "1/8/2020", "1/1/2020", "1/1/2020",              # 568
    "1/15/2020", "1/3/2020", "1/6/2020", "1/19/2020", "1/1/2020", # 465
    "1/22/2020", "1/25/2020", "1/1/2020",                         # 248
    "1/1/2020", "1/1/2020", "1/1/2020",                           # 693
    "1/6/2020", "1/1/2020",                                       # 357
    "1/24/2020", "1/14/2020", "1/1/2020"                          # 482
  ),
  Order = rep(
    c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
    times = c(7, 4, 2, 3, 4, 5, 3, 3, 2, 3)
  ),
  stringsAsFactors = FALSE
)

# End-task log for example 2 in long format: one row per (order, end task),
# with the task date stored as an "m/d/yyyy" string. Each line of the task and
# date vectors holds the rows of a single order (order number in the trailing
# comment); Order repeats each key once per task row.
Tasksdf_End <- data.frame(
  Tasks_End = paste0(
    c(
      "CRDT", "CUST", "VEND", # 145
      "CUST",                 # 158
      "CRDT",                 # 165
      "CUST",                 # 148
      "CRDT", "CUST",         # 568
      "VEND",                 # 465
      "CRDT",                 # 248
      "VEND",                 # 693
      "VEND",                 # 357
      "CRDT"                  # 482
    ),
    "_End"
  ),
  Dates = c(
    "1/22/2020", "1/18/2020", "1/5/2020", # 145
    "1/15/2020",                          # 158
    "1/16/2020",                          # 165
    "1/18/2020",                          # 148
    "1/7/2020", "1/24/2020",              # 568
    "1/1/2020",                           # 465
    "1/18/2020",                          # 248
    "1/8/2020",                           # 693
    "1/4/2020",                           # 357
    "1/6/2020"                            # 482
  ),
  Order = rep(
    c(145, 158, 165, 148, 568, 465, 248, 693, 357, 482),
    times = c(3, 1, 1, 1, 2, 1, 1, 1, 1, 1)
  ),
  stringsAsFactors = FALSE
)


# Build one wide table per task type (one column per task name), attach both
# to maindf, then derive per order:
#   Gate1_End - earliest date among the four RTQ*/QUO* start tasks, blank for
#               orders still at Gate0/Gate1
#   Gate2_End - latest date among the CRDT/CUST/VEND end tasks, only when
#               every opened Gate-2 task has also been closed, blank for
#               orders still at Gate0/Gate1/Gate2
dfS <- left_join(maindf, Tasksdf_Start, by = "Order") %>%
  spread(Tasks_Start, Dates)

# Keep only the key and the task columns, so the final join does not duplicate
# Gates/Animal as .x/.y pairs. Dropping by name instead of by position keeps
# this correct if maindf gains or reorders columns.
dfE <- left_join(maindf, Tasksdf_End, by = "Order") %>%
  spread(Tasks_End, Dates) %>%
  select(-Gates, -Animal)

df <- left_join(dfS, dfE, by = "Order")

# The task dates arrive as "m/d/Y" strings. pmin()/pmax() on strings compare
# lexicographically (e.g. "1/5/2020" sorts after "1/22/2020"), which yields
# wrong earliest/latest dates, so parse them to Date before comparing.
# As a consequence the task columns and the two *_End results are Date.
gate1_cols <- c("QUOG_Start", "QUOT_Start", "RTQR_Start", "RTQT_Start")
gate2_start_cols <- c("CRDT_Start", "CUST_Start", "VEND_Start")
gate2_end_cols <- c("CRDT_End", "CUST_End", "VEND_End")
task_cols <- c(gate1_cols, gate2_start_cols, gate2_end_cols)
df[task_cols] <- lapply(df[task_cols], as.Date, format = "%m/%d/%Y")

# if_else() needs both branches to share a type, hence as.Date(NA).
df$Gate1_End <- if_else(
  df$Gates == "Gate1" | df$Gates == "Gate0",
  as.Date(NA),
  pmin(
    df$QUOG_Start, df$QUOT_Start, df$RTQR_Start, df$RTQT_Start,
    na.rm = TRUE
  )
)

# Number of Gate-2 tasks that were started / finished for each order.
df$Gate2Open <- rowSums(!is.na(df[gate2_start_cols]))
df$Gate2Close <- rowSums(!is.na(df[gate2_end_cols]))

df$Gate2_End <- if_else(
  df$Gates == "Gate2" | df$Gates == "Gate1" | df$Gates == "Gate0",
  as.Date(NA),
  if_else(
    (df$Gate2Close - df$Gate2Open) == 0,
    pmax(df$CRDT_End, df$CUST_End, df$VEND_End, na.rm = TRUE),
    as.Date(NA)
  )
)

1 Ответ

1 голос
/ 04 февраля 2020

Используя mutate, вам не нужно создавать широкие фреймы данных, чтобы добавить сводный столбец:

ПРИМЕР 1:

# Earliest start-task date per order (G1).
# Dates are "m/d/Y" strings; min()/max() on strings compare lexicographically
# (e.g. "1/5/2020" sorts after "1/22/2020"), so parse them to Date first.
# Rows whose date is missing or unparseable are dropped before aggregating, so
# an order with no usable date simply gets NA after the join.
start <- Tasksdf_Start %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  filter(!is.na(Dates)) %>%
  group_by(Order) %>%
  summarise(G1 = min(Dates))

# Latest end-task date per order (G2).
end <- Tasksdf_End %>%
  mutate(Dates = as.Date(Dates, format = "%m/%d/%Y")) %>%
  filter(!is.na(Dates)) %>%
  group_by(Order) %>%
  summarise(G2 = max(Dates))

# Join everything. Each summary needs its own join: in
# full_join(start, end, by = "Order") the second table is bound positionally
# to the `copy` argument and is never joined, so G2 would be missing.
maindf <- maindf %>%
  full_join(start, by = "Order") %>%
  full_join(end, by = "Order")

ПРИМЕР 2:

# Task groups, matching the columns used by the wide-format reference
# implementation in the question.
gate1_tasks <- c("QUOG_Start", "QUOT_Start", "RTQR_Start", "RTQT_Start")
gate2_start_tasks <- c("CRDT_Start", "CUST_Start", "VEND_Start")
gate2_end_tasks <- c("CRDT_End", "CUST_End", "VEND_End")

# Dates are "m/d/Y" strings; min()/max() on strings compare lexicographically
# (e.g. "1/5/2020" sorts after "1/22/2020"), so they are parsed to Date first.
date_format <- "%m/%d/%Y"

# Earliest date among the Gate-1 start tasks of each order. Only the four
# RTQ*/QUO* tasks count; the CRDT/CUST/VEND starts belong to Gate 2.
start <- Tasksdf_Start %>%
  filter(Tasks_Start %in% gate1_tasks) %>%
  mutate(Dates = as.Date(Dates, format = date_format)) %>%
  filter(!is.na(Dates)) %>%
  group_by(Order) %>%
  summarise(Gate1_End = min(Dates))

# Number of Gate-2 tasks that were started for each order.
Gate2Open <- Tasksdf_Start %>%
  filter(Tasks_Start %in% gate2_start_tasks, !is.na(Dates)) %>%
  group_by(Order) %>%
  summarise(Gate2Open = n())

# Latest Gate-2 end date and number of Gate-2 tasks finished for each order.
end <- Tasksdf_End %>%
  filter(Tasks_End %in% gate2_end_tasks) %>%
  mutate(Dates = as.Date(Dates, format = date_format)) %>%
  filter(!is.na(Dates)) %>%
  group_by(Order) %>%
  summarise(Gate2_End = max(Dates), Gate2Close = n())

# Join and apply the gate conditions. No grouping is needed here (every
# expression is already row-wise), so the result is a plain ungrouped table.
maindf <- maindf %>%
  full_join(end, by = "Order") %>%
  full_join(start, by = "Order") %>%
  full_join(Gate2Open, by = "Order") %>%
  mutate(
    # Orders without any matching task have no row in the summaries; count
    # them as zero opened/closed tasks instead of NA.
    Gate2Open = coalesce(Gate2Open, 0L),
    Gate2Close = coalesce(Gate2Close, 0L),
    # Gate 1 has not ended yet for orders still at Gate0/Gate1.
    Gate1_End = if_else(
      Gates == "Gate1" | Gates == "Gate0",
      as.Date(NA),
      Gate1_End
    ),
    # Gate 2 has ended only for orders past Gate2 whose opened Gate-2 tasks
    # have all been closed.
    Gate2_End = if_else(
      Gates == "Gate2" | Gates == "Gate1" | Gates == "Gate0",
      as.Date(NA),
      if_else(Gate2Close == Gate2Open, Gate2_End, as.Date(NA))
    )
  )

Возможно, есть и более элегантный способ, но так вы избегаете spread().

...