Один из более быстрых способов — использовать crossing() из tidyr, чтобы получить все комбинации строк df1 и df2, вычислить флаг для каждой строки нового фрейма данных, а затем с помощью aggregate() свернуть результат обратно до одной строки на каждую запись df1. Этот метод предполагает, что в df1 нет повторяющихся записей. Если они есть, они будут объединены.
> df1
DateTime REG
1 2018-07-01 12:00:00 NHDG
2 2018-07-12 11:55:23 NSKR
> df2
StartDateTime EndDateTime Direction
1 2018-07-01 07:55:11 2018-07-01 12:01:56 W
2 2018-07-12 11:00:23 2018-07-12 11:45:00 E
> # Create a DF with rows for each combination of df1 rows with df2 rows
> tmp <- crossing(df1, df2)
> tmp
DateTime REG StartDateTime EndDateTime Direction
1 2018-07-01 12:00:00 NHDG 2018-07-01 07:55:11 2018-07-01 12:01:56 W
2 2018-07-01 12:00:00 NHDG 2018-07-12 11:00:23 2018-07-12 11:45:00 E
3 2018-07-12 11:55:23 NSKR 2018-07-01 07:55:11 2018-07-01 12:01:56 W
4 2018-07-12 11:55:23 NSKR 2018-07-12 11:00:23 2018-07-12 11:45:00 E
> # Create a new column for the flag
> tmp$flag <- tmp$DateTime >= tmp$StartDateTime & tmp$DateTime <= tmp$EndDateTime
> tmp
DateTime REG StartDateTime EndDateTime Direction flag
1 2018-07-01 12:00:00 NHDG 2018-07-01 07:55:11 2018-07-01 12:01:56 W TRUE
2 2018-07-01 12:00:00 NHDG 2018-07-12 11:00:23 2018-07-12 11:45:00 E FALSE
3 2018-07-12 11:55:23 NSKR 2018-07-01 07:55:11 2018-07-01 12:01:56 W FALSE
4 2018-07-12 11:55:23 NSKR 2018-07-12 11:00:23 2018-07-12 11:45:00 E FALSE
> # Drop the unwanted columns
> tmp <- tmp[,c("DateTime", "REG", "flag")]
> tmp
DateTime REG flag
1 2018-07-01 12:00:00 NHDG TRUE
2 2018-07-01 12:00:00 NHDG FALSE
3 2018-07-12 11:55:23 NSKR FALSE
4 2018-07-12 11:55:23 NSKR FALSE
> # Sum all flags for a given df1 date and limit total to 1
> df1 <- aggregate(flag ~ DateTime + REG, tmp, FUN = function(x) {min(1, sum(x))})
> df1
DateTime REG flag
1 2018-07-01 12:00:00 NHDG 1
2 2018-07-12 11:55:23 NSKR 0
>
Результаты запуска на большом количестве дат в сравнении с вашим исходным циклом for и приведённым выше методом sapply():
Original for loop method: 6.282 sec elapsed
sapply() method: 1.65 sec elapsed
crossing() and aggregate(): 0.385 sec elapsed
Полный скрипт здесь:
#!/usr/bin/env Rscript
library(tictoc)
library(tidyr)
# Setup: generate a lot of dates for performance comparison
# One timestamp per 24 hours (the step is given in seconds) from beg to end.
beg <- as.POSIXct("2018-07-01 12:00:00")
end <- as.POSIXct("2100-12-01 12:00:00")
dates <- seq(beg, end, by = 60 * 60 * 24)
# Small two-row example, kept for reference:
#df1 <- data.frame(c("2018-07-01 12:00:00", "2018-07-12 11:55:23"), c("NHDG","NSKR"))
# length.out makes the alternating REG labels exactly as long as `dates`.
# rep(..., length(dates) / 2) is one element short whenever the number of
# dates is odd, and data.frame() then stops on the differing row counts.
df1 <- data.frame(dates, rep(c("NHDG", "NSKR"), length.out = length(dates)))
# Two sample intervals (start, end) with a direction label each
df2 <- data.frame(
  c("2018-07-01 07:55:11", "2018-07-12 11:00:23"),
  c("2018-07-01 12:01:56", "2018-07-12 11:45:00"),
  c("W", "E")
)
colnames(df1) <- c("DateTime", "REG")
colnames(df2) <- c("StartDateTime", "EndDateTime", "Direction")
# Make every timestamp column POSIXct so the range checks compare times,
# not strings
df1$DateTime <- as.POSIXct(df1$DateTime, tz = "America/Los_Angeles")
df2$StartDateTime <- as.POSIXct(df2$StartDateTime, tz = "America/Los_Angeles")
df2$EndDateTime <- as.POSIXct(df2$EndDateTime, tz = "America/Los_Angeles")
# Original (fixed)
# Baseline: for every df1 row, scan the df2 intervals and set flag to 1 as
# soon as DateTime falls inside one of them (both bounds inclusive).
tic(sprintf("%30s", "Original for loop method"))
# Start every row at 0 so the column exists before the loop touches it
df1$flag <- rep(0, nrow(df1))
# seq_len() gives an empty sequence for zero rows; 1:nrow() would give c(1, 0)
for (i in seq_len(nrow(df1))) {
  for (j in seq_len(nrow(df2))) {
    # Both sides are single values, so use the scalar, short-circuiting &&
    if (df1$DateTime[i] >= df2$StartDateTime[j] &&
        df1$DateTime[i] <= df2$EndDateTime[j]) {
      df1$flag[i] <- 1
      # One containing interval is enough; skip the remaining ones
      break
    }
  }
}
toc()
# Keep a snapshot for the final comparison, then drop the flag so the next
# method starts from the same df1
result1 <- df1
df1$flag <- NULL
# Sapply
# For each timestamp, count the df2 intervals that contain it (bounds
# inclusive) and reduce that count to 1 (at least one) or 0 (none).
# sapply() is kept on purpose: it is the method this timing is labelled with.
tic(sprintf("%30s", "sapply() method"))
df1$flag <- sapply(
  df1$DateTime,
  function(x) {
    as.integer(sum(x >= df2$StartDateTime & x <= df2$EndDateTime) > 0)
  }
)
toc()
# Keep a snapshot for the final comparison, then drop the flag so the next
# method starts from the same df1
result2 <- df1
df1$flag <- NULL
# Aggregate
# Vectorised variant: build every (df1 row, df2 row) pair, test each pair
# once, then collapse the pairs back to one row per DateTime/REG combination.
tic(sprintf("%30s", "crossing() and aggregate()"))
# All combinations of df1 rows with df2 rows
tmp <- crossing(df1, df2)
# TRUE where the timestamp lies inside the paired interval (bounds inclusive)
tmp$flag <- with(tmp, StartDateTime <= DateTime & DateTime <= EndDateTime)
# Only the df1 columns and the flag are needed from here on
tmp <- tmp[, c("DateTime", "REG", "flag")]
# Count the hits per DateTime/REG group, capped at 1
df1 <- aggregate(
  flag ~ DateTime + REG,
  data = tmp,
  FUN = function(hits) min(1, sum(hits))
)
# Put the rows back into date order
df1 <- df1[order(df1$DateTime), ]
# Reset the row names (for comparison below)
rownames(df1) <- NULL
toc()
result3 <- df1
# Prove that results are the same
# all.equal() returns TRUE on a match but a character vector describing the
# differences otherwise, so it has to be wrapped in isTRUE(); negating the
# character vector directly is an error.
if (!isTRUE(all.equal(result1, result2))) {
  print("MISMATCH")
  stop("result1 and result2 differ", call. = FALSE)
}
if (!isTRUE(all.equal(result1, result3))) {
  # The message must be a quoted string; a bare MISMATCH is an undefined object
  print("MISMATCH")
  stop("result1 and result3 differ", call. = FALSE)
}
print("PASS")