Другой вариант с использованием data.table :
f4 <- function(DT) {
setindex(DT, ID)
DT[, rn := .I]
uid <- DT[,.(ID=unique(ID), V=TRUE)]
rows <- rbindlist(lapply(cols, function(x) {
merge(
DT[, V := !is.na(get(x))][uid, on=c("ID", "V"), mult="first", .(ID, S=rn)],
DT[uid, on=c("ID", "V"), mult="last", .(ID, E=rn)],
by="ID")
}))[, .(S=max(S), E=min(E)) , ID]
DT[rows, on=.(ID, rn>=S, rn<=E), .SD]
}
f4(df1)
вывод:
ID Year V1 V2 rn V
1: C1001 2001 45 70 6 TRUE
2: C1001 2002 74 78 6 TRUE
3: C1001 2003 48 9 6 TRUE
4: C1001 2004 27 32 6 TRUE
5: C1001 2005 39 3 6 TRUE
6: C1001 2006 NA 89 6 TRUE
7: C1001 2007 22 2 6 TRUE
8: C1001 2008 56 12 6 TRUE
9: C1001 2009 29 34 6 TRUE
10: C1001 2010 30 53 6 TRUE
11: C1001 2011 61 46 6 TRUE
12: C1001 2012 23 42 6 TRUE
13: C1008 2002 NA 95 24 TRUE
14: C1008 2003 71 64 24 TRUE
15: C1008 2004 41 92 24 TRUE
16: C1008 2005 45 28 24 TRUE
17: C1008 2006 74 59 24 TRUE
18: C1008 2007 45 16 24 TRUE
19: C1008 2008 57 64 24 TRUE
20: C1008 2009 NA 35 24 TRUE
21: C1008 2010 NA 2 24 TRUE
22: C1008 2011 32 27 24 TRUE
23: C1008 2012 69 41 24 TRUE
24: C1009 2005 30 24 44 TRUE
25: C1009 2006 43 49 44 TRUE
26: C1009 2007 50 NA 44 FALSE
27: C1009 2008 28 72 44 TRUE
28: C1009 2009 43 20 44 TRUE
29: C1012 1996 36 73 52 TRUE
30: C1012 1997 52 4 52 TRUE
31: C1012 1998 67 14 52 TRUE
32: C1012 1999 39 59 52 TRUE
33: C1012 2000 56 12 52 TRUE
34: C1012 2001 25 92 52 TRUE
35: C1012 2002 26 NA 52 FALSE
36: C1012 2003 73 11 52 TRUE
37: C1012 2004 39 50 52 TRUE
38: C1012 2005 65 89 52 TRUE
39: C1012 2006 70 21 52 TRUE
40: C1012 2007 54 86 52 TRUE
41: C1012 2008 37 70 52 TRUE
42: C1012 2009 66 22 52 TRUE
ID Year V1 V2 rn V
data:
library(data.table)
df1 <- data.frame(ID=(rep(c("C1001","C1008","C1009","C1012"),each=17)),
Year=(rep(c(1996:2012),4)), V1=(floor(runif(68,20,75))), V2=
(floor(runif(68,1,100))))
df1[1:5,3]<-NA
df1[18:23,4]<-NA
df1[35:42,4]<-NA
df1[49:51,3]<-NA
df1[66:68,3]<-NA
set.seed(123)
df1$V1[rbinom(68,1,0.1)==1]<-NA
df1$V2[rbinom(68,1,0.1)==1]<-NA
setDT(df1)[, rn := .I]
cols <- paste0("V", 1:5)
временный код для данных с большим количеством строк и большим числом групп:
set.seed(0L)
if ((BIGDATA <- TRUE)) {
nr <- 1e7
nc <- 5
nid <- 1e5
dat <- data.table(ID=sample(nid, nr, TRUE),
as.data.table(matrix(sample(c(1, NA), nr*nc, TRUE), ncol=nc)),
key="ID")
cols <- paste0("V", 1:5)
} else {
df1 <- data.frame(ID=(rep(c("C1001","C1008","C1009","C1012"),each=17)),
Year=(rep(c(1996:2012),4)), V1=(floor(runif(68,20,75))), V2=
(floor(runif(68,1,100))))
df1[1:5,3]<-NA
df1[18:23,4]<-NA
df1[35:42,4]<-NA
df1[49:51,3]<-NA
df1[66:68,3]<-NA
set.seed(123)
df1$V1[rbinom(68,1,0.1)==1]<-NA
df1$V2[rbinom(68,1,0.1)==1]<-NA
dat <- setDT(df1)[, rn := .I]
cols <- paste0("V", 1:2)
}
DT0 <- copy(dat)
DT1 <- copy(dat)
DT2 <- copy(dat)
DT3 <- copy(dat)
DT4 <- copy(dat)
f0 <- function(DT) {
DT[DT[, Reduce('&',
lapply(.SD, function(x) {
r <- rleid(x)
!(r %in% c(1, max(r)) & is.na(x))
})),
ID,
.SDcols=cols]$V1]
}
f1 <- function(DT) {
DT[, c("rn", "rid") := .(.I, rowid(ID))][.N:1L, rev_rid := rowid(ID)]
for (x in cols) {
idx <- DT[is.na(get(x)) & ID %in% DT[is.na(get(x)) & (rid==1L | rev_rid==1L), ID],
if (rid[1L]==1L || rev_rid[.N]==1L) rn,
cumsum(c(0L, diff(rn) > 1L))]$V1
DT <- DT[!rn %in% idx]
}
DT
}
f2 <- function(DT) {
DT[, c("rn", "rid") := .(.I, rowid(ID))][.N:1L, rev_rid := rowid(ID)]
for (x in cols) {
DT <- DT[!rn %in% DT[is.na(get(x)),
if (rid[1L]==1L || rev_rid[.N]==1L) rn,
cumsum(c(0L, diff(rn) > 1L))]$V1]
}
DT
}
f3 <- function(DT) {
DT[, rn := .I]
rows <- DT[, transpose(lapply(.SD, function(x) c(rn[match(TRUE, !is.na(x))],
rev(rn)[match(TRUE, !is.na(rev(x)))]))),
ID, .SDcols=cols][, .(S=max(V1), E=min(V2)) , ID]
DT[rows, on=.(ID, rn>=S, rn<=E), .SD]
}
f4 <- function(DT) {
setindex(DT, ID)
DT[, rn := .I]
uid <- DT[,.(ID=unique(ID), V=TRUE)]
rows <- rbindlist(lapply(cols, function(x) {
merge(
DT[, V := !is.na(get(x))][uid, on=c("ID", "V"), mult="first", .(ID, S=rn)],
DT[uid, on=c("ID", "V"), mult="last", .(ID, E=rn)],
by="ID")
}))[, .(S=max(S), E=min(E)) , ID]
DT[rows, on=.(ID, rn>=S, rn<=E), .SD]
}
microbenchmark::microbenchmark(f0(DT0), f1(DT1), f2(DT2), f3(DT3), f4(DT4), times=3L)
тайминги:
Unit: seconds
expr min lq mean median uq max neval
f0(DT0) 8.874985 8.950951 8.993281 9.026917 9.052429 9.077942 3
f1(DT1) 16.249656 16.337013 16.657910 16.424370 16.862038 17.299706 3
f2(DT2) 18.225748 18.284212 18.391198 18.342676 18.473922 18.605169 3
f3(DT3) 10.361079 10.612313 10.698897 10.863548 10.867806 10.872063 3
f4(DT4) 3.106936 3.137846 3.173174 3.168755 3.206293 3.243830 3
еще один тест с таким же количеством строк, но с гораздо меньшим числомгруппы:
set.seed(0L)
nr <- 1e7
nc <- 5
nid <- 1e2
dat <- data.table(ID=sample(nid, nr, TRUE),
as.data.table(matrix(sample(c(1, NA), nr*nc, TRUE), ncol=nc)),
key="ID")
cols <- paste0("V", 1:5)
DT0 <- copy(dat)
DT3 <- copy(dat)
microbenchmark::microbenchmark(f0(DT0), f3(DT3), f4(DT4), times=3L)
сроки:
Unit: seconds
expr min lq mean median uq max neval
f0(DT0) 2.317905 2.331944 2.358256 2.345983 2.378431 2.410880 3
f3(DT3) 2.108385 2.123889 2.132315 2.139392 2.144280 2.149168 3
f4(DT4) 2.050805 2.079687 2.101211 2.108569 2.126414 2.144258 3