Подмножество особого случая в R - PullRequest
0 голосов
/ 22 мая 2018

У меня есть набор данных, который содержит несколько числовых столбцов

x = 
a b c d
1 2 3 2
2 3 5 1
1 3 4 6
. . . .
. . . .

У меня также есть список, в котором для каждой переменной заданы правила в виде диапазонов значений

y = 
y[[1]]
y[[1]][[1]]
1
2
y[[1]][[2]]
5
6
y[[2]]
y[[2]][[1]]
2
3
y[[3]]
y[[3]][[1]]
5
6
y[[3]][[2]]
8
9
y[[3]][[3]]
10
11
y[[4]]    
y[[4]][[1]]
12
15

Я хочу создать все возможные кадры данных, по одному на каждую комбинацию диапазонов переменных, например:

dataframe1 = variable 1 is from 1:2,
variable 2 is from 2:3, 
variable 3 is from 5:6 and 
variable 4 is from 12:15 is one data frame

dataframe2 = variable 1 is from 5:6, 
variable 2 is from 2:3, 
variable 3 is from 5:6 and 
variable 4 is from 12:15 is one data frame 

и т. д. *

выборка данных

structure(list(c(2441.321994, 2441.295962, 2441.269929, 2441.243897, 
2441.217865, 2441.191832, 2441.1658, 2441.139767, 2441.113735, 
2441.087702, 2441.06167, 2441.035637, 2441.009605, 2440.983572, 
2440.95754, 2440.931507, 2440.905475, 2440.879443, 2440.85341, 
2440.827378), c(998.6875404, 998.6989679, 998.7103955, 998.721823, 
998.7332505, 998.744678, 998.7561055, 998.767533, 998.7789605, 
998.7903881, 998.8018156, 998.8132431, 998.8246706, 998.8360981, 
998.8475256, 998.8589531, 998.8703806, 998.8818082, 998.8932357, 
998.9046632), c(4272184.479, 4272207.767, 4272231.055, 4272254.344, 
4272277.632, 4272300.92, 4272324.209, 4272347.497, 4272370.786, 
4272394.074, 4272417.362, 4272440.651, 4272463.939, 4272487.227, 
4272510.516, 4272533.804, 4272557.093, 4272580.381, 4272603.669, 
4272626.958), c(560.8931785, 562.3330729, 563.0583984, 562.3908637, 
563.3387891, 562.7498197, 561.9703322, 561.9606988, 563.5061892, 
563.2322049, 562.7148307, 561.6984375, 561.3954731, 560.8977865, 
560.8382398, 562.8975675, 563.9937645, 563.540068, 562.5878996, 
561.2869919)), row.names = c(NA, 20L), class = "data.frame")

выборка списка

list(list(c(2400, 2430), c(2435, 2530), c(2600,2700)), list(c(900, 950), c(960,1000)), list(c(4272184,4272417)), list(c(560,561), c(562:563)))

В этом примере желаемый вывод — 12 кадров данных, где:

DF1 = var1 between 2400 2430, var2 between 900 950, var3 between 4272184 4272417, var4 between 560 561
DF2 = var1 between 2435 2530, var2 between 900 950, var3 between 4272184 4272417, var4 between 560 561
DF3 = var1 between 2600 2700, var2 between 900 950, var3 between 4272184 4272417, var4 between 560 561
DF4 = var1 between 2400 2430, var2 between 960 1000, var3 between 4272184 4272417, var4 between 560 561
DF5 = var1 between 2435 2530, var2 between 960 1000, var3 between 4272184 4272417, var4 between 560 561
DF6 = var1 between 2600 2700, var2 between 960 1000, var3 between 4272184 4272417, var4 between 560 561
DF7 = var1 between 2400 2430, var2 between 900 950, var3 between 4272184 4272417, var4 between 562 563
DF8 = var1 between 2435 2530, var2 between 900 950, var3 between 4272184 4272417, var4 between 562 563
DF9 = var1 between 2600 2700, var2 between 900 950, var3 between 4272184 4272417, var4 between 562 563
DF10 = var1 between 2400 2430, var2 between 960 1000, var3 between 4272184 4272417, var4 between 562 563
DF11 = var1 between 2435 2530, var2 between 960 1000, var3 between 4272184 4272417, var4 between 562 563
DF12 = var1 between 2600 2700, var2 between 960 1000, var3 between 4272184 4272417, var4 between 562 563

1 Ответ

0 голосов
/ 22 мая 2018

Возможное решение с пакетом data.table:

# Attach data.table; rowid(), dcast(), setDT() and the join syntax used below
# all come from this package.
library(data.table)

# Build the reference table 'ref' in explicit steps rather than one chain.
# Step 1: flatten each element of 'l' into a plain vector, one column per
# element (default column names V1, V2).
ref <- as.data.table(lapply(l, unlist))
# Step 2: number the repeated V2 values; that counter is used as the group id.
ref[, grp := rowid(V2)]
# Step 3: reshape to one row per group with columns V1_1, V1_2, V2_1, V2_2.
ref <- dcast(ref, grp ~ rowid(grp), value.var = c("V1", "V2"))

# Turn 'df' into a data.table by reference.
setDT(df)

# Non-equi join: for every row of 'df', look up the group whose bounds
# strictly enclose var1 and var2; rows without a match end up with NA.
df[, grp := ref[.SD,
                on = .(V1_1 < var1, V1_2 > var1, V2_1 < var2, V2_2 > var2),
                grp]]

# Keep only the rows that received a group.
df2 <- df[!is.na(grp)]

# One data.table per group, returned as a list named by group id.
split(df2, df2$grp)

что даёт:

$`1`
        var1     var2    var3     var4 grp
 1: 2421.322 948.6875 4272184 560.8932   1
 2: 2421.270 948.7104 4272231 563.0584   1
 3: 2421.218 948.7333 4272278 563.3388   1
 4: 2421.166 948.7561 4272324 561.9703   1
 5: 2421.114 948.7790 4272371 563.5062   1
 6: 2421.062 948.8018 4272417 562.7148   1
 7: 2421.010 948.8247 4272464 561.3955   1
 8: 2420.958 948.8475 4272511 560.8382   1
 9: 2420.905 948.8704 4272557 563.9938   1
10: 2420.853 948.8932 4272604 562.5879   1

$`2`
        var1     var2    var3     var4 grp
 1: 2441.322 998.6875 4272184 560.8932   2
 2: 2441.296 998.6990 4272208 562.3331   2
 3: 2441.270 998.7104 4272231 563.0584   2
 4: 2441.244 998.7218 4272254 562.3909   2
 5: 2441.218 998.7333 4272278 563.3388   2
 6: 2441.192 998.7447 4272301 562.7498   2
 7: 2441.166 998.7561 4272324 561.9703   2
 8: 2441.140 998.7675 4272347 561.9607   2
 9: 2441.114 998.7790 4272371 563.5062   2
10: 2441.088 998.7904 4272394 563.2322   2
11: 2441.062 998.8018 4272417 562.7148   2
12: 2441.036 998.8132 4272441 561.6984   2
13: 2441.010 998.8247 4272464 561.3955   2
14: 2440.984 998.8361 4272487 560.8978   2
15: 2440.958 998.8475 4272511 560.8382   2
16: 2440.932 998.8590 4272534 562.8976   2
17: 2440.905 998.8704 4272557 563.9938   2
18: 2440.879 998.8818 4272580 563.5401   2
19: 2440.853 998.8932 4272604 562.5879   2
20: 2440.827 998.9047 4272627 561.2870   2

С помощью функции list2env вы можете разбить этот список и поместить таблицы данных в глобальную среду, если хотите.


Используемые данные:

df <- structure(list(var1 = c(2421.321994, 2421.295962, 2421.269929, 2421.243897, 2421.217865, 2421.191832, 2421.1658, 2421.139767, 2421.113735, 2421.087702, 2421.06167, 2421.035637, 2421.009605, 2420.983572, 2420.95754, 2420.931507, 2420.905475, 2420.879443, 2420.85341, 2420.827378, 2441.321994, 2441.295962, 2441.269929, 2441.243897, 2441.217865, 2441.191832, 2441.1658, 2441.139767, 2441.113735, 2441.087702, 2441.06167, 2441.035637, 2441.009605, 2440.983572, 2440.95754, 2440.931507, 2440.905475, 2440.879443, 2440.85341, 2440.827378, 2461.321994, 2461.295962, 2461.269929, 2461.243897, 2461.217865, 2461.191832, 2461.1658, 2461.139767, 2461.113735, 2461.087702, 2461.06167, 2461.035637, 2461.009605, 2460.983572, 2460.95754, 2460.931507, 2460.905475, 2460.879443, 2460.85341, 2460.827378),
                     var2 = c(948.6875404, 898.6989679, 948.7103955, 898.721823, 948.7332505, 898.744678, 948.7561055, 898.767533, 948.7789605, 898.7903881, 948.8018156, 898.8132431, 948.8246706, 898.8360981, 948.8475256, 898.8589531, 948.8703806, 898.8818082, 948.8932357, 898.9046632, 998.6875404, 998.6989679, 998.7103955, 998.721823, 998.7332505, 998.744678, 998.7561055, 998.767533, 998.7789605, 998.7903881, 998.8018156, 998.8132431, 998.8246706, 998.8360981, 998.8475256, 998.8589531, 998.8703806, 998.8818082, 998.8932357, 998.9046632, 1048.6875404, 1098.6989679, 1048.7103955, 1098.721823, 1048.7332505, 1098.744678, 1048.7561055, 1098.767533, 1048.7789605, 1098.7903881, 1048.8018156, 1098.8132431, 1048.8246706, 1098.8360981, 1048.8475256, 1098.8589531, 1048.8703806, 1098.8818082, 1048.8932357, 1098.9046632),
                     var3 = c(4272184.479, 4272207.767, 4272231.055, 4272254.344, 4272277.632, 4272300.92, 4272324.209, 4272347.497, 4272370.786, 4272394.074, 4272417.362, 4272440.651, 4272463.939, 4272487.227, 4272510.516, 4272533.804, 4272557.093, 4272580.381, 4272603.669, 4272626.958, 4272184.479, 4272207.767, 4272231.055, 4272254.344, 4272277.632, 4272300.92, 4272324.209, 4272347.497, 4272370.786, 4272394.074, 4272417.362, 4272440.651, 4272463.939, 4272487.227, 4272510.516, 4272533.804, 4272557.093, 4272580.381, 4272603.669, 4272626.958, 4272184.479, 4272207.767, 4272231.055, 4272254.344, 4272277.632, 4272300.92, 4272324.209, 4272347.497, 4272370.786, 4272394.074, 4272417.362, 4272440.651, 4272463.939, 4272487.227, 4272510.516, 4272533.804, 4272557.093, 4272580.381, 4272603.669, 4272626.958),
                     var4 = c(560.8931785, 562.3330729, 563.0583984, 562.3908637, 563.3387891, 562.7498197, 561.9703322, 561.9606988, 563.5061892, 563.2322049, 562.7148307, 561.6984375, 561.3954731, 560.8977865, 560.8382398, 562.8975675, 563.9937645, 563.540068, 562.5878996, 561.2869919, 560.8931785, 562.3330729, 563.0583984, 562.3908637, 563.3387891, 562.7498197, 561.9703322, 561.9606988, 563.5061892, 563.2322049, 562.7148307, 561.6984375, 561.3954731, 560.8977865, 560.8382398, 562.8975675, 563.9937645, 563.540068, 562.5878996, 561.2869919, 560.8931785, 562.3330729, 563.0583984, 562.3908637, 563.3387891, 562.7498197, 561.9703322, 561.9606988, 563.5061892, 563.2322049, 562.7148307, 561.6984375, 561.3954731, 560.8977865, 560.8382398, 562.8975675, 563.9937645, 563.540068, 562.5878996, 561.2869919)),
                .Names = c("var1", "var2", "var3", "var4"), row.names = c(NA, -60L), class = "data.frame")

# Range rules used by the answer. The first element holds two c(lower, upper)
# pairs and the second element a single pair; the solution code matches them
# against var1 and var2 respectively. Only two variables are covered here.
l <- list(list(c(2400, 2430), c(2435, 2530)), c(900, 1000))

df был создан из исходной выборки данных автора вопроса следующим образом:

# Name the columns of the question's sample data var1, var2, ...
names(df) <- paste0("var", seq_along(df))
# Stack three copies of the data: one shifted down, the original, and one
# shifted up. Offsets are given per column; the two-value offset for the
# second column is recycled down the rows.
df <- local({
  offsets <- list(20, c(50, 100), 0, 0)
  rbind(df - offsets, df, df + offsets)
})
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...