Если я правильно понял вопрос, можно попробовать подход на основе пакета data.table:
library(data.table)

# Work on data.table copies of the sample data frames.
DT1 <- as.data.table(df1)
DT2 <- as.data.table(df2)

# Label every DT1 row that lies entirely inside a DT2 range with that range's
# "start-end" string (column 'pos_range'). The chromosome is part of the join
# condition (col1 == V1), so a position on one chromosome can never be matched
# to a range that belongs to another chromosome. Rows without a match keep
# pos_range = NA.
DT1[DT2, pos_range := paste(V2, V3, sep = "-"),
    on = .(col1 == V1, col2 >= V2, col3 <= V3)]

# Give each unmatched row its own group id (its row number, as a string) so
# that it is reported as a separate row in the summary. as.character() keeps
# the column type character even when no row matched at all.
DT1[, pos_range := ifelse(is.na(pos_range), as.character(.I), pos_range)]

# Summarise: one output row per (chromosome, range) group.
# first()/last() rely on the rows being ordered by position within each
# group, as they are in the sample data.
DT <- DT1[,
  .(
    start_pos = first(col2),
    end_pos = last(col3),
    value_sum = sum(col4)
  ),
  by = .(col1, pos_range)
][, .(col1, start_pos, end_pos, value_sum)]
Вывод:
> DT
col1 start_pos end_pos value_sum
1: chr19 45770502 45770503 5.263158
2: chr19 45770513 45770542 29.757566
3: chr19 47430080 47430081 0.000000
4: chr19 47430099 47430100 0.000000
...
Обновление: если вам нужны только перекрывающиеся строки, достаточно отбросить строки, у которых в столбце pos_range таблицы DT1 стоит NA:
library(data.table)

# Work on data.table copies of the sample data frames.
DT1 <- as.data.table(df1)
DT2 <- as.data.table(df2)

# Label every DT1 row that lies entirely inside a DT2 range with that range's
# "start-end" string. The chromosome is part of the join condition
# (col1 == V1), so ranges are only matched within the same chromosome.
# Rows without a match keep pos_range = NA.
DT1[DT2, pos_range := paste(V2, V3, sep = "-"),
    on = .(col1 == V1, col2 >= V2, col3 <= V3)]

# Keep only the rows that overlap a range and summarise them, one output row
# per (chromosome, range) group.
# first()/last() rely on the rows being ordered by position within each
# group, as they are in the sample data.
DT <- DT1[
  !is.na(pos_range),
  .(
    start_pos = first(col2),
    end_pos = last(col3),
    value_sum = sum(col4)
  ),
  by = .(col1, pos_range)
][, .(col1, start_pos, end_pos, value_sum)]
DT
#    col1 start_pos  end_pos value_sum
#1: chr19  45770513 45770542  29.75757
Пример данных:
# Sample input table, 25 rows:
#   col1 - chromosome name (23 x "chr19", then 2 x "chr7")
#   col2 - integer position used as the start of each row's interval
#   col3 - integer position used as the end of each row's interval
#   col4 - numeric value attached to the row
df1 <- data.frame(
  col1 = c(rep("chr19", 23L), rep("chr7", 2L)),
  col2 = c(
    45770502L, 45770513L, 45770516L, 45770526L, 45770538L,
    45770539L, 45770541L, 47430080L, 47430099L, 47430113L,
    47430127L, 47430164L, 47430166L, 47430175L, 47430187L,
    47430189L, 47430191L, 47430196L, 47430205L, 47430208L,
    47430211L, 47430222L, 47430228L, 23904987L, 23904990L
  ),
  col3 = c(
    45770503L, 45770514L, 45770517L, 45770527L, 45770539L,
    45770540L, 45770542L, 47430081L, 47430100L, 47430114L,
    47430128L, 47430165L, 47430167L, 47430176L, 47430188L,
    47430190L, 47430192L, 47430197L, 47430206L, 47430209L,
    47430212L, 47430223L, 47430229L, 23904988L, 23904991L
  ),
  col4 = c(
    5.26315789473684, 3.17460317460317, 6.56063618290259,
    7.3558648111332, 5.81162324649299, 0, 6.85483870967742,
    rep(0, 18L)
  ),
  stringsAsFactors = FALSE
)
# Sample range table, 7 rows and 12 columns (V1..V12):
#   V1 - chromosome name, V2 / V3 - integer start / end of the range,
#   V4 - record identifier, V6 - "+" / "-" flag.
# NOTE(review): the layout looks like a 12-column BED-style annotation
# export - confirm against the data source.
df2 <- data.frame(
  V1 = rep("chr19", 7L),
  V2 = c(
    45770509L, 45770921L, 45770981L, 45770995L,
    45771012L, 45771532L, 45774947L
  ),
  V3 = c(
    45777447L, 45772712L, 45772504L, 45772504L,
    45772504L, 45775268L, 45777037L
  ),
  V4 = c(
    "uc061acd.1", "uc061ace.1", "uc061acf.1", "uc061acg.1",
    "uc061ach.1", "uc061aci.1", "uc061acj.1"
  ),
  V5 = rep(0L, 7L),
  V6 = c("-", "-", "+", "+", "+", "-", "-"),
  V7 = c(
    45770509L, 45771157L, 45770981L, 45770995L,
    45771012L, 45771532L, 45774947L
  ),
  V8 = c(
    45777447L, 45772712L, 45770981L, 45770995L,
    45771012L, 45771532L, 45774947L
  ),
  V9 = rep(0L, 7L),
  V10 = c(5L, 4L, 3L, 3L, 3L, 4L, 2L),
  V11 = c(
    "131,98,112,86,121,", "475,98,158,72,", "98,186,199,",
    "84,95,199,", "67,86,199,", "133,158,112,320,", "87,379,"
  ),
  V12 = c(
    "0,1058,2131,4439,6817,", "0,646,849,1719,", "0,508,1324,",
    "0,594,1310,", "0,577,1293,", "0,238,1108,3416,", "0,1711,"
  ),
  stringsAsFactors = FALSE
)