У меня есть датафрейм:
structure(list(PS_position = c(1000L, 1050L, 1100L, 2000L, 3000L,
4000L, 5000L, 6000L, 7000L, 8000L, 8500L, 9000L, 9050L, 9001L,
9051L, 9002L, 9052L, 9003L, 9053L, 9004L, 9054L, 9001L, 9051L,
9002L, 9052L, 9003L, 9053L, 10000L, 9000L, 9200L, 9340L, 9350L,
9550L, 9560L), Region_ID = c("BRCA", "BRCA", "BRCA", "BRCA",
"BRCA", "BRCA", "BRCA", "BRCA", "BRCA", "BRCA", "BRCA", "BRCA",
"BRCA", "BRCA", "BRCA", "BRCA", "BRCA", "BRCA", "BRCA", "BRCA",
"BRCA", "BRCA", "BRCA", "BRCA", "BRCA", "BRCA", "BRCA", "BRCA",
"WNT", "WNT", "WNT", "WNT", "WNT", "WNT"), SNP_ID = c(123455L,
123456L, 123457L, 123458L, 123459L, 123460L, 123461L, 123462L,
123463L, 123464L, 123465L, 123466L, 123467L, 123468L, 123469L,
123470L, 123471L, 123472L, 123473L, 123474L, 123475L, 123476L,
123477L, 123478L, 123479L, 123480L, 123481L, 123482L, 56789L,
56790L, 56795L, 56796L, 56816L, 56817L), chr_key = c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), probe_type = c("preselected", "preselected", "preselected",
"preselected", "preselected", "preselected", "preselected", "preselected",
"validated", "validated", "validated", "validated", "validated",
"validated", "validated", "validated", "unvalidated ", "unvalidated ",
"unvalidated ", "unvalidated ", "unvalidated ", "unvalidated ",
"unvalidated ", "unvalidated ", "non_polymorphic", "non_polymorphic",
"non_polymorphic", "non_polymorphic", "preselected", "preselected",
"preselected", "unvalidated ", "non_polymorphic", "validated"
)), class = "data.frame", row.names = c(NA, -34L))
Я хотел бы подсчитать количество значений в столбце PS_position для каждого региона.
Обычно я делал что-то вроде этого:
# Build a per-region summary of probeset counts from `selection`
# (a data frame with at least the columns Region_ID and probe_type).

# Scaffold: one row per distinct region.
out_df <- data.frame(
  Region_ID = unique(selection$Region_ID),
  stringsAsFactors = FALSE
)

# Total number of rows (probesets) per region.
x <- selection %>%
  group_by(Region_ID) %>%
  summarise(num_probesets = n())
out_df <- merge(out_df, x, by = "Region_ID")

# Number of rows per region and probe type. This reads from `selection`,
# the same input as the total above, so both summaries describe the same data.
y <- selection %>%
  group_by(Region_ID, probe_type) %>%
  summarise(num_probesets = n()) %>%
  ungroup()

# One column per probe type (sorted by name); region/type combinations
# that have no rows are filled with NA.
y_wide <- pivot_wider(
  y,
  names_from = probe_type,
  values_from = num_probesets,
  names_sort = TRUE
)
out_df <- merge(out_df, y_wide, by = "Region_ID")
Это дает следующий результат:
structure(list(Region_ID = c("BRCA", "WNT"), num_probesets = c(28L,
6L)), row.names = c(NA, -2L), class = "data.frame")
Однако group_by по Region_ID не учитывает тот факт, что некоторые значения PS_position могут попадать сразу в оба региона.
Поэтому я хочу подсчитать для каждого региона значения PS_position, попадающие в диапазон >= start и <= stop, заданный в следующем датафрейме:
structure(list(region_name = c("BRCA", "WNT"), gene_symbol = c("BRCA",
"BRCA"), chr = c(1L, 1L), start = c(1000L, 9000L), stop = c(10000L,
10000L), resolution = c("fixed", "fixed"), genome_build = c("hg38",
"hg38"), Notes = c(NA, NA), overlap = c("no_overlap", "no_overlap"
), length_of_region = c(9000L, 1000L), length_of_region_plus_flanks = c(10000L,
2000L), region_probesets = c(50L, 50L), flank_start_left = c(499L,
8499L), flank_stop_left = c(999L, 8999L), flank_start_right = c(10001L,
10001L), flank_stop_right = c(10501L, 10501L), flank_length_left = c(500L,
500L), flank_length_right = c(500L, 500L), flank_probes_right = c(10L,
10L), flank_probes_left = c(10L, 10L), total_expected_probesets = c(70L,
70L), critical_error = c(FALSE, FALSE), flank_name_left = c("BRCA_left",
"WNT_left"), flank_name_right = c("BRCA_right", "WNT_right"),
countrgn = c(50L, 50L)), class = "data.frame", row.names = c(NA,
-2L))
Следовательно, мой окончательный результат должен выглядеть следующим образом:
structure(list(Region_ID = c("BRCA", "WNT"), num_probesets = c(34L,
23L)), row.names = c(NA, -2L), class = "data.frame")
Есть идеи, как это сделать?