У меня есть таблица данных MutationsNumberTable_Neighbours
с номерами мутаций для каждого гена (переменная HGNC
) в разных раках.Другая таблица Neighbors_table
предназначена для сопоставления гена и его ближайших соседей по хромосоме.
MutationsNumberTable_Neighbours = merge(MutationsNumberTable_Neighbours, Neighbors_table, by = "Gene")
MutationsNumberTable_Neighbours[1:3,]
HGNC ACC BLCA BRCA CESC CHOL COAD/READ_MSI COAD/READ_POLE COAD/READ_REST DLBC ESCA GBM HNSC KICH KIRC KIRP LGG LIHC LUAD_HEAVY LUAD_REST LUSC_HEAVY LUSC_REST MESO OV PAAD PCPG PRAD SARC SKCM_HIGH SKCM_LOW STAD TGCT THCA THYM UCEC_MSI
1: A1BG 1 5 2 1 0 1 5 2 0 0 2 7 0 1 0 2 3 4 3 1 3 0 0 3 0 2 0 2 2 7 0 0 0 6
2: A1CF 1 5 4 4 1 3 8 0 0 0 4 13 0 2 0 4 5 6 5 5 16 0 1 2 0 6 2 58 15 11 0 0 0 0
3: A2M 0 20 9 9 0 7 23 9 1 6 9 12 0 6 3 5 6 18 5 7 22 0 0 2 1 4 6 47 16 40 1 1 0 2
UCEC_POLE UCEC_REST UCS UVM_HIGH UVM_LOW
1: 14 1 0 0 0
2: 41 0 1 0 0
3: 72 6 1 0 0
Neighbors
1: ZNF134,ZNF606,SLC27A5,ZNF419,ZSCAN18,ZNF814,C19orf18,RPS5,ZNF548,ZNF135,ZNF324,ZBTB45,ZNF17,ZNF530,ZNF211,ZNF551,ZNF324B,ZNF552,ZNF8,ZNF549,ZNF547,ZNF304,ZSCAN4,ZNF544,ZNF772,ZNF671,ZNF329,ZSCAN22,UBE2M,ZNF417,TRIM28,ZNF418,ZSCAN1,ZNF587,CHMP2A,MZF1,ZNF586,ZNF274,ZNF132,ZNF550,ZNF587B,ZNF497,ZNF749,ZIK1,ZNF446,VN1R1,ZNF776,ZNF154,ZNF256,ZNF773,ZNF586,ZNF416,ZNF584,ZNF837,TRAPPC2P1
2: ASAH2,SGMS1,CSTF2T,AGAP6,PRKG1,NCOA4,FAM21A,PRKG1,TIMM23,ASAH2B,MSMB
3: KLRB1,CLEC1A,NECAP1,ZNF705A,AICDA,A2ML1,CLEC4A,PHC1,FAM90A1,MFAP5,PZP,RIMKLB,KLRG1,CLEC4E,CLEC1B,CLEC4D,CLEC9A,CD69,CLEC6A,CLECL1,KLRF1,CLEC2D,CLEC12B,CLEC12A,CLEC2B,CLEC2A,M6PR
dput(head(MutationsNumberTable_Neighbours))
structure(list(HGNC = c("A1BG", "A1CF", "A2M", "A2ML1", "A3GALT2",
"A4GALT"), ACC = c(1L, 1L, 0L, 2L, 0L, 0L), BLCA = c(5L, 5L,
20L, 19L, 1L, 2L), BRCA = c(2L, 4L, 9L, 14L, 0L, 1L), CESC = c(1L,
4L, 9L, 10L, 1L, 5L), CHOL = c(0L, 1L, 0L, 0L, 0L, 0L), `COAD/READ_MSI` = c(1L,
3L, 7L, 10L, 3L, 3L), `COAD/READ_POLE` = c(5L, 8L, 23L, 34L,
3L, 4L), `COAD/READ_REST` = c(2L, 0L, 9L, 9L, 0L, 2L), DLBC = c(0L,
0L, 1L, 1L, 0L, 0L), ESCA = c(0L, 0L, 6L, 2L, 1L, 2L), GBM = c(2L,
4L, 9L, 4L, 2L, 4L), HNSC = c(7L, 13L, 12L, 8L, 0L, 1L), KICH = c(0L,
0L, 0L, 0L, 0L, 0L), KIRC = c(1L, 2L, 6L, 3L, 0L, 0L), KIRP = c(0L,
0L, 3L, 1L, 0L, 0L), LGG = c(2L, 4L, 5L, 3L, 0L, 0L), LIHC = c(3L,
5L, 6L, 7L, 0L, 1L), LUAD_HEAVY = c(4L, 6L, 18L, 17L, 1L, 0L),
LUAD_REST = c(3L, 5L, 5L, 15L, 0L, 0L), LUSC_HEAVY = c(1L,
5L, 7L, 6L, 0L, 0L), LUSC_REST = c(3L, 16L, 22L, 21L, 0L,
1L), MESO = c(0L, 0L, 0L, 1L, 0L, 0L), OV = c(0L, 1L, 0L,
1L, 0L, 0L), PAAD = c(3L, 2L, 2L, 5L, 2L, 0L), PCPG = c(0L,
0L, 1L, 0L, 0L, 0L), PRAD = c(2L, 6L, 4L, 1L, 0L, 4L), SARC = c(0L,
2L, 6L, 0L, 0L, 2L), SKCM_HIGH = c(2L, 58L, 47L, 51L, 2L,
17L), SKCM_LOW = c(2L, 15L, 16L, 23L, 0L, 5L), STAD = c(7L,
11L, 40L, 26L, 3L, 3L), TGCT = c(0L, 0L, 1L, 0L, 0L, 0L),
THCA = c(0L, 0L, 1L, 0L, 0L, 0L), THYM = c(0L, 0L, 0L, 0L,
0L, 1L), UCEC_MSI = c(6L, 0L, 2L, 1L, 0L, 2L), UCEC_POLE = c(14L,
41L, 72L, 73L, 4L, 27L), UCEC_REST = c(1L, 0L, 6L, 9L, 0L,
1L), UCS = c(0L, 1L, 1L, 2L, 0L, 0L), UVM_HIGH = c(0L, 0L,
0L, 0L, 0L, 0L), UVM_LOW = c(0L, 0L, 0L, 0L, 0L, 0L), Neighbors = c("ZNF134,ZNF606,SLC27A5,ZNF419,ZSCAN18,ZNF814,C19orf18,RPS5,ZNF548,ZNF135,ZNF324,ZBTB45,ZNF17,ZNF530,ZNF211,ZNF551,ZNF324B,ZNF552,ZNF8,ZNF549,ZNF547,ZNF304,ZSCAN4,ZNF544,ZNF772,ZNF671,ZNF329,ZSCAN22,UBE2M,ZNF417,TRIM28,ZNF418,ZSCAN1,ZNF587,CHMP2A,MZF1,ZNF586,ZNF274,ZNF132,ZNF550,ZNF587B,ZNF497,ZNF749,ZIK1,ZNF446,VN1R1,ZNF776,ZNF154,ZNF256,ZNF773,ZNF586,ZNF416,ZNF584,ZNF837,TRAPPC2P1",
"ASAH2,SGMS1,CSTF2T,AGAP6,PRKG1,NCOA4,FAM21A,PRKG1,TIMM23,ASAH2B,MSMB",
"KLRB1,CLEC1A,NECAP1,ZNF705A,AICDA,A2ML1,CLEC4A,PHC1,FAM90A1,MFAP5,PZP,RIMKLB,KLRG1,CLEC4E,CLEC1B,CLEC4D,CLEC9A,CD69,CLEC6A,CLECL1,KLRF1,CLEC2D,CLEC12B,CLEC12A,CLEC2B,CLEC2A,M6PR",
"KLRB1,NECAP1,ZNF705A,AICDA,CLEC4A,PHC1,FAM90A1,C3AR1,MFAP5,PZP,RIMKLB,A2M,KLRG1,CLEC4E,SLC2A3,CLEC4D,SLC2A14,FOXJ2,CD69,CLEC6A,CLECL1,KLRF1,CLEC2D,CLEC2B,M6PR",
"PHC2,HDAC1,KIAA1522,ZSCAN20,RBBP4,HMGB4,C1orf94,ADC,S100PBP,ZNF362,BSDC1,AK2,YARS,TMEM54,ZBTB8A,RNF19B,TSSK3,TRIM62,SYNC,HPCA,MARCKSL1,ZBTB8B,ZBTB8OS,FNDC5",
"PACSIN2,C22orf46,MEI1,NFAM1,SCUBE1,EFCAB6,FAM109B,TCF20,SREBF2,SERHL2,NAGA,TTLL1,TSPO,POLDIP3,TNFRSF13C,BIK,TTLL12,CYP2D6,MCAT,WBP2NL,RRP7A,CYB5R3,MPPED1,ARFGAP3,SEPT3,CCDC134,NDUFA6,CENPM,SMDT1,ATP5L2"
)), class = c("data.table", "data.frame"), row.names = c(NA,
-6L), .internal.selfref = <pointer: 0x16d2358>, sorted = "HGNC", .Names = c("HGNC",
"ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD/READ_MSI", "COAD/READ_POLE",
"COAD/READ_REST", "DLBC", "ESCA", "GBM", "HNSC", "KICH", "KIRC",
"KIRP", "LGG", "LIHC", "LUAD_HEAVY", "LUAD_REST", "LUSC_HEAVY",
"LUSC_REST", "MESO", "OV", "PAAD", "PCPG", "PRAD", "SARC", "SKCM_HIGH",
"SKCM_LOW", "STAD", "TGCT", "THCA", "THYM", "UCEC_MSI", "UCEC_POLE",
"UCEC_REST", "UCS", "UVM_HIGH", "UVM_LOW", "Neighbors"))
Я хотел бы получить новую таблицу MutationsNumberTable_Neighbours_summarized
, где у меня будет количество мутаций для каждойген, включая его соседей.Мне удалось создать простой код:
cancers = colnames(MutationsNumberTable_Neighbours) %>% setdiff(c("HGNC", "Neighbors"))
SummarizeMutationsNeighbours = function(gene){
Neighbours = c(gene, unlist(strsplit(MutationsNumberTable_Neighbours$Neighbors[MutationsNumberTable_Neighbours$HGNC == gene], ",")))
MutTable_geneNeighb = MutationsNumberTable_Neighbours[MutationsNumberTable_Neighbours$HGNC %in% Neighbours, ..cancers]
MutTable_geneNeighb = plyr::numcolwise(sum)(MutTable_geneNeighb)
MutTable_geneNeighb$Gene = gene
return((MutTable_geneNeighb))
}
library(pbapply)
MutationsNumberTable_Neighbours_summarized = pblapply(MutationsNumberTable_Neighbours$HGNC, SummarizeMutationsNeighbours)
MutationsNumberTable_Neighbours_summarized = rbindlist(MutationsNumberTable_Neighbours_summarized)
Я считаю, что это не оптимальное решение (по моим данным, это займет 15 часов).Однако подход dplyr
, который я придумал, ничего не дает:
MutationsNumberTable_Neighbours_summ = MutationsNumberTable_Neighbours %>% group_by(HGNC) %>% mutate(N = paste(HGNC, Neighbors, sep=","))
MutationsNumberTable_Neighbours_summ = MutationsNumberTable_Neighbours_summ %>% filter(HGNC %in% unlist(strsplit(N, ","))) %>% summarise_at(cancers, sum)