У меня есть очень большой фрейм данных (big), который выглядит так:
> dput(head(big))
structure(list(Tumor_Sample_Barcode = c("LP6005334_DNA_H01",
"LP6005334_DNA_H01", "LP6005334_DNA_H01", "LP6005334_DNA_H01",
"LP6005334_DNA_H01", "LP6005334_DNA_H01"), Chromosome = c("chr1",
"chr1", "chr1", "chr1", "chr1", "chr1"), Start_Position = c(782112,
1026918, 1133283, 1431511, 1742395, 1864994), Reference_Allele = c("G",
"C", "C", "G", "C", "G"), Tumor_Seq_Allele2 = c("A", "T", "T",
"A", "T", "A")), row.names = c("1", "2", "3", "4", "5", "6"), class = "data.frame")
>
> dim(big)
[1] 555437 5
>
Он содержит хромосомные позиции мутаций.
Для тех же образцов у меня есть меньший фрейм данных (small), например:
> dput(head(small))
structure(list(Tumor_Sample_Barcode = c("LP6005334_DNA_H01",
"LP6005334_DNA_H01", "LP6005334_DNA_H01"), Chromosome = c("chr1",
"chr6", "chr16"), Start_Position = c("89616151", "51909815",
"20556547"), End_Position = c("89616151", "51909815", "20556547"
), Reference_Allele = c("-", "A", "-"), Tumor_Seq_Allele2 = c("T",
"-", "G"), Hugo_Symbol = c("GBP7", "PKHD1", "ACSM2B"), Variant_Classification = c("Frame_Shift_Ins",
"Frame_Shift_Del", "Frame_Shift_Ins"), tx = c("NM_207398", "NM_138694",
"NM_001105069"), exon = c("exon6", "exon25", "exon10"), txChange = c("c.732dupA",
"c.2664delT", "c.1212dupC"), aaChange = c("p.L245fs", "p.F888fs",
"p.G405fs"), Variant_Type = c("INS", "DEL", "INS"), Func.refGene = c("exonic",
"exonic", "exonic"), Gene.refGene = c("GBP7", "PKHD1", "ACSM2B"
), GeneDetail.refGene = c(".", ".", "."), ExonicFunc.refGene = c("frameshift insertion",
"frameshift deletion", "frameshift insertion"), AAChange.refGene = c("GBP7:NM_207398:exon6:c.732dupA:p.L245fs",
"PKHD1:NM_138694:exon25:c.2664delT:p.F888fs,PKHD1:NM_170724:exon25:c.2664delT:p.F888fs",
"ACSM2B:NM_001105069:exon10:c.1212dupC:p.G405fs,ACSM2B:NM_182617:exon11:c.1212dupC:p.G405fs"
), Func.knownGene = c("exonic", "exonic", "exonic"), Gene.knownGene = c("GBP7",
"PKHD1", "ACSM2B"), GeneDetail.knownGene = c(".", ".", "."),
ExonicFunc.knownGene = c("frameshift insertion", "frameshift deletion",
"frameshift insertion"), AAChange.knownGene = c("GBP7:uc001dna.2:exon6:c.732dupA:p.L245fs",
"PKHD1:uc003pah.1:exon25:c.2664delT:p.F888fs,PKHD1:uc003pai.3:exon25:c.2664delT:p.F888fs",
"ACSM2B:uc002dhk.4:exon10:c.1212dupC:p.G405fs,ACSM2B:uc010bwf.1:exon10:c.1212dupC:p.G405fs,ACSM2B:uc002dhj.4:exon11:c.1212dupC:p.G405fs"
), avsnp147 = c("rs200922375", ".", "rs750935206"), `1000g2015aug_all` = c("0.00738818",
".", "."), `1000g2015aug_amr` = c("0.0014", ".", "."), `1000g2015aug_sas` = c(".",
".", "."), `1000g2015aug_eur` = c(".", ".", "."), `1000g2015aug_eas` = c(".",
".", "."), esp6500siv2_all = c("0.0082", ".", "."), esp6500siv2_ea = c(".",
".", "."), esp6500siv2_aa = c("0.0242", ".", "."), ExAC_ALL = c("0.0021",
".", "8.254e-06"), ExAC_AFR = c("0.0239", ".", "0"), ExAC_AMR = c("0.0004",
".", "0"), ExAC_EAS = c("0.0001", ".", "0"), ExAC_FIN = c("0",
".", "0"), ExAC_NFE = c("1.499e-05", ".", "1.502e-05"), ExAC_OTH = c("0",
".", "0"), ExAC_SAS = c("0", ".", "0"), cosmic70 = c(".",
".", "."), SIFT_score = c(".", ".", "."), SIFT_converted_rankscore = c(".",
".", "."), SIFT_pred = c(".", ".", "."), Polyphen2_HDIV_score = c(".",
".", "."), Polyphen2_HDIV_rankscore = c(".", ".", "."), Polyphen2_HDIV_pred = c(".",
".", "."), Polyphen2_HVAR_score = c(".", ".", "."), Polyphen2_HVAR_rankscore = c(".",
".", "."), Polyphen2_HVAR_pred = c(".", ".", "."), LRT_score = c(".",
".", "."), LRT_converted_rankscore = c(".", ".", "."), LRT_pred = c(".",
".", "."), MutationTaster_score = c(".", ".", "."), MutationTaster_converted_rankscore = c(".",
".", "."), MutationTaster_pred = c(".", ".", "."), MutationAssessor_score = c(".",
".", "."), MutationAssessor_score_rankscore = c(".", ".",
"."), MutationAssessor_pred = c(".", ".", "."), FATHMM_score = c(".",
".", "."), FATHMM_converted_rankscore = c(".", ".", "."),
FATHMM_pred = c(".", ".", "."), PROVEAN_score = c(".", ".",
"."), PROVEAN_converted_rankscore = c(".", ".", "."), PROVEAN_pred = c(".",
".", "."), VEST3_score = c(".", ".", "."), VEST3_rankscore = c(".",
".", "."), MetaSVM_score = c(".", ".", "."), MetaSVM_rankscore = c(".",
".", "."), MetaSVM_pred = c(".", ".", "."), MetaLR_score = c(".",
".", "."), MetaLR_rankscore = c(".", ".", "."), MetaLR_pred = c(".",
".", "."), `M-CAP_score` = c(".", ".", "."), `M-CAP_rankscore` = c(".",
".", "."), `M-CAP_pred` = c(".", ".", "."), CADD_raw = c(".",
".", "."), CADD_raw_rankscore = c(".", ".", "."), CADD_phred = c(".",
".", "."), DANN_score = c(".", ".", "."), DANN_rankscore = c(".",
".", "."), `fathmm-MKL_coding_score` = c(".", ".", "."),
`fathmm-MKL_coding_rankscore` = c(".", ".", "."), `fathmm-MKL_coding_pred` = c(".",
".", "."), Eigen_coding_or_noncoding = c(".", ".", "."),
`Eigen-raw` = c(".", ".", "."), `Eigen-PC-raw` = c(".", ".",
"."), GenoCanyon_score = c(".", ".", "."), GenoCanyon_score_rankscore = c(".",
".", "."), integrated_fitCons_score = c(".", ".", "."), integrated_fitCons_score_rankscore = c(".",
".", "."), integrated_confidence_value = c(".", ".", "."),
`GERP++_RS` = c(".", ".", "."), `GERP++_RS_rankscore` = c(".",
".", "."), phyloP100way_vertebrate = c(".", ".", "."), phyloP100way_vertebrate_rankscore = c(".",
".", "."), phyloP20way_mammalian = c(".", ".", "."), phyloP20way_mammalian_rankscore = c(".",
".", "."), phastCons100way_vertebrate = c(".", ".", "."),
phastCons100way_vertebrate_rankscore = c(".", ".", "."),
phastCons20way_mammalian = c(".", ".", "."), phastCons20way_mammalian_rankscore = c(".",
".", "."), SiPhy_29way_logOdds = c(".", ".", "."), SiPhy_29way_logOdds_rankscore = c(".",
".", "."), Interpro_domain = c(".", ".", "."), GTEx_V6_gene = c(".",
".", "."), GTEx_V6_tissue = c(".", ".", "."), Interpro_domain = c(".",
".", "."), dbscSNV_ADA_SCORE = c(".", ".", "."), dbscSNV_RF_SCORE = c(".",
".", "."), CLINSIG = c(".", ".", "."), CLNDBN = c(".", ".",
"."), CLNACC = c(".", ".", "."), CLNDSDB = c(".", ".", "."
), CLNDSDBID = c(".", ".", "."), HRC_AF = c(".", ".", "."
), HRC_AC = c(".", ".", "."), HRC_AN = c(".", ".", "."),
HRC_non1000G_AF = c(".", ".", "."), HRC_non1000G_AC = c(".",
".", "."), HRC_non1000G_AN = c(".", ".", "."), Kaviar_AF = c("0.0020051",
".", "1.29e-05"), Kaviar_AC = c("310", ".", "2"), Kaviar_AN = c("154602",
".", "154602"), nci60 = c(".", ".", "."), TumorVAF = c("0.5375",
"0.234043", "0.413043"), NormalVAF = c("0", "0", "0"), cytoBand = c(NA_character_,
NA_character_, NA_character_), phyloP7way_vertebrate = c(NA_character_,
NA_character_, NA_character_), phastCons7way_vertebrate = c(NA_character_,
NA_character_, NA_character_), T_VAF = c(NA_character_, NA_character_,
NA_character_), N_VAF = c(NA_character_, NA_character_, NA_character_
), COSMIC_OESO = c("no", "no", "no"), COSMIC_FULL = c("no",
"no", "no"), RESIST = c("no", "no", "no"), FRANKEL = c("no",
"no", "no"), CANCER_GENES_DAMAGING_ALTERATION = c("no", "no",
"no"), CANCER_HELPER = c("no", "no", "no"), N_vaf = c(NA_character_,
NA_character_, NA_character_)), row.names = c(NA, -3L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x7fd1600094e0>)
>
> dim(small)
[1] 584702 139
>
Я хочу извлечь из меньшего фрейма данных (small) те строки, которые являются общими с большим (big), чтобы в итоге получить фрейм small, в котором остались только записи, присутствующие и в big.
Я пробовал:
merge(small,big)
Error in merge.data.table(small,big) :
x имеет несколько повторяющихся имен столбцов: Interpro_domain. Удалите или переименуйте дубликаты и попробуйте еще раз.
> merge(small, big)
Error in merge.data.table(small, big) :
x has some duplicated column name(s): Interpro_domain. Please remove or rename the duplicate(s) and try again.
Я также пробовал:
> subset(small, big)
Error in subset.data.table(small, big) : 'subset' must evaluate to logical
> intersect(small, big)
data frame with 0 columns and 0 rows
Пожалуйста, не могли бы вы мне помочь?