Как извлечь строку символов в r - PullRequest
1 голос
/ 13 февраля 2020

Я хотел бы извлечь название семейства (таксономический уровень «семейство») и числовой идентификатор (ID) для каждого таксона в данных секвенирования. Для последующего анализа нужны пакеты picante и phyloseq, но для этого вопроса они не понадобятся. В приведённом ниже примере я хотел бы:

  1. Сократить значения V1 с «000000001|size:678798» и «000000009|size:1497» до «1» и «9»
  2. Извлечь «Endozoicimonaceae» и «Synechococcaceae» в новый столбец
    # Reproducible example: a two-row data.frame `a` with four columns.
    #  * V1 - factor; the labels have the form "<zero-padded id>|size:<n>".
    #  * V2 - factor; the labels are semicolon-separated taxonomy strings
    #         ("k__...; p__...; ...") or "Unassigned".
    #  * V3 - numeric, V4 - integer.
    # Both factors keep their complete label sets although only two labels each
    # are used (V1 codes 1:2, V2 codes 28 and 11).
    # NOTE(review): many V2 labels are string literals that continue on the
    # next physical line, so those labels contain an embedded newline plus the
    # indentation of the continuation line - keep this in mind when matching.
    a <- structure(list(V1 = structure(1:2, .Label = 
    c("000000001|size:678798", "000000009|size:1497", 
    "000000019|size:66182", "000000020|size:4043", "000000025|size:549", 
    "000000030|size:1689", "000000035|size:655", "000000036|size:718", 
    "000000041|size:431", "000000047|size:3312", "000000054|size:584", 
    "000000065|size:376", "000000069|size:2341", "000000085|size:771", 
    "000000091|size:296", "000000095|size:462", "000000107|size:378", 
    "000000108|size:612", "000000116|size:319", "000000121|size:1355", 
    "000000126|size:256", "000000270|size:10342", "000000274|size:293", 
    "000000299|size:154434", "000000301|size:35432", 
    "000000305|size:440", 
    "000000311|size:285", "000000342|size:760", "000000344|size:474", 
    "000000365|size:450", "000000368|size:4805", "000000369|size:337", 
    "000000398|size:1217", "000000399|size:7457", "000000401|size:1426", 
    "000000418|size:724", "000000419|size:1201", "000000423|size:1173", 
    "000000445|size:252", "000000487|size:1142", "000000488|size:5676", 
    "000000509|size:6883", "000000545|size:345", "000000546|size:419", 
    "000000586|size:518", "000000606|size:448", "000000607|size:3070", 
    "000000610|size:894", "000000624|size:52798", 
    "000000625|size:19114", 
    "000000628|size:1808", "000000630|size:20151", 
    "000000668|size:2021", 
    "000000669|size:1193", "000000680|size:1287", "000000683|size:305", 
    "000000685|size:265", "000000694|size:6939", "000000695|size:333", 
    "000000697|size:634", "000000716|size:24392", "000000719|size:760", 
    "000000760|size:357", "000000762|size:661", "000000788|size:837", 
    "000000789|size:368", "000000792|size:1075", 
    "000000793|size:2143"), class = "factor"), V2 = structure(c(28L, 
    11L), .Label = c("k__Bacteria; p__Actinobacteria; c__Actinobacteria; 
    o__Actinomycetales; f__Corynebacteriaceae; g__Corynebacterium; s__", 
    "k__Bacteria; p__Actinobacteria; c__Actinobacteria; 
    o__Actinomycetales; f__Micrococcaceae; g__Micrococcus; s__", 
    "k__Bacteria; p__Bacteroidetes; c__Cytophagia; o__Cytophagales; 
    f__[Amoebophilaceae]; g__SGUS912; s__", "k__Bacteria; 
    p__Bacteroidetes; c__Flavobacteriia; o__Flavobacteriales; 
    f__Flavobacteriaceae; g__; s__", "k__Bacteria; p__Cyanobacteria; 
    c__Chloroplast; o__; f__; g__; s__", "k__Bacteria; p__Cyanobacteria; 
    c__Chloroplast; o__CAB-I; f__; g__; s__", "k__Bacteria; 
    p__Cyanobacteria; c__Chloroplast; o__Chlorophyta; f__Ulvophyceae; 
    g__; 
    s__", "k__Bacteria; p__Cyanobacteria; c__Chloroplast; 
    o__Stramenopiles; f__; g__; s__", "k__Bacteria; p__Cyanobacteria; 
    c__Nostocophycideae; o__Stigonematales; f__Rivulariaceae; 
    g__Rivularia; s__", "k__Bacteria; p__Cyanobacteria; 
    c__Synechococcophycideae; o__Pseudanabaenales; f__Pseudanabaenaceae; 
    g__; s__", "k__Bacteria; p__Cyanobacteria; c__Synechococcophycideae; 
    o__Synechococcales; f__Synechococcaceae; g__Synechococcus; s__", 
    "k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; 
    f__Bacillaceae", "k__Bacteria; p__Firmicutes; c__Bacilli; 
    o__Bacillales; f__Bacillaceae; g__Bacillus; s__", "k__Bacteria; 
    p__Firmicutes; c__Bacilli; o__Bacillales; f__Bacillaceae; 
    g__Bacillus; 
    s__firmus", "k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; 
    f__Bacillaceae; g__Geobacillus; s__", "k__Bacteria; p__Firmicutes; 
    c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus; 
    s__", "k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; 
    f__Lactobacillaceae; g__Lactobacillus; s__iners", "k__Bacteria; 
    p__Proteobacteria; c__Alphaproteobacteria; o__Caulobacterales; 
    f__Caulobacteraceae; g__; s__", "k__Bacteria; p__Proteobacteria; 
    c__Alphaproteobacteria; o__Caulobacterales; f__Caulobacteraceae; 
    g__Brevundimonas; s__diminuta", "k__Bacteria; p__Proteobacteria; 
    c__Alphaproteobacteria; o__Rhizobiales; f__Rhizobiaceae; 
    g__Agrobacterium; s__", "k__Bacteria; p__Proteobacteria; 
    c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae; 
    g__; 
    s__", "k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; 
    o__Rhodobacterales; f__Rhodobacteraceae; g__Paracoccus; s__", 
    "k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; 
    o__Rhodospirillales; f__Acetobacteraceae; g__; s__", "k__Bacteria; 
    p__Proteobacteria; c__Betaproteobacteria; o__Neisseriales; 
    f__Neisseriaceae; g__; s__", "k__Bacteria; p__Proteobacteria; 
    c__Deltaproteobacteria; o__Myxococcales; f__; g__; s__", 
    "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; 
    o__Alteromonadales; f__Alteromonadaceae; g__Alteromonas; s__", 
    "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; 
    o__Alteromonadales; f__Alteromonadaceae; g__Spongiibacter; s__", 
    "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; 
    o__Oceanospirillales; f__Endozoicimonaceae; g__; s__", 
    "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; 
    o__Pseudomonadales; f__Moraxellaceae; g__Acinetobacter", 
    "k__Bacteria; 
    p__Proteobacteria; c__Gammaproteobacteria; o__Pseudomonadales; 
    f__Moraxellaceae; g__Acinetobacter; s__", "k__Bacteria; 
    p__Proteobacteria; c__Gammaproteobacteria; o__Pseudomonadales; 
    f__Moraxellaceae; g__Acinetobacter; s__rhizosphaerae", "k__Bacteria; 
    p__Proteobacteria; c__Gammaproteobacteria; o__Pseudomonadales; 
    f__Moraxellaceae; g__Enhydrobacter; s__", "k__Bacteria; 
    p__Proteobacteria; c__Gammaproteobacteria; o__Pseudomonadales; 
    f__Pseudomonadaceae; g__; s__", "k__Bacteria; p__Proteobacteria; 
    c__Gammaproteobacteria; o__Vibrionales; f__Pseudoalteromonadaceae; 
    g__Pseudoalteromonas; s__", "k__Bacteria; p__Proteobacteria; 
    c__Gammaproteobacteria; o__Xanthomonadales; f__Xanthomonadaceae; 
    g__Luteimonas; s__", "k__Bacteria; p__Verrucomicrobia; 
    c__Verrucomicrobiae; o__Verrucomicrobiales; f__Verrucomicrobiaceae; 
    g__Rubritalea; s__", "Unassigned"), class = "factor"), V3 = c(1, 1), 
    V4 = c(3L, 3L)), row.names = 1:2, class = "data.frame")

Ответы [ 2 ]

1 голос
/ 13 февраля 2020

Можно использовать parse_number, чтобы извлечь первое числовое значение из 'V1', и str_extract, чтобы извлечь нужные подстроки/слова из 'V2'. Если они подчиняются шаблону, достаточно задать регулярное выражение с ретроспективной проверкой (lookbehind), которое находит слово (\\w+), стоящее сразу после f__

library(stringr)
library(dplyr)

# Add a numeric ID and the family name to the example data frame `a`.
#  * V1: parse_number() keeps the first number in labels such as
#    "000000001|size:678798", so the zero padding and "|size:..." are dropped.
#  * new: the word following "f__" in the taxonomy string. The optional "\\[?"
#    inside the lookbehind also accepts bracketed families such as
#    "f__[Amoebophilaceae]" (present in the levels of V2), which the plain
#    "(?<=f__)\\w+" pattern turned into NA. Rows without a family
#    ("f__;", "Unassigned") still yield NA.
# Both columns are factors, so they are converted to character explicitly.
a %>%
  mutate(
    V1 = readr::parse_number(as.character(V1)),
    new = str_extract(as.character(V2), "(?<=f__\\[?)\\w+")
  )
# V1
#1  1
#2  9
                                                                                                                             #V2
#1            k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; \n    o__Oceanospirillales; f__Endozoicimonaceae; g__; s__
#2 k__Bacteria; p__Cyanobacteria; c__Synechococcophycideae; \n    o__Synechococcales; f__Synechococcaceae; g__Synechococcus; s__
#  V3 V4               new
#1  1  3 Endozoicimonaceae
#2  1  3  Synechococcaceae
1 голос
/ 13 февраля 2020

Делает ли это то, что вы хотите?

  1. strsplit

      # strsplit() alone only yields list elements such as
      # c("000000001", "size:678798"); keep the part before the literal "|"
      # and convert it to integer so the zero padding is dropped
      # ("000000001" -> 1, "000000009" -> 9).
      id_parts <- strsplit(as.character(a$V1), "\\|")
      a$V1 <- as.integer(vapply(id_parts, `[`, character(1), 1L))
    
  2. grep

    # Take the family from the "f__<Family>" field of every taxonomy string
    # rather than hard-coding one grep() call per family name.
    taxonomy <- as.character(a$V2)
    # "\\K" discards the already matched "f__" (and an optional "[" as in
    # "f__[Amoebophilaceae]") so that only the family name itself is returned.
    family_hit <- regexpr("f__\\[?\\K\\w+", taxonomy, perl = TRUE)
    # Rows without a family name ("f__;", "Unassigned") keep NA.
    a$Bacteria <- NA_character_
    # which() drops NA positions, matching what regmatches() returns.
    has_family <- which(family_hit > 0)
    a$Bacteria[has_family] <- regmatches(taxonomy, family_hit)
    
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...