library(tidyverse)
# Example taxonomy table parsed from inline, pipe-delimited text: one row per
# taxon and one column per rank (Kingdom, Phylum, Class, Order). A rank that
# could not be resolved holds a value ending in "unclassified".
# The data lines inside the string literal are kept flush-left so that the
# literal contains exactly the text shown.
dat <- read_delim(
  "Kingdom | Phylum | Class | Order
Bacteria | Firmicutes | Negativicutes | Selenomonadales
Bacteria | Bact_unclassified | Bact_unclassified | Bact_unclassified
Bacteria | Firmicutes | Negativicutes | Negativ_unclassified
Archaea | Euryarchaeota | Methanobacteria | Methanobacteriales
Archaea | Euryarchaeota | Eury_unclassified | Eury_unclassified
",
  delim = "|",
  trim_ws = TRUE
)

# Lookup vector keyed by rank name: the smaller the number, the earlier the
# rank is considered when sorting by this value (Order first, Kingdom last).
priority <- c(Order = 1L, Class = 2L, Phylum = 3L, Kingdom = 4L)
# For every row of `dat`, keep the first rank (in `priority` order) whose value
# does not end in "unclassified", and label it with the lower-case initial of
# its rank name, e.g. "o_Selenomonadales".
# A row in which every rank is unclassified yields no output row.
dat %>%
  mutate(id = row_number()) %>%
  # Long format: one row per (id, rank) pair.
  pivot_longer(-id, names_to = "variable", values_to = "value") %>%
  # `.env$` states explicitly that the lookup vector is meant, not a column.
  mutate(priority = .env$priority[variable]) %>%
  arrange(id, priority) %>%
  # Drop unresolved ranks, then take the best remaining rank per original row.
  filter(!grepl("unclassified$", value)) %>%
  group_by(id) %>%
  slice(1) %>%
  ungroup() %>%
  mutate(Output = paste(tolower(substr(variable, 1, 1)), value, sep = "_"))
# # A tibble: 5 x 5
#      id variable value              priority Output
#   <int> <chr>    <chr>                 <int> <chr>
# 1     1 Order    Selenomonadales           1 o_Selenomonadales
# 2     2 Kingdom  Bacteria                  4 k_Bacteria
# 3     3 Class    Negativicutes             2 c_Negativicutes
# 4     4 Order    Methanobacteriales        1 o_Methanobacteriales
# 5     5 Phylum   Euryarchaeota             3 p_Euryarchaeota
Другой способ — с использованием `coalesce()`:
# Wide-format alternative: prefix every value with the lower-case initial of
# its column name, blank out values ending in "unclassified", then let
# coalesce() pick the first non-missing rank in the order
# Order, Class, Phylum, Kingdom.
dat %>%
  mutate(across(
    everything(),
    ~ if_else(
      grepl("unclassified$", .x),
      # Typed NA keeps every column character, which coalesce() relies on.
      NA_character_,
      paste(tolower(substr(cur_column(), 1, 1)), .x, sep = "_")
    )
  )) %>%
  mutate(Output = coalesce(Order, Class, Phylum, Kingdom))
# # A tibble: 5 x 5
# Kingdom Phylum Class Order Output
# <chr> <chr> <chr> <chr> <chr>
# 1 k_Bacteria p_Firmicutes c_Negativicutes o_Selenomonadales o_Selenomonadales
# 2 k_Bacteria NA NA NA k_Bacteria
# 3 k_Bacteria p_Firmicutes c_Negativicutes NA c_Negativicutes
# 4 k_Archaea p_Euryarchaeota c_Methanobacteria o_Methanobacteriales o_Methanobacteriales
# 5 k_Archaea p_Euryarchaeota NA NA p_Euryarchaeota