Прежде всего ваши данные не аккуратны.
Шаг 1:
Избавьтесь от нескольких типов данных в столбце и переместите «раздел» в новый столбец.
require(tidyverse)
require(zoo)
dfsection <- df %>% tbl_df() %>%
mutate(section = if_else(grepl("[A-Z]",position),
as.character(position),
as.character(NA))) %>%
mutate(section = na.locf(section))
newdf <- dfsection %>%
filter(!grepl("[A-Z]",position))%>%
mutate_if(is.factor,as.character) %>%
mutate(newcol = as.numeric(newcol))
newdf
дает
# A tibble: 14 x 4
position col newcol section
<chr> <chr> <dbl> <chr>
1 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 MGSLSNYALLQLTLT 1.87 TP<AMT55
2 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 GSLSNYALLQLTLTA 1.73 TP<AMT55
3 3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 SLSNYALLQLTLTAF 2.73 TP<AMT55
4 4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 LSNYALLQLTLTAFL 3.88 TP<AMT55
5 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 MVKETTYYDVLGVKP 3.19 TP<AMB88
6 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 VKETTYYDVLGVKPN 3.19 TP<AMB88
7 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 MTLGRRLACLFLACV 2.97 TP<ELANE
8 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 TLGRRLACLFLACVL 1.73 TP<ELANE
9 3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 LGRRLACLFLACVLP 2.73 TP<ELANE
10 4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 GRRLACLFLACVLPA 1.79 TP<ELANE
11 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 MTEQMTLRGTLKGHN 1.19 TP<RACK1
12 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 TEQMTLRGTLKGHNG 3.14 TP<RACK1
13 3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 EQMTLRGTLKGHNGW 1.32 TP<RACK1
14 4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 QMTLRGTLKGHNGWV 3.32 TP<RACK1
Шаг 2:
Создайте функцию для нормализации данных к точке, где каждая позиция и буква является строкой в dataframe
splitstring <- function(pos,letters){
posvec <- as.numeric(str_split(pos,",")[[1]])
data.frame(pos = posvec,
letter = str_sub(letters,seq_along(posvec),seq_along(posvec)),
stringsAsFactors = F)
}
#example
splitstring(newdf[2,'position'],newdf[2,'col'])
pos letter
1 2 G
2 3 S
3 4 L
4 5 S
5 6 N
6 7 Y
7 8 A
8 9 L
9 10 L
10 11 Q
11 12 L
12 13 T
13 14 L
14 15 T
15 16 A
шаг 3:
Примените эту функцию splitstring
к вашему dataframe
.
normdf <- newdf %>%
mutate(newdat = map2(position,col,splitstring)) %>%
select(section,newdat,newcol) %>%
unnest()
normdf
дает:
# A tibble: 210 x 4
section newcol pos letter
<chr> <dbl> <dbl> <chr>
1 TP<AMT55 1.87 1 M
2 TP<AMT55 1.87 2 G
3 TP<AMT55 1.87 3 S
4 TP<AMT55 1.87 4 L
5 TP<AMT55 1.87 5 S
6 TP<AMT55 1.87 6 N
7 TP<AMT55 1.87 7 Y
8 TP<AMT55 1.87 8 A
9 TP<AMT55 1.87 9 L
10 TP<AMT55 1.87 10 L
# ... with 200 more rows
шаг 4:
Наконец, отфильтруйте буквы, связанные со значением newcol
<2. Найдите отличительные <code>letter и position
комбинаций на секцию. Объедините в одну строку.
result <- normdf %>%
filter(newcol < 2) %>%
select(-newcol) %>%
distinct() %>%
group_by(section) %>%
summarise(Newposition = paste(pos, collapse = ","),
newcol2 = paste(letter, collapse = ""))
result
дает
# A tibble: 3 x 3
section Newposition newcol2
<chr> <chr> <chr>
1 TP<AMT55 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 MGSLSNYALLQLTLTA
2 TP<ELANE 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 TLGRRLACLFLACVLPA
3 TP<RACK1 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 MTEQMTLRGTLKGHNGW
Шаг 5:
Ссылка на неопрятные данные
dfsection %>% mutate(link = str_sub(position,1,1)) %>%
left_join(result %>% mutate(link = str_sub(Newposition,1,1)) %>%
select(link,section,Newposition,newcol2), by = c('section','link')) %>%
select(-link,-section) %>%
mutate_at(vars(Newposition,newcol2),coalesce,'') %>%
as.data.frame
дает:
position col newcol Newposition
1 TP<AMT55 TP<AMT55 TP<AMT55
2 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 MGSLSNYALLQLTLT 1.866247897 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
3 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 GSLSNYALLQLTLTA 1.732914564
4 3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 SLSNYALLQLTLTAF 2.732914564
5 4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 LSNYALLQLTLTAFL 3.87645
6 TP<AMB88 TP<AMB88 TP<AMB88
7 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 MVKETTYYDVLGVKP 3.189898095
8 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 VKETTYYDVLGVKPN 3.189898095
9 TP<ELANE TP<ELANE TP<ELANE
10 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 MTLGRRLACLFLACV 2.973557262
11 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 TLGRRLACLFLACVL 1.732914564 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
12 3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 LGRRLACLFLACVLP 2.732914564
13 4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 GRRLACLFLACVLPA 1.789898095
14 TP<RACK1 TP<RACK1 TP<RACK1
15 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 MTEQMTLRGTLKGHN 1.189898095 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
16 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 TEQMTLRGTLKGHNG 3.139572262
17 3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 EQMTLRGTLKGHNGW 1.323231429
18 4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 QMTLRGTLKGHNGWV 3.323231429
newcol2
1
2 MGSLSNYALLQLTLTA
3
4
5
6
7
8
9
10
11 TLGRRLACLFLACVLPA
12
13
14
15 MTEQMTLRGTLKGHNGW
16
17
18