Итерация по столбцам, выполняющим group_by и присваивание соседей - PullRequest
0 голосов
/ 13 мая 2019

Возможно, это распространенный вопрос, но я не смог найти ответ, способный вписаться в мою проблему.

У меня есть данные со столбцом, в котором для одних строк значение назначено, а для других стоит NA.
Строки с NA могут получить значение, если они принадлежат группе, в которой значение назначено хотя бы одному элементу.
В этих данных есть другие столбцы для группировки, и эти столбцы образуют иерархию: это означает, что мы должны сгруппировать данные по каждому столбцу один раз в определенном порядке. Преимущество в том, что эти групповые столбцы названы числами, от 1 до 0.2.
Я думаю, это своего рода алгоритм обхода в глубину.

structure(list(ID = c("WP_012391491.1/58-334", "WP_045025307.1/57-335", 
"WP_065911868.1/57-334", "WP_094130548.1/57-334", "WP_041093274.1/57-335", 
"WP_087741863.1/58-335", "WP_048735837.1/58-335", "WP_024526760.1/58-335", 
"YP_006375059.1/60-339", "4RK1_A", "WP_081134210.1/58-337", "WP_067481377.1/58-337", 
"WP_023519081.1/58-337", "WP_005918334.1/58-332", "WP_011673819.1/58-332", 
"WP_101874263.1/58-332", "YP_004891129.1/58-332", "WP_021730312.1/58-332", 
"WP_105451130.1/58-332", "WP_105448628.1/58-332", "4RK0_A", "NP_816580.1/58-331", 
"WP_014215863.1/58-333", "WP_014074009.1/57-334", "WP_014939645.1/57-334", 
"WP_057909529.1/57-334", "WP_035168530.1/57-335", "YP_001328142.1/20-321", 
"NP_386672.1/20-321", "NP_437689.1/21-322", "WP_064322056.1/23-326", 
"WP_095444766.1/23-326", "WP_022561933.1/19-321", "WP_060691636.1/22-324", 
"WP_062000852.1/16-317", "4RY9_A", "WP_011809835.1/29-330", "WP_067873970.1/47-338", 
"WP_067195222.1/48-334", "WP_108390182.1/48-337", "WP_088455092.1/48-337", 
"WP_013585042.1/48-336", "WP_094181221.1/110-393", "WP_013118380.1/59-348", 
"WP_078845346.1/48-338", "WP_058920843.1/26-338", "WP_020937919.1/34-340", 
"WP_029381425.1/47-337", "WP_013152136.1/51-349", "WP_079255911.1/49-343", 
"WP_078638163.1/49-341", "WP_064731434.1/47-343", "WP_031033051.1/49-344", 
"WP_095681865.1/49-343", "WP_079255907.1/54-335", "WP_077275989.1/55-331", 
"WP_071455952.1/54-331", "WP_068799469.1/44-336", "WP_108392182.1/49-339", 
"WP_012865347.1/56-345", "WP_014104198.1/48-350", "WP_007397977.1/51-337", 
"WP_087607280.1/49-337", "WP_028700814.1/24-338", "WP_028700389.1/48-340", 
"NP_350138.1/55-335", "3G85_A"), PDB = c(FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
TRUE), ligandId = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, "CL MSE RIB", 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "RIB", NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "GOL TLZ", NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "GOL MSE"), `1` = c(138240, 
50687, 1378, 126023, 237469, 124229, 41453, 217657, 91126, 183150, 
107179, 6274, 262561, 176386, 148744, 74195, 93951, 264634, 95933, 
96095, 183149, 188356, 250907, 252918, 243804, 66122, 207410, 
87437, 181530, 187508, 16752, 81323, 265938, 58828, 13021, 638, 
638, 28566, 4878, 89140, 122445, 155228, 125955, 164812, 120428, 
55374, 257779, 219988, 170251, 116415, 120125, 14694, 208562, 
78746, 116429, 115371, 21301, 27608, 88975, 166418, 252468, 197645, 
123797, 220728, 220653, 182317, 183297), `0.9` = c(115828, 171070, 
47923, 71525, 156529, 98825, 165289, 189247, 31455, 31455, 88210, 
61492, 196793, 131804, 143158, 10327, 10327, 10327, 26521, 26521, 
31058, 31058, 123582, 121348, 186492, 34988, 162176, 25258, 25258, 
138442, 15902, 15902, 196359, 62175, 46251, 147381, 147381, 56135, 
59451, 76614, 102076, 130255, 71453, 114643, 96504, 58676, 197451, 
192775, 114280, 23991, 23991, 53944, 190660, 69970, 97189, 94703, 
43808, 33373, 76685, 111928, 125148, 137147, 98782, 193109, 193143, 
26538, 26538), `0.8` = c(110007, 126908, 41980, 81956, 146463, 
7523, 7523, 7523, 13169, 13169, 13169, 54211, 144263, 30337, 
144486, 68026, 68026, 68026, 69168, 69168, 67041, 67041, 93235, 
93313, 91652, 132429, 139245, 64638, 64638, 62222, 81172, 81172, 
141549, 44105, 47435, 96620, 96620, 54519, 55356, 70400, 84802, 
119491, 82179, 108083, 39098, 131355, 142185, 137368, 106528, 
8338, 8338, 8338, 8338, 83518, 39373, 26484, 28621, 51215, 70429, 
105638, 23426, 23426, 85745, 137611, 137684, 57248, 57248), `0.7` = c(67870, 
102232, 35151, 42345, 92357, 99040, 99040, 99040, 6533, 6533, 
6533, 6533, 6533, 20017, 20017, 47416, 47416, 47416, 48130, 48130, 
82070, 82070, 73319, 73626, 7056, 7056, 7056, 46979, 46979, 81437, 
13586, 13586, 13586, 27032, 26586, 85055, 85055, 13402, 13402, 
15242, 15242, 77617, 42498, 67680, 6951, 6951, 6951, 112867, 
67415, 12983, 12983, 12983, 12983, 12983, 54691, 23945, 25470, 
33887, 49979, 65311, 19335, 19335, 19335, 113208, 113217, 81718, 
81718), `0.6` = c(8366, 8366, 23618, 32642, 70323, 78974, 78974, 
78974, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 
1269, 1269, 1269, 1269, 1269, 56028, 56694, 83944, 83944, 83944, 
11860, 11860, 11860, 15130, 15130, 15130, 15130, 25412, 64954, 
64954, 24654, 24654, 38268, 38268, 59264, 32603, 51690, 10955, 
10955, 10955, 10955, 50901, 18836, 18836, 18836, 18836, 18836, 
42361, 6461, 6461, 28216, 38284, 50584, 61467, 61467, 61467, 
11132, 11132, 62398, 62398), `0.5` = c(568, 568, 568, 568, 568, 
568, 568, 568, 28283, 28283, 28283, 28283, 28283, 28283, 28283, 
28283, 28283, 28283, 28283, 28283, 28283, 28283, 43941, 44445, 
66203, 66203, 66203, 1080, 1080, 1080, 1080, 1080, 1080, 1080, 
1080, 1080, 1080, 3884, 3884, 3884, 3884, 3884, 4264, 4264, 8047, 
8047, 8047, 8047, 8047, 14712, 14712, 14712, 14712, 14712, 15093, 
15671, 15671, 21991, 29850, 39515, 48308, 48308, 48308, 68304, 
68304, 48996, 48996), `0.4` = c(2884, 2884, 2884, 2884, 2884, 
2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 
2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 13941, 
13941), `0.3` = c(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1561, 1561, 1561, 1561, 1561, 
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 
1962, 1962), `0.2` = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), class = c("tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -67L))

или


Чтобы решить эту проблему, я написал функцию и вызывал ее много раз.

Я бы хотел сделать это, не вызывая ее много раз и, если возможно, без необходимости писать функцию.

Мое решение работает, но тратит много времени на обработку.

# Propagate ligand annotations inside the groups defined by one column.
#
# Rows are grouped by `group_col`. In every group that contains at least one
# row with PDB == TRUE, each row whose `ligandId` is NA receives the
# space-separated union of the distinct ligand tokens already present in that
# group. Rows with PDB == TRUE and rows that already carry a `ligandId` are
# returned unchanged. Groups without any PDB row are left as they are.
#
# Args:
#   data:      data frame / tibble with a logical `PDB` column, a character
#              `ligandId` column (tokens separated by single spaces) and the
#              grouping column.
#   group_col: grouping column, given as a bare (possibly back-quoted) name
#              such as `0.9`, or as a string.
#
# Returns:
#   `data`, ungrouped, with `ligandId` filled in where possible.
assingn_lig <- function(data, group_col){
  group_sym <- rlang::ensym(group_col)

  # Union of the distinct ligand tokens of one group, or NA when the group has
  # no ligand at all. Without this guard paste(NULL, collapse = " ") yields "",
  # so un-annotated rows would be "filled" with an empty string instead of
  # staying NA.
  pool_ligands <- function(ids) {
    known <- ids[!is.na(ids)]
    if (length(known) == 0) {
      return(NA_character_)
    }
    paste(unique(unlist(stringr::str_split(known, " "))), collapse = " ")
  }

  # Every call is namespaced, so the function works even when dplyr / stringr
  # are installed but not attached.
  grouped <- dplyr::group_by(data, !!group_sym)
  filled <- dplyr::mutate(
    grouped,
    ligandId = dplyr::case_when(
      # Rows backed by a PDB entry keep whatever they have (even NA).
      PDB == TRUE ~ ligandId,
      # Rows annotated by an earlier pass are never overwritten.
      !is.na(ligandId) ~ ligandId,
      # Remaining NA rows inherit the pooled ligands of a PDB-containing group.
      any(PDB == TRUE) & is.na(ligandId) ~ pool_ligands(ligandId),
      TRUE ~ NA_character_
    )
  )
  dplyr::ungroup(filled)
}

# Run the ligand propagation once per grouping column, in the fixed order
# `1`, `0.9`, ..., `0.2`. Each pass receives the table produced by the previous
# pass, starting from `tmp2`. The column name is injected with `!!` so that
# assingn_lig() sees the column itself rather than the loop variable.
Reduce(
  function(tbl, level) assingn_lig(tbl, group_col = !!level),
  c("1", "0.9", "0.8", "0.7", "0.6", "0.5", "0.4", "0.3", "0.2"),
  tmp2
)

Итак, кто-нибудь знает лучшую стратегию?

Это вывод:

# A tibble: 67 x 12
   ID                    PDB   ligandId      `1`  `0.9`  `0.8`  `0.7` `0.6` `0.5` `0.4` `0.3` `0.2`
   <chr>                 <lgl> <chr>       <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 WP_012391491.1/58-334 FALSE CL MSE RIB 138240 115828 110007  67870  8366   568  2884     9     1
 2 WP_045025307.1/57-335 FALSE CL MSE RIB  50687 171070 126908 102232  8366   568  2884     9     1
 3 WP_065911868.1/57-334 FALSE CL MSE RIB   1378  47923  41980  35151 23618   568  2884     9     1
 4 WP_094130548.1/57-334 FALSE CL MSE RIB 126023  71525  81956  42345 32642   568  2884     9     1
 5 WP_041093274.1/57-335 FALSE CL MSE RIB 237469 156529 146463  92357 70323   568  2884     9     1
 6 WP_087741863.1/58-335 FALSE CL MSE RIB 124229  98825   7523  99040 78974   568  2884     9     1
 7 WP_048735837.1/58-335 FALSE CL MSE RIB  41453 165289   7523  99040 78974   568  2884     9     1
 8 WP_024526760.1/58-335 FALSE CL MSE RIB 217657 189247   7523  99040 78974   568  2884     9     1
 9 YP_006375059.1/60-339 FALSE CL MSE RIB  91126  31455  13169   6533  1269 28283  2884     9     1
10 4RK1_A                TRUE  CL MSE RIB 183150  31455  13169   6533  1269 28283  2884     9     1
# … with 57 more rows
> dput(tmp1)
structure(list(ID = c("WP_012391491.1/58-334", "WP_045025307.1/57-335", 
"WP_065911868.1/57-334", "WP_094130548.1/57-334", "WP_041093274.1/57-335", 
"WP_087741863.1/58-335", "WP_048735837.1/58-335", "WP_024526760.1/58-335", 
"YP_006375059.1/60-339", "4RK1_A", "WP_081134210.1/58-337", "WP_067481377.1/58-337", 
"WP_023519081.1/58-337", "WP_005918334.1/58-332", "WP_011673819.1/58-332", 
"WP_101874263.1/58-332", "YP_004891129.1/58-332", "WP_021730312.1/58-332", 
"WP_105451130.1/58-332", "WP_105448628.1/58-332", "4RK0_A", "NP_816580.1/58-331", 
"WP_014215863.1/58-333", "WP_014074009.1/57-334", "WP_014939645.1/57-334", 
"WP_057909529.1/57-334", "WP_035168530.1/57-335", "YP_001328142.1/20-321", 
"NP_386672.1/20-321", "NP_437689.1/21-322", "WP_064322056.1/23-326", 
"WP_095444766.1/23-326", "WP_022561933.1/19-321", "WP_060691636.1/22-324", 
"WP_062000852.1/16-317", "4RY9_A", "WP_011809835.1/29-330", "WP_067873970.1/47-338", 
"WP_067195222.1/48-334", "WP_108390182.1/48-337", "WP_088455092.1/48-337", 
"WP_013585042.1/48-336", "WP_094181221.1/110-393", "WP_013118380.1/59-348", 
"WP_078845346.1/48-338", "WP_058920843.1/26-338", "WP_020937919.1/34-340", 
"WP_029381425.1/47-337", "WP_013152136.1/51-349", "WP_079255911.1/49-343", 
"WP_078638163.1/49-341", "WP_064731434.1/47-343", "WP_031033051.1/49-344", 
"WP_095681865.1/49-343", "WP_079255907.1/54-335", "WP_077275989.1/55-331", 
"WP_071455952.1/54-331", "WP_068799469.1/44-336", "WP_108392182.1/49-339", 
"WP_012865347.1/56-345", "WP_014104198.1/48-350", "WP_007397977.1/51-337", 
"WP_087607280.1/49-337", "WP_028700814.1/24-338", "WP_028700389.1/48-340", 
"NP_350138.1/55-335", "3G85_A"), PDB = c(FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
TRUE), ligandId = c("CL MSE RIB", "CL MSE RIB", "CL MSE RIB", 
"CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB", 
"CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB", 
"CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB", 
"CL MSE RIB", "CL MSE RIB", "RIB", "RIB", "CL MSE RIB", "CL MSE RIB", 
"CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "GOL TLZ", "GOL TLZ", 
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", 
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", 
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", 
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", 
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", 
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", 
"GOL MSE", "GOL MSE"), `1` = c(138240, 50687, 1378, 126023, 237469, 
124229, 41453, 217657, 91126, 183150, 107179, 6274, 262561, 176386, 
148744, 74195, 93951, 264634, 95933, 96095, 183149, 188356, 250907, 
252918, 243804, 66122, 207410, 87437, 181530, 187508, 16752, 
81323, 265938, 58828, 13021, 638, 638, 28566, 4878, 89140, 122445, 
155228, 125955, 164812, 120428, 55374, 257779, 219988, 170251, 
116415, 120125, 14694, 208562, 78746, 116429, 115371, 21301, 
27608, 88975, 166418, 252468, 197645, 123797, 220728, 220653, 
182317, 183297), `0.9` = c(115828, 171070, 47923, 71525, 156529, 
98825, 165289, 189247, 31455, 31455, 88210, 61492, 196793, 131804, 
143158, 10327, 10327, 10327, 26521, 26521, 31058, 31058, 123582, 
121348, 186492, 34988, 162176, 25258, 25258, 138442, 15902, 15902, 
196359, 62175, 46251, 147381, 147381, 56135, 59451, 76614, 102076, 
130255, 71453, 114643, 96504, 58676, 197451, 192775, 114280, 
23991, 23991, 53944, 190660, 69970, 97189, 94703, 43808, 33373, 
76685, 111928, 125148, 137147, 98782, 193109, 193143, 26538, 
26538), `0.8` = c(110007, 126908, 41980, 81956, 146463, 7523, 
7523, 7523, 13169, 13169, 13169, 54211, 144263, 30337, 144486, 
68026, 68026, 68026, 69168, 69168, 67041, 67041, 93235, 93313, 
91652, 132429, 139245, 64638, 64638, 62222, 81172, 81172, 141549, 
44105, 47435, 96620, 96620, 54519, 55356, 70400, 84802, 119491, 
82179, 108083, 39098, 131355, 142185, 137368, 106528, 8338, 8338, 
8338, 8338, 83518, 39373, 26484, 28621, 51215, 70429, 105638, 
23426, 23426, 85745, 137611, 137684, 57248, 57248), `0.7` = c(67870, 
102232, 35151, 42345, 92357, 99040, 99040, 99040, 6533, 6533, 
6533, 6533, 6533, 20017, 20017, 47416, 47416, 47416, 48130, 48130, 
82070, 82070, 73319, 73626, 7056, 7056, 7056, 46979, 46979, 81437, 
13586, 13586, 13586, 27032, 26586, 85055, 85055, 13402, 13402, 
15242, 15242, 77617, 42498, 67680, 6951, 6951, 6951, 112867, 
67415, 12983, 12983, 12983, 12983, 12983, 54691, 23945, 25470, 
33887, 49979, 65311, 19335, 19335, 19335, 113208, 113217, 81718, 
81718), `0.6` = c(8366, 8366, 23618, 32642, 70323, 78974, 78974, 
78974, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 
1269, 1269, 1269, 1269, 1269, 56028, 56694, 83944, 83944, 83944, 
11860, 11860, 11860, 15130, 15130, 15130, 15130, 25412, 64954, 
64954, 24654, 24654, 38268, 38268, 59264, 32603, 51690, 10955, 
10955, 10955, 10955, 50901, 18836, 18836, 18836, 18836, 18836, 
42361, 6461, 6461, 28216, 38284, 50584, 61467, 61467, 61467, 
11132, 11132, 62398, 62398), `0.5` = c(568, 568, 568, 568, 568, 
568, 568, 568, 28283, 28283, 28283, 28283, 28283, 28283, 28283, 
28283, 28283, 28283, 28283, 28283, 28283, 28283, 43941, 44445, 
66203, 66203, 66203, 1080, 1080, 1080, 1080, 1080, 1080, 1080, 
1080, 1080, 1080, 3884, 3884, 3884, 3884, 3884, 4264, 4264, 8047, 
8047, 8047, 8047, 8047, 14712, 14712, 14712, 14712, 14712, 15093, 
15671, 15671, 21991, 29850, 39515, 48308, 48308, 48308, 68304, 
68304, 48996, 48996), `0.4` = c(2884, 2884, 2884, 2884, 2884, 
2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 
2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 13941, 
13941), `0.3` = c(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1561, 1561, 1561, 1561, 1561, 
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 
1962, 1962), `0.2` = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), class = c("tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -67L))

Любая помощь приветствуется. Заранее спасибо.

...