Возможно, это распространенный вопрос, но я не смог найти ответ, подходящий для моей задачи.
У меня есть данные, в которых есть столбец, в котором некоторые строки назначены, а другие - NA.
Значение NA может быть заполнено, если строка принадлежит группе, в которой назначен хотя бы один элемент.
Эти данные имеют другие столбцы для группировки, и эти столбцы образуют иерархию: это означает, что мы должны сгруппировать по каждому столбцу один раз в определенном порядке. Преимущество в том, что эти групповые столбцы являются числовыми, начиная с 1 и до 0.2.
Я думаю, это своего рода алгоритм обхода в глубину.
structure(list(ID = c("WP_012391491.1/58-334", "WP_045025307.1/57-335",
"WP_065911868.1/57-334", "WP_094130548.1/57-334", "WP_041093274.1/57-335",
"WP_087741863.1/58-335", "WP_048735837.1/58-335", "WP_024526760.1/58-335",
"YP_006375059.1/60-339", "4RK1_A", "WP_081134210.1/58-337", "WP_067481377.1/58-337",
"WP_023519081.1/58-337", "WP_005918334.1/58-332", "WP_011673819.1/58-332",
"WP_101874263.1/58-332", "YP_004891129.1/58-332", "WP_021730312.1/58-332",
"WP_105451130.1/58-332", "WP_105448628.1/58-332", "4RK0_A", "NP_816580.1/58-331",
"WP_014215863.1/58-333", "WP_014074009.1/57-334", "WP_014939645.1/57-334",
"WP_057909529.1/57-334", "WP_035168530.1/57-335", "YP_001328142.1/20-321",
"NP_386672.1/20-321", "NP_437689.1/21-322", "WP_064322056.1/23-326",
"WP_095444766.1/23-326", "WP_022561933.1/19-321", "WP_060691636.1/22-324",
"WP_062000852.1/16-317", "4RY9_A", "WP_011809835.1/29-330", "WP_067873970.1/47-338",
"WP_067195222.1/48-334", "WP_108390182.1/48-337", "WP_088455092.1/48-337",
"WP_013585042.1/48-336", "WP_094181221.1/110-393", "WP_013118380.1/59-348",
"WP_078845346.1/48-338", "WP_058920843.1/26-338", "WP_020937919.1/34-340",
"WP_029381425.1/47-337", "WP_013152136.1/51-349", "WP_079255911.1/49-343",
"WP_078638163.1/49-341", "WP_064731434.1/47-343", "WP_031033051.1/49-344",
"WP_095681865.1/49-343", "WP_079255907.1/54-335", "WP_077275989.1/55-331",
"WP_071455952.1/54-331", "WP_068799469.1/44-336", "WP_108392182.1/49-339",
"WP_012865347.1/56-345", "WP_014104198.1/48-350", "WP_007397977.1/51-337",
"WP_087607280.1/49-337", "WP_028700814.1/24-338", "WP_028700389.1/48-340",
"NP_350138.1/55-335", "3G85_A"), PDB = c(FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
TRUE), ligandId = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, "CL MSE RIB",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "RIB", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "GOL TLZ", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "GOL MSE"), `1` = c(138240,
50687, 1378, 126023, 237469, 124229, 41453, 217657, 91126, 183150,
107179, 6274, 262561, 176386, 148744, 74195, 93951, 264634, 95933,
96095, 183149, 188356, 250907, 252918, 243804, 66122, 207410,
87437, 181530, 187508, 16752, 81323, 265938, 58828, 13021, 638,
638, 28566, 4878, 89140, 122445, 155228, 125955, 164812, 120428,
55374, 257779, 219988, 170251, 116415, 120125, 14694, 208562,
78746, 116429, 115371, 21301, 27608, 88975, 166418, 252468, 197645,
123797, 220728, 220653, 182317, 183297), `0.9` = c(115828, 171070,
47923, 71525, 156529, 98825, 165289, 189247, 31455, 31455, 88210,
61492, 196793, 131804, 143158, 10327, 10327, 10327, 26521, 26521,
31058, 31058, 123582, 121348, 186492, 34988, 162176, 25258, 25258,
138442, 15902, 15902, 196359, 62175, 46251, 147381, 147381, 56135,
59451, 76614, 102076, 130255, 71453, 114643, 96504, 58676, 197451,
192775, 114280, 23991, 23991, 53944, 190660, 69970, 97189, 94703,
43808, 33373, 76685, 111928, 125148, 137147, 98782, 193109, 193143,
26538, 26538), `0.8` = c(110007, 126908, 41980, 81956, 146463,
7523, 7523, 7523, 13169, 13169, 13169, 54211, 144263, 30337,
144486, 68026, 68026, 68026, 69168, 69168, 67041, 67041, 93235,
93313, 91652, 132429, 139245, 64638, 64638, 62222, 81172, 81172,
141549, 44105, 47435, 96620, 96620, 54519, 55356, 70400, 84802,
119491, 82179, 108083, 39098, 131355, 142185, 137368, 106528,
8338, 8338, 8338, 8338, 83518, 39373, 26484, 28621, 51215, 70429,
105638, 23426, 23426, 85745, 137611, 137684, 57248, 57248), `0.7` = c(67870,
102232, 35151, 42345, 92357, 99040, 99040, 99040, 6533, 6533,
6533, 6533, 6533, 20017, 20017, 47416, 47416, 47416, 48130, 48130,
82070, 82070, 73319, 73626, 7056, 7056, 7056, 46979, 46979, 81437,
13586, 13586, 13586, 27032, 26586, 85055, 85055, 13402, 13402,
15242, 15242, 77617, 42498, 67680, 6951, 6951, 6951, 112867,
67415, 12983, 12983, 12983, 12983, 12983, 54691, 23945, 25470,
33887, 49979, 65311, 19335, 19335, 19335, 113208, 113217, 81718,
81718), `0.6` = c(8366, 8366, 23618, 32642, 70323, 78974, 78974,
78974, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 1269,
1269, 1269, 1269, 1269, 1269, 56028, 56694, 83944, 83944, 83944,
11860, 11860, 11860, 15130, 15130, 15130, 15130, 25412, 64954,
64954, 24654, 24654, 38268, 38268, 59264, 32603, 51690, 10955,
10955, 10955, 10955, 50901, 18836, 18836, 18836, 18836, 18836,
42361, 6461, 6461, 28216, 38284, 50584, 61467, 61467, 61467,
11132, 11132, 62398, 62398), `0.5` = c(568, 568, 568, 568, 568,
568, 568, 568, 28283, 28283, 28283, 28283, 28283, 28283, 28283,
28283, 28283, 28283, 28283, 28283, 28283, 28283, 43941, 44445,
66203, 66203, 66203, 1080, 1080, 1080, 1080, 1080, 1080, 1080,
1080, 1080, 1080, 3884, 3884, 3884, 3884, 3884, 4264, 4264, 8047,
8047, 8047, 8047, 8047, 14712, 14712, 14712, 14712, 14712, 15093,
15671, 15671, 21991, 29850, 39515, 48308, 48308, 48308, 68304,
68304, 48996, 48996), `0.4` = c(2884, 2884, 2884, 2884, 2884,
2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884,
2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884,
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809,
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809,
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 13941,
13941), `0.3` = c(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1561, 1561, 1561, 1561, 1561,
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561,
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561,
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561,
1962, 1962), `0.2` = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -67L))
или
Чтобы решить эту проблему, я написал функцию и вызывал ее много раз.
Я хотел бы сделать это, не вызывая ее много раз и, если возможно, без необходимости писать функцию.
Мое решение работает, но тратит много времени на обработку.
#' Fill missing `ligandId` values within groups that contain a PDB row.
#'
#' Rows are grouped by `group_col`. Inside each group, a row keeps its current
#' `ligandId` when it is a PDB row (`PDB == TRUE`) or already has a non-NA
#' `ligandId`. A remaining NA row receives the space-joined set of unique
#' ligand tokens found in its group, but only when the group contains at least
#' one PDB row. All other rows stay NA.
#'
#' @param data A data frame with a logical `PDB` column and a character
#'   `ligandId` column holding space-separated tokens.
#' @param group_col Grouping column, given as a bare (possibly back-quoted)
#'   name or as a string; captured with `rlang::ensym()`.
#' @return `data`, ungrouped, with `ligandId` filled in as described above.
assingn_lig <- function(data, group_col){
  group_sym <- rlang::ensym(group_col)

  # Union of the ligand tokens already known in one group. Returns NA (not "")
  # when the group has no usable token, so rows are never filled with an
  # empty string.
  collapse_ligands <- function(ids) {
    tokens <- unique(unlist(stringr::str_split(ids[!is.na(ids)], " ")))
    tokens <- tokens[!is.na(tokens) & nzchar(tokens)]  # drop blanks from repeated spaces
    if (length(tokens) == 0L) {
      NA_character_
    } else {
      paste(tokens, collapse = " ")
    }
  }

  data %>%
    dplyr::group_by(!!group_sym) %>%
    dplyr::mutate(
      ligandId = dplyr::case_when(
        # PDB rows and rows that already carry a value are left untouched.
        PDB == TRUE ~ ligandId,
        !is.na(ligandId) ~ ligandId,
        # NA rows inherit the group's ligands only if the group has a PDB row.
        any(PDB == TRUE, na.rm = TRUE) & is.na(ligandId) ~ collapse_ligands(ligandId),
        TRUE ~ NA_character_
      )
    ) %>%
    dplyr::ungroup()
}
# Apply assingn_lig() once per grouping column, in hierarchy order from `1`
# down to `0.2`; each pass receives the result of the previous one.
Reduce(
  function(filled, level) assingn_lig(filled, group_col = !!rlang::sym(level)),
  c("1", "0.9", "0.8", "0.7", "0.6", "0.5", "0.4", "0.3", "0.2"),
  tmp2
)
Итак, кто-нибудь знает лучшую стратегию?
Это вывод:
# A tibble: 67 x 12
ID PDB ligandId `1` `0.9` `0.8` `0.7` `0.6` `0.5` `0.4` `0.3` `0.2`
<chr> <lgl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 WP_012391491.1/58-334 FALSE CL MSE RIB 138240 115828 110007 67870 8366 568 2884 9 1
2 WP_045025307.1/57-335 FALSE CL MSE RIB 50687 171070 126908 102232 8366 568 2884 9 1
3 WP_065911868.1/57-334 FALSE CL MSE RIB 1378 47923 41980 35151 23618 568 2884 9 1
4 WP_094130548.1/57-334 FALSE CL MSE RIB 126023 71525 81956 42345 32642 568 2884 9 1
5 WP_041093274.1/57-335 FALSE CL MSE RIB 237469 156529 146463 92357 70323 568 2884 9 1
6 WP_087741863.1/58-335 FALSE CL MSE RIB 124229 98825 7523 99040 78974 568 2884 9 1
7 WP_048735837.1/58-335 FALSE CL MSE RIB 41453 165289 7523 99040 78974 568 2884 9 1
8 WP_024526760.1/58-335 FALSE CL MSE RIB 217657 189247 7523 99040 78974 568 2884 9 1
9 YP_006375059.1/60-339 FALSE CL MSE RIB 91126 31455 13169 6533 1269 28283 2884 9 1
10 4RK1_A TRUE CL MSE RIB 183150 31455 13169 6533 1269 28283 2884 9 1
# … with 57 more rows
> dput(tmp1)
structure(list(ID = c("WP_012391491.1/58-334", "WP_045025307.1/57-335",
"WP_065911868.1/57-334", "WP_094130548.1/57-334", "WP_041093274.1/57-335",
"WP_087741863.1/58-335", "WP_048735837.1/58-335", "WP_024526760.1/58-335",
"YP_006375059.1/60-339", "4RK1_A", "WP_081134210.1/58-337", "WP_067481377.1/58-337",
"WP_023519081.1/58-337", "WP_005918334.1/58-332", "WP_011673819.1/58-332",
"WP_101874263.1/58-332", "YP_004891129.1/58-332", "WP_021730312.1/58-332",
"WP_105451130.1/58-332", "WP_105448628.1/58-332", "4RK0_A", "NP_816580.1/58-331",
"WP_014215863.1/58-333", "WP_014074009.1/57-334", "WP_014939645.1/57-334",
"WP_057909529.1/57-334", "WP_035168530.1/57-335", "YP_001328142.1/20-321",
"NP_386672.1/20-321", "NP_437689.1/21-322", "WP_064322056.1/23-326",
"WP_095444766.1/23-326", "WP_022561933.1/19-321", "WP_060691636.1/22-324",
"WP_062000852.1/16-317", "4RY9_A", "WP_011809835.1/29-330", "WP_067873970.1/47-338",
"WP_067195222.1/48-334", "WP_108390182.1/48-337", "WP_088455092.1/48-337",
"WP_013585042.1/48-336", "WP_094181221.1/110-393", "WP_013118380.1/59-348",
"WP_078845346.1/48-338", "WP_058920843.1/26-338", "WP_020937919.1/34-340",
"WP_029381425.1/47-337", "WP_013152136.1/51-349", "WP_079255911.1/49-343",
"WP_078638163.1/49-341", "WP_064731434.1/47-343", "WP_031033051.1/49-344",
"WP_095681865.1/49-343", "WP_079255907.1/54-335", "WP_077275989.1/55-331",
"WP_071455952.1/54-331", "WP_068799469.1/44-336", "WP_108392182.1/49-339",
"WP_012865347.1/56-345", "WP_014104198.1/48-350", "WP_007397977.1/51-337",
"WP_087607280.1/49-337", "WP_028700814.1/24-338", "WP_028700389.1/48-340",
"NP_350138.1/55-335", "3G85_A"), PDB = c(FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
TRUE), ligandId = c("CL MSE RIB", "CL MSE RIB", "CL MSE RIB",
"CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB",
"CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB",
"CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB",
"CL MSE RIB", "CL MSE RIB", "RIB", "RIB", "CL MSE RIB", "CL MSE RIB",
"CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "GOL TLZ", "GOL TLZ",
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ",
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ",
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ",
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ",
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ",
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ",
"GOL MSE", "GOL MSE"), `1` = c(138240, 50687, 1378, 126023, 237469,
124229, 41453, 217657, 91126, 183150, 107179, 6274, 262561, 176386,
148744, 74195, 93951, 264634, 95933, 96095, 183149, 188356, 250907,
252918, 243804, 66122, 207410, 87437, 181530, 187508, 16752,
81323, 265938, 58828, 13021, 638, 638, 28566, 4878, 89140, 122445,
155228, 125955, 164812, 120428, 55374, 257779, 219988, 170251,
116415, 120125, 14694, 208562, 78746, 116429, 115371, 21301,
27608, 88975, 166418, 252468, 197645, 123797, 220728, 220653,
182317, 183297), `0.9` = c(115828, 171070, 47923, 71525, 156529,
98825, 165289, 189247, 31455, 31455, 88210, 61492, 196793, 131804,
143158, 10327, 10327, 10327, 26521, 26521, 31058, 31058, 123582,
121348, 186492, 34988, 162176, 25258, 25258, 138442, 15902, 15902,
196359, 62175, 46251, 147381, 147381, 56135, 59451, 76614, 102076,
130255, 71453, 114643, 96504, 58676, 197451, 192775, 114280,
23991, 23991, 53944, 190660, 69970, 97189, 94703, 43808, 33373,
76685, 111928, 125148, 137147, 98782, 193109, 193143, 26538,
26538), `0.8` = c(110007, 126908, 41980, 81956, 146463, 7523,
7523, 7523, 13169, 13169, 13169, 54211, 144263, 30337, 144486,
68026, 68026, 68026, 69168, 69168, 67041, 67041, 93235, 93313,
91652, 132429, 139245, 64638, 64638, 62222, 81172, 81172, 141549,
44105, 47435, 96620, 96620, 54519, 55356, 70400, 84802, 119491,
82179, 108083, 39098, 131355, 142185, 137368, 106528, 8338, 8338,
8338, 8338, 83518, 39373, 26484, 28621, 51215, 70429, 105638,
23426, 23426, 85745, 137611, 137684, 57248, 57248), `0.7` = c(67870,
102232, 35151, 42345, 92357, 99040, 99040, 99040, 6533, 6533,
6533, 6533, 6533, 20017, 20017, 47416, 47416, 47416, 48130, 48130,
82070, 82070, 73319, 73626, 7056, 7056, 7056, 46979, 46979, 81437,
13586, 13586, 13586, 27032, 26586, 85055, 85055, 13402, 13402,
15242, 15242, 77617, 42498, 67680, 6951, 6951, 6951, 112867,
67415, 12983, 12983, 12983, 12983, 12983, 54691, 23945, 25470,
33887, 49979, 65311, 19335, 19335, 19335, 113208, 113217, 81718,
81718), `0.6` = c(8366, 8366, 23618, 32642, 70323, 78974, 78974,
78974, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 1269,
1269, 1269, 1269, 1269, 1269, 56028, 56694, 83944, 83944, 83944,
11860, 11860, 11860, 15130, 15130, 15130, 15130, 25412, 64954,
64954, 24654, 24654, 38268, 38268, 59264, 32603, 51690, 10955,
10955, 10955, 10955, 50901, 18836, 18836, 18836, 18836, 18836,
42361, 6461, 6461, 28216, 38284, 50584, 61467, 61467, 61467,
11132, 11132, 62398, 62398), `0.5` = c(568, 568, 568, 568, 568,
568, 568, 568, 28283, 28283, 28283, 28283, 28283, 28283, 28283,
28283, 28283, 28283, 28283, 28283, 28283, 28283, 43941, 44445,
66203, 66203, 66203, 1080, 1080, 1080, 1080, 1080, 1080, 1080,
1080, 1080, 1080, 3884, 3884, 3884, 3884, 3884, 4264, 4264, 8047,
8047, 8047, 8047, 8047, 14712, 14712, 14712, 14712, 14712, 15093,
15671, 15671, 21991, 29850, 39515, 48308, 48308, 48308, 68304,
68304, 48996, 48996), `0.4` = c(2884, 2884, 2884, 2884, 2884,
2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884,
2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884,
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809,
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809,
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 13941,
13941), `0.3` = c(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1561, 1561, 1561, 1561, 1561,
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561,
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561,
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561,
1962, 1962), `0.2` = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -67L))
Любая помощь приветствуется. Заранее спасибо.