R: удалить группу, если все значения столбца равны - PullRequest
0 голосов
/ 01 мая 2020

Я хотел бы удалить группу («кластер»), только если все значения x ИЛИ y одинаковы.

Пример данных:

structure(list(x = c(114.908, 115.166, 152.067, 115.17, 115.1116, 
115.133, 152.0666, 115.166999817, 115.148, 152.05, 115.16667, 
153.7, 152.100006104, 114.9083, 115.17, 114.995, 152.10001, 153.43416, 
114.917, 115.133003235, 114.916999817, 153.43, 115.166667, 115.167, 
152.1, 153.46133, 115.13, 153.69166, 152.083), y = c(-32.0383, 
-8.75, -33.483, -8.75, -32.1316, -32.1, -33.4666, -8.75, -32.0383, 
-33.416, -8.75, -29.883, -33.4333000183, -32.0383, -31.92, -31.745, 
-33.4333, -30.42583, -32, -32.0999984741, -32, -30.43, -8.75, 
-8.75, -33.43, -30.24382, -32.1, -29.89166, -33.45833), Cluster = c(1L, 
2L, 3L, 2L, 1L, 1L, 3L, 2L, 1L, 3L, 2L, 4L, 3L, 1L, 1L, 1L, 3L, 
4L, 1L, 1L, 1L, 4L, 2L, 2L, 3L, 4L, 1L, 4L, 3L)), row.names = c(1L, 
3L, 4L, 6L, 8L, 9L, 10L, 11L, 12L, 15L, 17L, 19L, 20L, 21L, 22L, 
23L, 25L, 28L, 36L, 38L, 44L, 48L, 58L, 60L, 61L, 66L, 67L, 74L, 
77L), class = "data.frame")

Таким образом, для этого набора данных будет удалён только кластер 2, потому что ВСЕ его значения y одинаковы.

ПРАВКА: добавлен ещё один пример набора данных. Решение сработало для большинства наборов данных, но группа (кластер) 12 почему-то не удаляется, хотя все её значения y одинаковы.

structure(list(x = c(-85.05, -78.1691665649, -114.669998169, 
-111.669998169, -85.0500030518, -84.7300033569, -110.870002747, 
-105.199996948, -84.7399978638, -80.3099975586, -85.6999969482, 
-112.080001831, -81.8000030518, -105.279998779, -110.339996338, 
-78.2152786255, -80.8166656494, -110.339996338, -84.9599990845, 
-111.050003052, -85.0500030518, -110.970001221, -78.0400009155, 
-85.0800018311, -84.9700012207, -80.3000030518, -85.0299987793, 
-78.1594467163, -105.230003357, -84.9199981689, -85.0299987793, 
-94.6399993896, -78.7200012207, -114.720001221, -110.279998779, 
-80.8170013428, -111, -85.9199981689, -99.9300003052, -81.7799987793, 
-110.849998474, -79, -114.669998169, -80.8199996948, -110.800003052, 
-110.540000916, -78.2099990845, -85.7, -81.8000030518, -78.8499984741, 
-85.696548, -85.696538, -85.65425, -79.053206, -80.003386, -111.047195, 
-110.817841, -110.817736, -105.25911, -92.860468, -81.70401, 
-79.006, -82.0184, -93.418611, -93.339444, -93.598611, -95.018333, 
-111.048697, -111.050267, -110.908172, -94.4375, -99.9153, -99.9163, 
-99.9185, -93.418333, -93.183056, -93.512222, -93.726389, -93.228333, 
-94.269167, -94.960833, -95.68, -92.85, -92.850833, -92.851667, 
-92.85, -85.7122, -85.796834, -92.402778, -111.452528, -111.747278, 
-111.919933, -80.817, -111.744028, -114.83, -114.8322, -85.05, 
-92.750533, -95.240417, -105.18, -105.191667, -105.283333, -105.183334, 
-111.666667, -111.66, -110.839201, -114.733333, -114.728758, 
-110.8, -110.8, -110.34, -110.341667, -110.96, -110.966667, -105.19, 
-105.196667, -114.666667, -114.66, -105.233333, -105.23, -105.283333, 
-105.28, -114.716667, -114.71, -111.05, -111.05, -110.9, -111.733333, 
-114.8, -78.858333, -112.08, -112.083333, -80.075783, -99.941667, 
-99.93, -114.783333, -79, -105.283333, -110.833333, -111.451112, 
-111.747946, -110.892009), y = c(10.0833333333, 2.97388887405, 
18.3799991608, 24.5, 10.0799999237, 9.93000030518, 27.9200000763, 
20.6499996185, 9.86999988556, 7.36999988556, 10.5500001907, 24.5900001526, 
7.59999990463, 21.5200004578, 24.1299991608, 2.92888879776, -1.54999995232, 
24, 10.0600004196, 27.9300003052, 10.0833330154, 18.75, 2.84999990463, 
10.1099996567, 10.0600004196, 7.92999982834, 10.0500001907, 3.00138878822, 
21.1700000763, 10.0200004578, 10.1199998856, 15.9799995422, 8.69999980927, 
18.3500003815, 24.1200008392, -1.54999995232, 19, 10.8599996567, 
16.8500003815, 7.84000015259, 27.9699993134, 8.42000007629, 18.3199996948, 
-1.54999995232, 19.3199996948, 24.3199996948, 2.99000000954, 
10.55, 7.28000020981, 8.44999980927, 10.573107, 10.573117, 10.68032, 
8.623806, 7.622776, 18.796213, 19.310653, 19.311124, 20.558141, 
15.169063, 7.634792, 8.5307, 7.9751, 15.528611, 15.468889, 15.678889, 
16.068889, 18.779722, 18.778136, 18.77845, 16.135833, 16.827, 
16.829, 16.8236, 15.618611, 15.335556, 15.633889, 15.775833, 
15.301944, 16.042222, 16.167778, 15.9, 15.160278, 15.160278, 
15.158056, 15.16, 10.5786, 11.136725, 14.704167, 28.319333, 28.5515, 
28.804092, -1.55, 28.535639, 31.02, 31.0225, 10.083333, 14.8833, 
16.099567, 21.43, 21.441667, 21.533333, 21.433332, 24.5, 24.5, 
27.954582, 18.366667, 18.346264, 19.316667, 19.31, 24.13, 24, 
18.75, 18.75, 20.65, 20.65, 18.383333, 18.38, 21.166667, 21.16, 
21.516667, 21.51, 18.35, 18.35, 27.933333, 27.93, 27.933333, 
24.483333, 31.33, 8.458333, 24.59, 24.591667, 7.357215, 16.858333, 
16.85, 31.3, 8.416667, 21.183333, 27.866667, 28.317249, 28.551499, 
27.909534), Cluster = c(1L, 2L, 3L, 4L, 1L, 1L, 5L, 6L, 1L, 7L, 
8L, 4L, 9L, 10L, 11L, 2L, 12L, 11L, 1L, 5L, 1L, 13L, 2L, 1L, 
1L, 7L, 1L, 2L, 10L, 1L, 1L, 14L, 15L, 3L, 11L, 12L, 13L, 8L, 
16L, 9L, 5L, 15L, 3L, 12L, 13L, 11L, 2L, 8L, 9L, 15L, 8L, 8L, 
8L, 15L, 7L, 13L, 13L, 13L, 6L, 17L, 9L, 15L, 9L, 17L, 17L, 17L, 
14L, 13L, 13L, 13L, 14L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 17L, 
14L, 14L, 14L, 17L, 17L, 17L, 17L, 8L, 8L, 17L, 18L, 18L, 18L, 
12L, 18L, 19L, 19L, 1L, 17L, 14L, 10L, 10L, 10L, 10L, 4L, 4L, 
5L, 3L, 3L, 13L, 13L, 11L, 11L, 13L, 13L, 6L, 6L, 3L, 3L, 10L, 
10L, 10L, 10L, 3L, 3L, 5L, 5L, 5L, 4L, 19L, 15L, 4L, 4L, 7L, 
16L, 16L, 19L, 15L, 10L, 5L, 18L, 18L, 5L)), row.names = c(NA, 
-142L), class = c("tbl_df", "tbl", "data.frame"))

1 Ответ

4 голоса
/ 01 мая 2020

Использование dplyr:

library(dplyr)

# Returns TRUE when every value of `v` is the same within `tol`.
#
# Exact comparison (n_distinct(v) == 1) is too strict for doubles: values such
# as -1.54999995232 and -1.55 are "the same" coordinate but are not identical,
# so a group containing both would never be dropped.
#
# v   : numeric vector (one column of one group).
# tol : largest spread (max - min) that still counts as "all equal".
#
# NA handling mirrors n_distinct(), where NA counts as its own value:
# an all-NA vector is constant, a mix of NA and numbers is not.
is_constant <- function(v, tol = 1e-6) {
  if (length(v) == 0L) {
    return(FALSE)
  }
  if (anyNA(v)) {
    return(all(is.na(v)))
  }
  diff(range(v)) <= tol
}

# Keep only the clusters in which neither x nor y is constant, then drop the
# grouping so later verbs operate on the whole table again.
df %>%
  group_by(Cluster) %>%
  filter(!(is_constant(x) || is_constant(y))) %>%
  ungroup()

#      x     y Cluster
#   <dbl> <dbl>   <int>
# 1  115. -32.0       1
# 2  152. -33.5       3
# 3  115. -32.1       1
# 4  115. -32.1       1
# 5  152. -33.5       3
# 6  115. -32.0       1
# 7  152. -33.4       3
# 8  154. -29.9       4
# 9  152. -33.4       3
#10  115. -32.0       1
# … with 13 more rows

Та же логика с использованием data.table:

library(data.table)
# Returns TRUE when every value of `v` is the same within `tol`.
# uniqueN(v) == 1 compares doubles exactly, so near-identical values
# (e.g. -1.54999995232 vs -1.55) would keep a group that should be dropped.
# NA counts as its own value, as in uniqueN(): all-NA is constant, a mix of
# NA and numbers is not.
is_constant <- function(v, tol = 1e-6) {
  if (length(v) == 0L) {
    return(FALSE)
  }
  if (anyNA(v)) {
    return(all(is.na(v)))
  }
  diff(range(v)) <= tol
}

# Per Cluster, keep all rows of the group unless x or y is constant.
# Note: setDT() converts `df` to a data.table by reference.
setDT(df)[, .SD[!(is_constant(x) || is_constant(y))], by = Cluster]
...