У меня есть фрейм данных со следующей структурой:
# A tibble: 95 x 7
# Groups: WallReg_2p5 [19]
CellID_2p5 Y_Coord_2p5Weighting WallReg_2p5 piC_1 piC_2 piC_3 piC_4
<int> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
1 6561 0.915 African 6.55 6.63 5.84 0.766
2 6278 0.947 African 15.1 5.59 2.15 2.01
3 4394 0.971 African 11.4 3.92 0.774 1.47
4 4840 0.994 African 4.70 0.962 6.21 3.54
5 4105 0.947 African 6.35 2.10 2.25 3.24
6 5228 1.000 Amazonian 8.49 5.00 1.92 2.42
7 5089 1.000 Amazonian 15.6 6.48 2.53 2.89
8 4939 0.998 Amazonian 5.56 2.94 0.389 2.44
9 5088 1.000 Amazonian 12.9 5.16 1.99 3.13
10 4947 0.998 Amazonian 8.05 11.2 2.54 4.61
# ... with 85 more rows
Вот dput() подмножества фрейма данных. Мой реальный набор данных состоит из 10 368 строк и 255 611 столбцов:
structure(list(CellID_2p5 = c(6561L, 6278L, 4394L, 4840L, 4105L,
5228L, 5089L, 4939L, 5088L, 4947L, 1710L, 2569L, 1438L, 1175L,
1840L, 6888L, 7185L, 6031L, 7045L, 7044L, 3432L, 3288L, 3143L,
3574L, 3577L, 3260L, 1959L, 2568L, 2986L, 2386L, 5551L, 5407L,
5556L, 4979L, 5694L, 5303L, 4442L, 5587L, 5157L, 4865L, 3294L,
3009L, 2865L, 2722L, 3151L, 6427L, 6571L, 5996L, 6570L, 6139L,
3631L, 3920L, 3342L, 3341L, 4064L, 2617L, 2049L, 3346L, 1599L,
3205L, 7487L, 6612L, 6613L, 7630L, 7916L, 3854L, 3561L, 4290L,
4138L, 3704L, 4211L, 4068L, 4069L, 4357L, 4648L, 5601L, 5600L,
5455L, 5456L, 5458L, 3978L, 3822L, 3532L, 3832L, 3834L, 7105L,
6817L, 6104L, 7963L, 6098L, 3418L, 3424L, 3281L, 3566L, 3273L
), Y_Coord_2p5Weighting = c(0.915311479119447, 0.946930129495106,
0.971342069813261, 0.99405633822232, 0.946930129495106, 0.999762027079909,
0.999762027079909, 0.997858923238603, 0.999762027079909, 0.997858923238603,
0.480988768919388, 0.691513055782269, 0.402746689858737, 0.362438038283702,
0.518773258160522, 0.876726755707508, 0.831469612302545, 0.971342069813261,
0.854911870672947, 0.854911870672947, 0.854911870672947, 0.831469612302545,
0.806444604267483, 0.876726755707508, 0.876726755707508, 0.831469612302545,
0.555570233019602, 0.691513055782269, 0.779884483092882, 0.659345815100069,
0.99405633822232, 0.997858923238603, 0.99405633822232, 0.997858923238603,
0.988361510467761, 0.999762027079909, 0.971342069813261, 0.99405633822232,
0.999762027079909, 0.99405633822232, 0.831469612302545, 0.779884483092882,
0.751839807478977, 0.722363962059756, 0.806444604267483, 0.932007869282799,
0.915311479119447, 0.971342069813261, 0.915311479119447, 0.960049854385929,
0.896872741532688, 0.932007869282799, 0.854911870672947, 0.854911870672947,
0.946930129495106, 0.722363962059756, 0.591309648363582, 0.854911870672947,
0.480988768919388, 0.831469612302545, 0.779884483092882, 0.915311479119447,
0.915311479119447, 0.751839807478977, 0.691513055782269, 0.915311479119447,
0.876726755707508, 0.960049854385929, 0.946930129495106, 0.896872741532688,
0.960049854385929, 0.946930129495106, 0.946930129495106, 0.971342069813261,
0.988361510467761, 0.99405633822232, 0.99405633822232, 0.997858923238603,
0.997858923238603, 0.997858923238603, 0.932007869282799, 0.915311479119447,
0.876726755707508, 0.915311479119447, 0.915311479119447, 0.831469612302545,
0.876726755707508, 0.960049854385929, 0.659345815100069, 0.960049854385929,
0.854911870672947, 0.854911870672947, 0.831469612302545, 0.876726755707508,
0.831469612302545), WallReg_2p5 = c("African", "African", "African",
"African", "African", "Amazonian", "Amazonian", "Amazonian",
"Amazonian", "Amazonian", "Arctico-Siberian", "Arctico-Siberian",
"Arctico-Siberian", "Arctico-Siberian", "Arctico-Siberian", "Australian",
"Australian", "Australian", "Australian", "Australian", "Chinese",
"Chinese", "Chinese", "Chinese", "Chinese", "Eurasian", "Eurasian",
"Eurasian", "Eurasian", "Eurasian", "Guineo-Congolian", "Guineo-Congolian",
"Guineo-Congolian", "Guineo-Congolian", "Guineo-Congolian", "Indo-Malayan",
"Indo-Malayan", "Indo-Malayan", "Indo-Malayan", "Indo-Malayan",
"Japanese", "Japanese", "Japanese", "Japanese", "Japanese", "Madagascan",
"Madagascan", "Madagascan", "Madagascan", "Madagascan", "Mexican",
"Mexican", "Mexican", "Mexican", "Mexican", "North American",
"North American", "North American", "North American", "North American",
"Novozelandic", "Novozelandic", "Novozelandic", "Novozelandic",
"Novozelandic", "Oriental", "Oriental", "Oriental", "Oriental",
"Oriental", "Panamanian", "Panamanian", "Panamanian", "Panamanian",
"Panamanian", "Papua-Melanesian", "Papua-Melanesian", "Papua-Melanesian",
"Papua-Melanesian", "Papua-Melanesian", "Saharo-Arabian", "Saharo-Arabian",
"Saharo-Arabian", "Saharo-Arabian", "Saharo-Arabian", "South American",
"South American", "South American", "South American", "South American",
"Tibetan", "Tibetan", "Tibetan", "Tibetan", "Tibetan"), piC_1 = c(6.54637718200684,
15.1273813247681, 11.4171981811523, 4.70245027542114, 6.35227298736572,
8.48885822296143, 15.5538415908813, 5.56155681610107, 12.9046697616577,
8.04517650604248, 2.95071268081665, 21.6441345214844, 11.2329692840576,
16.1649322509766, 17.2905006408691, 3.43583130836487, 10.0594062805176,
12.3438568115234, 7.94222640991211, 6.89916276931763, 7.45456171035767,
8.77329444885254, 14.3378238677979, 3.86588025093079, 12.4889860153198,
7.18962049484253, 19.2145137786865, 22.0060653686523, 1.86285281181335,
2.09195709228516, 9.87592029571533, 12.2629871368408, 7.31402492523193,
0.601671099662781, 6.9998254776001, 20.6269207000732, 6.21515369415283,
22.039529800415, 8.35955047607422, 9.50113105773926, 7.06818675994873,
4.63532447814941, 5.81412315368652, 0.996474027633667, 8.32744407653809,
5.03945255279541, 0.893457889556885, 2.42736291885376, 10.3842725753784,
3.32475543022156, 8.1105375289917, 6.61336517333984, 4.06754541397095,
3.31069254875183, 8.05746650695801, 1.24714422225952, 6.44647121429443,
2.97141313552856, 13.3264999389648, 4.86157178878784, 6.71903085708618,
20.3318004608154, 20.8287792205811, 10.0042209625244, 12.7859420776367,
13.6358938217163, 15.9491415023804, 11.4823551177979, 18.6053276062012,
16.6047229766846, 16.1496143341064, 2.9492039680481, 13.8130388259888,
18.6300754547119, 14.464674949646, 4.92032289505005, 0.511945068836212,
3.16324853897095, 13.3062620162964, 9.84803581237793, 1.74625515937805,
2.54861640930176, 9.97869968414307, 11.2339553833008, 0.865878522396088,
14.7632684707642, 21.8330593109131, 6.42118740081787, 9.51691722869873,
13.2857227325439, 4.01672554016113, 10.9487056732178, 13.6308097839355,
4.69979858398438, 1.83490359783173), piC_2 = c(6.62732124328613,
5.59194660186768, 3.92186212539673, 0.962285339832306, 2.1002824306488,
4.99801731109619, 6.4822793006897, 2.94481801986694, 5.16082000732422,
11.2070302963257, 0.585842967033386, 4.83236265182495, 1.637331366539,
7.65087461471558, 2.28347945213318, 7.16115474700928, 3.54162955284119,
5.23653078079224, 2.28897953033447, 2.29887819290161, 0.752622723579407,
0.653791189193726, 1.5378258228302, 2.15203213691711, 1.64702248573303,
6.0682373046875, 0.22119003534317, 4.76900386810303, 0.366481363773346,
6.11435651779175, 10.8921070098877, 7.97591733932495, 6.05282688140869,
3.74584698677063, 5.75792741775513, 0.471727430820465, 2.75132250785828,
1.21862363815308, 0.138835281133652, 2.98711204528809, 0.627980709075928,
0.108154557645321, 0.995486855506897, 2.4163064956665, 0.0193456951528788,
5.70003795623779, 5.56746625900269, 2.9861011505127, 0.344279021024704,
0.640789806842804, 9.4457426071167, 7.05727958679199, 3.89853048324585,
0.340702921152115, 1.17963445186615, 8.93050575256348, 14.796028137207,
4.88054323196411, 9.28642845153809, 7.68382120132446, 2.27267980575562,
0.916118919849396, 0.689630210399628, 0.549197673797607, 1.68408465385437,
1.76007652282715, 3.2269868850708, 0.980833470821381, 5.00142002105713,
3.41616177558899, 6.74930334091187, 12.0952653884888, 15.2918863296509,
0.105648428201675, 4.59846162796021, 1.48986113071442, 5.02905178070068,
5.07208204269409, 4.98251914978027, 4.70810985565186, 2.37468719482422,
6.78730487823486, 6.18559217453003, 11.6090707778931, 2.91017484664917,
3.51590204238892, 3.35987615585327, 8.74919319152832, 2.23059439659119,
0.292922139167786, 5.41262531280518, 8.86936473846436, 8.20160961151123,
7.33296489715576, 8.42716407775879), piC_3 = c(5.84101867675781,
2.14856338500977, 0.774434208869934, 6.21446466445923, 2.25056719779968,
1.9200998544693, 2.52935075759888, 0.38894659280777, 1.98762917518616,
2.53701376914978, 6.93642854690552, 0.608367025852203, 4.7472562789917,
1.25435817241669, 4.09390258789062, 5.41882562637329, 0.221905186772346,
3.72868466377258, 0.763698220252991, 0.783569753170013, 8.32380294799805,
4.482017993927, 2.38237118721008, 10.7143220901489, 10.1253957748413,
4.51582384109497, 5.18871164321899, 1.76670265197754, 7.50785446166992,
6.2304630279541, 8.79040622711182, 7.47595691680908, 1.57976567745209,
1.46996772289276, 0.894773840904236, 1.30858862400055, 7.34649181365967,
1.41060519218445, 2.03947067260742, 4.6038031578064, 4.44245910644531,
0.236538723111153, 0.194929093122482, 0.684483885765076, 0.530747056007385,
1.89696133136749, 1.94861626625061, 3.36041831970215, 0.0835498198866844,
2.04665040969849, 7.02379274368286, 2.93551588058472, 5.33355855941772,
1.59516668319702, 2.19099020957947, 2.88170146942139, 7.42911052703857,
4.64155960083008, 2.24829292297363, 3.64715957641602, 0.363596022129059,
1.41882479190826, 0.474381387233734, 2.24125337600708, 4.11492681503296,
3.44695138931274, 3.08158445358276, 0.218709617853165, 2.44625425338745,
1.71628797054291, 1.75634157657623, 4.76044988632202, 0.387977868318558,
1.70636379718781, 1.70855867862701, 3.67641615867615, 0.744896650314331,
1.09648311138153, 1.37377882003784, 0.200171306729317, 1.4753475189209,
6.56762170791626, 7.72892284393311, 2.18395304679871, 0.481256455183029,
0.37385630607605, 4.25140476226807, 6.76727914810181, 4.81376981735229,
3.8882269859314, 2.90145373344421, 7.48540449142456, 9.90997123718262,
4.46362543106079, 5.19004011154175), piC_4 = c(0.765519082546234,
2.01459360122681, 1.4724348783493, 3.53503012657166, 3.23746180534363,
2.42439723014832, 2.89345812797546, 2.43676805496216, 3.13469624519348,
4.61154937744141, 4.51843070983887, 0.767921149730682, 5.01102733612061,
2.94891023635864, 5.20972728729248, 1.1311411857605, 2.22004199028015,
3.79573369026184, 0.551535904407501, 0.574182093143463, 5.87988710403442,
5.06349992752075, 3.72144675254822, 8.49415874481201, 4.27884483337402,
2.48057842254639, 4.45665884017944, 0.667030334472656, 6.93020153045654,
2.26927351951599, 1.5674192905426, 3.63813829421997, 2.73822736740112,
0.674351632595062, 1.89532685279846, 4.79139471054077, 1.34277474880219,
0.564522683620453, 3.33897042274475, 1.42253696918488, 2.7286331653595,
0.960368096828461, 2.00121903419495, 4.58775472640991, 2.11190366744995,
0.29313051700592, 0.0706640183925629, 2.87113666534424, 1.36242246627808,
3.57689785957336, 2.05132532119751, 0.340487778186798, 1.3506361246109,
0.400035679340363, 1.65728294849396, 5.17583227157593, 6.23331356048584,
1.60608506202698, 6.12336874008179, 0.46411395072937, 0.205161795020103,
1.93029391765594, 2.6833176612854, 0.199026927351952, 0.0609574876725674,
1.12770354747772, 1.49503016471863, 0.299944281578064, 0.302427768707275,
0.745285212993622, 2.91650176048279, 4.18865776062012, 2.71514081954956,
1.93356776237488, 1.67894613742828, 1.67655885219574, 3.09425163269043,
2.87126135826111, 2.42724895477295, 5.48751878738403, 3.4703311920166,
3.71456289291382, 4.29666662216187, 3.37810254096985, 3.07785415649414,
1.90873026847839, 3.57397627830505, 0.902793109416962, 3.96058869361877,
0.35958793759346, 2.9896719455719, 1.81924939155579, 4.22445392608643,
2.22684979438782, 4.53710412979126)), row.names = c(NA, -95L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), .Names = c("CellID_2p5", "Y_Coord_2p5Weighting",
"WallReg_2p5", "piC_1", "piC_2", "piC_3", "piC_4"), vars = "WallReg_2p5", drop = TRUE, indices = list(
0:4, 5:9, 10:14, 15:19, 20:24, 25:29, 30:34, 35:39, 40:44,
45:49, 50:54, 55:59, 60:64, 65:69, 70:74, 75:79, 80:84, 85:89,
90:94), group_sizes = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), biggest_group_size = 5L, labels = structure(list(
WallReg_2p5 = c("African", "Amazonian", "Arctico-Siberian",
"Australian", "Chinese", "Eurasian", "Guineo-Congolian",
"Indo-Malayan", "Japanese", "Madagascan", "Mexican", "North American",
"Novozelandic", "Oriental", "Panamanian", "Papua-Melanesian",
"Saharo-Arabian", "South American", "Tibetan")), row.names = c(NA,
-19L), class = "data.frame", vars = "WallReg_2p5", drop = TRUE, .Names = "WallReg_2p5"))
Я пытаюсь вычислить взвешенные значения всех столбцов piC_ для каждого региона. Процесс для каждого столбца piC_x включает три шага:
- умножить каждую строку столбца piC_x на соответствующее значение в Y_Coord_2p5Weighting;
- просуммировать взвешенные значения piC_x внутри каждой группы WallReg_2p5;
- разделить полученную сумму piC_x на сумму значений Y_Coord_2p5Weighting в той же группе WallReg_2p5.
Из прочитанного я понял, что на больших наборах данных data.table работает быстрее, чем dplyr, но я готов использовать любой из этих пакетов или даже базовый R. Я попробовал оба варианта, но с data.table получаю неправильные результаты, а в случае dplyr меня беспокоит скорость, когда я применю это к полному фрейму данных. Вот что я пробовал до сих пор.
dplyr
# Baseline: per-region mean/min/max of the raw (unweighted) piC_1 values.
# The summary is printed rather than assigned back to `df`: overwriting `df`
# with this 19-row summary would drop the piC_ and weighting columns that the
# weighting steps further down still need.
df %>%
  tbl_df() %>%
  group_by(WallReg_2p5) %>%
  summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))
# A tibble: 19 x 4
WallReg_2p5 meanS minS maxS
<chr> <dbl> <dbl> <dbl>
1 African 8.83 4.70 15.1
2 Amazonian 10.1 5.56 15.6
3 Arctico-Siberian 13.9 2.95 21.6
4 Australian 8.14 3.44 12.3
5 Chinese 9.38 3.87 14.3
6 Eurasian 10.5 1.86 22.0
7 Guineo-Congolian 7.41 0.602 12.3
8 Indo-Malayan 13.3 6.22 22.0
9 Japanese 5.37 0.996 8.33
10 Madagascan 4.41 0.893 10.4
11 Mexican 6.03 3.31 8.11
12 North American 5.77 1.25 13.3
13 Novozelandic 14.1 6.72 20.8
14 Oriental 15.3 11.5 18.6
15 Panamanian 13.2 2.95 18.6
16 Papua-Melanesian 6.35 0.512 13.3
17 Saharo-Arabian 5.27 0.866 11.2
18 South American 13.2 6.42 21.8
19 Tibetan 7.03 1.83 13.6
## Weighted mean of every piC_ column within each region.
## Relies on `df` being grouped by WallReg_2p5 (as in the dput above), so the
## sum() calls below are evaluated per group rather than over the whole table.
weighted <- df %>%
  ## step 1: multiply each row by its lat weight
  mutate_at(vars(starts_with("piC_")), funs(. * Y_Coord_2p5Weighting)) %>%
  ## step 2: replace each value with the sum of the weighted values
  mutate_at(vars(starts_with("piC_")), funs(sum)) %>%
  ## step 3: divide the summed weighted values by the sum of the weights
  mutate_at(vars(starts_with("piC_")), funs(. / sum(Y_Coord_2p5Weighting)))
## After the steps above every row of a region carries the same value,
## so mean, min and max coincide in this check.
weighted %>%
  tbl_df() %>%
  group_by(WallReg_2p5) %>%
  summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))
# A tibble: 19 x 4
WallReg_2p5 meanS minS maxS
<chr> <dbl> <dbl> <dbl>
1 African 8.82 8.82 8.82
2 Amazonian 10.1 10.1 10.1
3 Arctico-Siberian 14.5 14.5 14.5
4 Australian 8.21 8.21 8.21
5 Chinese 9.32 9.32 9.32
6 Eurasian 9.86 9.86 9.86
7 Guineo-Congolian 7.41 7.41 7.41
8 Indo-Malayan 13.4 13.4 13.4
9 Japanese 5.47 5.47 5.47
10 Madagascan 4.38 4.38 4.38
11 Mexican 6.10 6.10 6.10
12 North American 5.09 5.09 5.09
13 Novozelandic 14.6 14.6 14.6
14 Oriental 15.2 15.2 15.2
15 Panamanian 13.2 13.2 13.2
16 Papua-Melanesian 6.36 6.36 6.36
17 Saharo-Arabian 5.22 5.22 5.22
18 South American 13.2 13.2 13.2
19 Tibetan 7.01 7.01 7.01
С dplyr я получаю правильные значения. Однако с data.table значения получаются неправильными. Я взял за основу код из вопроса по ссылке ниже, но явно делаю что-то не так.
data.table
## Convert the grouped tibble to a data.table keyed on the region column,
## then confirm both the class and the key.
df <- df %>%
  group_by(WallReg_2p5) %>%
  as.data.table() %>%
  setkey(WallReg_2p5)
is.data.table(df)
haskey(df)
[1] TRUE
[1] TRUE
## Same per-region summary as in the dplyr section: the raw piC_1 values
## are unchanged by the conversion to data.table.
df %>%
  tbl_df() %>%
  group_by(WallReg_2p5) %>%
  summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))
# A tibble: 19 x 4
WallReg_2p5 meanS minS maxS
<chr> <dbl> <dbl> <dbl>
1 African 8.83 4.70 15.1
2 Amazonian 10.1 5.56 15.6
3 Arctico-Siberian 13.9 2.95 21.6
4 Australian 8.14 3.44 12.3
5 Chinese 9.38 3.87 14.3
6 Eurasian 10.5 1.86 22.0
7 Guineo-Congolian 7.41 0.602 12.3
8 Indo-Malayan 13.3 6.22 22.0
9 Japanese 5.37 0.996 8.33
10 Madagascan 4.41 0.893 10.4
11 Mexican 6.03 3.31 8.11
12 North American 5.77 1.25 13.3
13 Novozelandic 14.1 6.72 20.8
14 Oriental 15.3 11.5 18.6
15 Panamanian 13.2 2.95 18.6
16 Papua-Melanesian 6.35 0.512 13.3
17 Saharo-Arabian 5.27 0.866 11.2
18 South American 13.2 6.42 21.8
19 Tibetan 7.03 1.83 13.6
# Attempt to reproduce the dplyr weighting with data.table::set(), based on
# https://stackoverflow.com/q/28123098/1710632
#
# Why this yields a single value for all regions: set() has no `by` argument
# and updates the whole column at once, and df[[j]] / df[["..."]] extract the
# full column vectors. Both sum() calls below therefore run over all rows of
# the table, not over each WallReg_2p5 group; the key set earlier does not
# change that.
#
# NOTE(review): a per-group version needs `by` inside `[.data.table`, e.g.
#   df[, lapply(.SD, weighted.mean, w = Y_Coord_2p5Weighting),
#      by = WallReg_2p5, .SDcols = indx]
# -- untested here, confirm against the dplyr result above.
indx <- grep("piC_", colnames(df))
for (j in indx) {
set(df, i = NULL, j = j, value = df[[j]]*df[["Y_Coord_2p5Weighting"]]) ## multiply by weights (row-wise, this step is fine)
set(df, i = NULL, j = j, value = sum(df[[j]])) ## sum the weighted values -- over the WHOLE column, not per group
set(df, i = NULL, j = j, value = df[[j]]/sum(df[["Y_Coord_2p5Weighting"]])) ## divide by sum of weights -- again over all rows
}
## Wrong values: every region ends up with the same number, because the
## loop above produced one table-wide weighted mean per column.
df %>%
  tbl_df() %>%
  group_by(WallReg_2p5) %>%
  summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))
# A tibble: 19 x 4
WallReg_2p5 meanS minS maxS
<chr> <dbl> <dbl> <dbl>
1 African 9.27 9.27 9.27
2 Amazonian 9.27 9.27 9.27
3 Arctico-Siberian 9.27 9.27 9.27
4 Australian 9.27 9.27 9.27
5 Chinese 9.27 9.27 9.27
6 Eurasian 9.27 9.27 9.27
7 Guineo-Congolian 9.27 9.27 9.27
8 Indo-Malayan 9.27 9.27 9.27
9 Japanese 9.27 9.27 9.27
10 Madagascan 9.27 9.27 9.27
11 Mexican 9.27 9.27 9.27
12 North American 9.27 9.27 9.27
13 Novozelandic 9.27 9.27 9.27
14 Oriental 9.27 9.27 9.27
15 Panamanian 9.27 9.27 9.27
16 Papua-Melanesian 9.27 9.27 9.27
17 Saharo-Arabian 9.27 9.27 9.27
18 South American 9.27 9.27 9.27
19 Tibetan 9.27 9.27 9.27
В справке ?set сказано, что эта функция не может выполнять операции по группам, но я думал, что раз группы уже определены, такой подход сработает. Я никогда раньше не использовал data.table, поэтому буду благодарен за любые рекомендации.