Умножьте и разделите много столбцов на другой столбец для каждой группы. - PullRequest
0 голосов
/ 25 мая 2018

У меня есть фрейм данных со следующей структурой:

# A tibble: 95 x 7
# Groups:   WallReg_2p5 [19]
   CellID_2p5 Y_Coord_2p5Weighting WallReg_2p5 piC_1  piC_2 piC_3 piC_4
        <int>                <dbl> <chr>       <dbl>  <dbl> <dbl> <dbl>
 1       6561                0.915 African      6.55  6.63  5.84  0.766
 2       6278                0.947 African     15.1   5.59  2.15  2.01 
 3       4394                0.971 African     11.4   3.92  0.774 1.47 
 4       4840                0.994 African      4.70  0.962 6.21  3.54 
 5       4105                0.947 African      6.35  2.10  2.25  3.24 
 6       5228                1.000 Amazonian    8.49  5.00  1.92  2.42 
 7       5089                1.000 Amazonian   15.6   6.48  2.53  2.89 
 8       4939                0.998 Amazonian    5.56  2.94  0.389 2.44 
 9       5088                1.000 Amazonian   12.9   5.16  1.99  3.13 
10       4947                0.998 Amazonian    8.05 11.2   2.54  4.61 
# ... with 85 more rows

Вот dput() подмножества фрейма данных. Мой реальный набор данных состоит из 10 368 строк и 255 611 столбцов.

structure(list(CellID_2p5 = c(6561L, 6278L, 4394L, 4840L, 4105L, 
5228L, 5089L, 4939L, 5088L, 4947L, 1710L, 2569L, 1438L, 1175L, 
1840L, 6888L, 7185L, 6031L, 7045L, 7044L, 3432L, 3288L, 3143L, 
3574L, 3577L, 3260L, 1959L, 2568L, 2986L, 2386L, 5551L, 5407L, 
5556L, 4979L, 5694L, 5303L, 4442L, 5587L, 5157L, 4865L, 3294L, 
3009L, 2865L, 2722L, 3151L, 6427L, 6571L, 5996L, 6570L, 6139L, 
3631L, 3920L, 3342L, 3341L, 4064L, 2617L, 2049L, 3346L, 1599L, 
3205L, 7487L, 6612L, 6613L, 7630L, 7916L, 3854L, 3561L, 4290L, 
4138L, 3704L, 4211L, 4068L, 4069L, 4357L, 4648L, 5601L, 5600L, 
5455L, 5456L, 5458L, 3978L, 3822L, 3532L, 3832L, 3834L, 7105L, 
6817L, 6104L, 7963L, 6098L, 3418L, 3424L, 3281L, 3566L, 3273L
), Y_Coord_2p5Weighting = c(0.915311479119447, 0.946930129495106, 
0.971342069813261, 0.99405633822232, 0.946930129495106, 0.999762027079909, 
0.999762027079909, 0.997858923238603, 0.999762027079909, 0.997858923238603, 
0.480988768919388, 0.691513055782269, 0.402746689858737, 0.362438038283702, 
0.518773258160522, 0.876726755707508, 0.831469612302545, 0.971342069813261, 
0.854911870672947, 0.854911870672947, 0.854911870672947, 0.831469612302545, 
0.806444604267483, 0.876726755707508, 0.876726755707508, 0.831469612302545, 
0.555570233019602, 0.691513055782269, 0.779884483092882, 0.659345815100069, 
0.99405633822232, 0.997858923238603, 0.99405633822232, 0.997858923238603, 
0.988361510467761, 0.999762027079909, 0.971342069813261, 0.99405633822232, 
0.999762027079909, 0.99405633822232, 0.831469612302545, 0.779884483092882, 
0.751839807478977, 0.722363962059756, 0.806444604267483, 0.932007869282799, 
0.915311479119447, 0.971342069813261, 0.915311479119447, 0.960049854385929, 
0.896872741532688, 0.932007869282799, 0.854911870672947, 0.854911870672947, 
0.946930129495106, 0.722363962059756, 0.591309648363582, 0.854911870672947, 
0.480988768919388, 0.831469612302545, 0.779884483092882, 0.915311479119447, 
0.915311479119447, 0.751839807478977, 0.691513055782269, 0.915311479119447, 
0.876726755707508, 0.960049854385929, 0.946930129495106, 0.896872741532688, 
0.960049854385929, 0.946930129495106, 0.946930129495106, 0.971342069813261, 
0.988361510467761, 0.99405633822232, 0.99405633822232, 0.997858923238603, 
0.997858923238603, 0.997858923238603, 0.932007869282799, 0.915311479119447, 
0.876726755707508, 0.915311479119447, 0.915311479119447, 0.831469612302545, 
0.876726755707508, 0.960049854385929, 0.659345815100069, 0.960049854385929, 
0.854911870672947, 0.854911870672947, 0.831469612302545, 0.876726755707508, 
0.831469612302545), WallReg_2p5 = c("African", "African", "African", 
"African", "African", "Amazonian", "Amazonian", "Amazonian", 
"Amazonian", "Amazonian", "Arctico-Siberian", "Arctico-Siberian", 
"Arctico-Siberian", "Arctico-Siberian", "Arctico-Siberian", "Australian", 
"Australian", "Australian", "Australian", "Australian", "Chinese", 
"Chinese", "Chinese", "Chinese", "Chinese", "Eurasian", "Eurasian", 
"Eurasian", "Eurasian", "Eurasian", "Guineo-Congolian", "Guineo-Congolian", 
"Guineo-Congolian", "Guineo-Congolian", "Guineo-Congolian", "Indo-Malayan", 
"Indo-Malayan", "Indo-Malayan", "Indo-Malayan", "Indo-Malayan", 
"Japanese", "Japanese", "Japanese", "Japanese", "Japanese", "Madagascan", 
"Madagascan", "Madagascan", "Madagascan", "Madagascan", "Mexican", 
"Mexican", "Mexican", "Mexican", "Mexican", "North American", 
"North American", "North American", "North American", "North American", 
"Novozelandic", "Novozelandic", "Novozelandic", "Novozelandic", 
"Novozelandic", "Oriental", "Oriental", "Oriental", "Oriental", 
"Oriental", "Panamanian", "Panamanian", "Panamanian", "Panamanian", 
"Panamanian", "Papua-Melanesian", "Papua-Melanesian", "Papua-Melanesian", 
"Papua-Melanesian", "Papua-Melanesian", "Saharo-Arabian", "Saharo-Arabian", 
"Saharo-Arabian", "Saharo-Arabian", "Saharo-Arabian", "South American", 
"South American", "South American", "South American", "South American", 
"Tibetan", "Tibetan", "Tibetan", "Tibetan", "Tibetan"), piC_1 = c(6.54637718200684, 
15.1273813247681, 11.4171981811523, 4.70245027542114, 6.35227298736572, 
8.48885822296143, 15.5538415908813, 5.56155681610107, 12.9046697616577, 
8.04517650604248, 2.95071268081665, 21.6441345214844, 11.2329692840576, 
16.1649322509766, 17.2905006408691, 3.43583130836487, 10.0594062805176, 
12.3438568115234, 7.94222640991211, 6.89916276931763, 7.45456171035767, 
8.77329444885254, 14.3378238677979, 3.86588025093079, 12.4889860153198, 
7.18962049484253, 19.2145137786865, 22.0060653686523, 1.86285281181335, 
2.09195709228516, 9.87592029571533, 12.2629871368408, 7.31402492523193, 
0.601671099662781, 6.9998254776001, 20.6269207000732, 6.21515369415283, 
22.039529800415, 8.35955047607422, 9.50113105773926, 7.06818675994873, 
4.63532447814941, 5.81412315368652, 0.996474027633667, 8.32744407653809, 
5.03945255279541, 0.893457889556885, 2.42736291885376, 10.3842725753784, 
3.32475543022156, 8.1105375289917, 6.61336517333984, 4.06754541397095, 
3.31069254875183, 8.05746650695801, 1.24714422225952, 6.44647121429443, 
2.97141313552856, 13.3264999389648, 4.86157178878784, 6.71903085708618, 
20.3318004608154, 20.8287792205811, 10.0042209625244, 12.7859420776367, 
13.6358938217163, 15.9491415023804, 11.4823551177979, 18.6053276062012, 
16.6047229766846, 16.1496143341064, 2.9492039680481, 13.8130388259888, 
18.6300754547119, 14.464674949646, 4.92032289505005, 0.511945068836212, 
3.16324853897095, 13.3062620162964, 9.84803581237793, 1.74625515937805, 
2.54861640930176, 9.97869968414307, 11.2339553833008, 0.865878522396088, 
14.7632684707642, 21.8330593109131, 6.42118740081787, 9.51691722869873, 
13.2857227325439, 4.01672554016113, 10.9487056732178, 13.6308097839355, 
4.69979858398438, 1.83490359783173), piC_2 = c(6.62732124328613, 
5.59194660186768, 3.92186212539673, 0.962285339832306, 2.1002824306488, 
4.99801731109619, 6.4822793006897, 2.94481801986694, 5.16082000732422, 
11.2070302963257, 0.585842967033386, 4.83236265182495, 1.637331366539, 
7.65087461471558, 2.28347945213318, 7.16115474700928, 3.54162955284119, 
5.23653078079224, 2.28897953033447, 2.29887819290161, 0.752622723579407, 
0.653791189193726, 1.5378258228302, 2.15203213691711, 1.64702248573303, 
6.0682373046875, 0.22119003534317, 4.76900386810303, 0.366481363773346, 
6.11435651779175, 10.8921070098877, 7.97591733932495, 6.05282688140869, 
3.74584698677063, 5.75792741775513, 0.471727430820465, 2.75132250785828, 
1.21862363815308, 0.138835281133652, 2.98711204528809, 0.627980709075928, 
0.108154557645321, 0.995486855506897, 2.4163064956665, 0.0193456951528788, 
5.70003795623779, 5.56746625900269, 2.9861011505127, 0.344279021024704, 
0.640789806842804, 9.4457426071167, 7.05727958679199, 3.89853048324585, 
0.340702921152115, 1.17963445186615, 8.93050575256348, 14.796028137207, 
4.88054323196411, 9.28642845153809, 7.68382120132446, 2.27267980575562, 
0.916118919849396, 0.689630210399628, 0.549197673797607, 1.68408465385437, 
1.76007652282715, 3.2269868850708, 0.980833470821381, 5.00142002105713, 
3.41616177558899, 6.74930334091187, 12.0952653884888, 15.2918863296509, 
0.105648428201675, 4.59846162796021, 1.48986113071442, 5.02905178070068, 
5.07208204269409, 4.98251914978027, 4.70810985565186, 2.37468719482422, 
6.78730487823486, 6.18559217453003, 11.6090707778931, 2.91017484664917, 
3.51590204238892, 3.35987615585327, 8.74919319152832, 2.23059439659119, 
0.292922139167786, 5.41262531280518, 8.86936473846436, 8.20160961151123, 
7.33296489715576, 8.42716407775879), piC_3 = c(5.84101867675781, 
2.14856338500977, 0.774434208869934, 6.21446466445923, 2.25056719779968, 
1.9200998544693, 2.52935075759888, 0.38894659280777, 1.98762917518616, 
2.53701376914978, 6.93642854690552, 0.608367025852203, 4.7472562789917, 
1.25435817241669, 4.09390258789062, 5.41882562637329, 0.221905186772346, 
3.72868466377258, 0.763698220252991, 0.783569753170013, 8.32380294799805, 
4.482017993927, 2.38237118721008, 10.7143220901489, 10.1253957748413, 
4.51582384109497, 5.18871164321899, 1.76670265197754, 7.50785446166992, 
6.2304630279541, 8.79040622711182, 7.47595691680908, 1.57976567745209, 
1.46996772289276, 0.894773840904236, 1.30858862400055, 7.34649181365967, 
1.41060519218445, 2.03947067260742, 4.6038031578064, 4.44245910644531, 
0.236538723111153, 0.194929093122482, 0.684483885765076, 0.530747056007385, 
1.89696133136749, 1.94861626625061, 3.36041831970215, 0.0835498198866844, 
2.04665040969849, 7.02379274368286, 2.93551588058472, 5.33355855941772, 
1.59516668319702, 2.19099020957947, 2.88170146942139, 7.42911052703857, 
4.64155960083008, 2.24829292297363, 3.64715957641602, 0.363596022129059, 
1.41882479190826, 0.474381387233734, 2.24125337600708, 4.11492681503296, 
3.44695138931274, 3.08158445358276, 0.218709617853165, 2.44625425338745, 
1.71628797054291, 1.75634157657623, 4.76044988632202, 0.387977868318558, 
1.70636379718781, 1.70855867862701, 3.67641615867615, 0.744896650314331, 
1.09648311138153, 1.37377882003784, 0.200171306729317, 1.4753475189209, 
6.56762170791626, 7.72892284393311, 2.18395304679871, 0.481256455183029, 
0.37385630607605, 4.25140476226807, 6.76727914810181, 4.81376981735229, 
3.8882269859314, 2.90145373344421, 7.48540449142456, 9.90997123718262, 
4.46362543106079, 5.19004011154175), piC_4 = c(0.765519082546234, 
2.01459360122681, 1.4724348783493, 3.53503012657166, 3.23746180534363, 
2.42439723014832, 2.89345812797546, 2.43676805496216, 3.13469624519348, 
4.61154937744141, 4.51843070983887, 0.767921149730682, 5.01102733612061, 
2.94891023635864, 5.20972728729248, 1.1311411857605, 2.22004199028015, 
3.79573369026184, 0.551535904407501, 0.574182093143463, 5.87988710403442, 
5.06349992752075, 3.72144675254822, 8.49415874481201, 4.27884483337402, 
2.48057842254639, 4.45665884017944, 0.667030334472656, 6.93020153045654, 
2.26927351951599, 1.5674192905426, 3.63813829421997, 2.73822736740112, 
0.674351632595062, 1.89532685279846, 4.79139471054077, 1.34277474880219, 
0.564522683620453, 3.33897042274475, 1.42253696918488, 2.7286331653595, 
0.960368096828461, 2.00121903419495, 4.58775472640991, 2.11190366744995, 
0.29313051700592, 0.0706640183925629, 2.87113666534424, 1.36242246627808, 
3.57689785957336, 2.05132532119751, 0.340487778186798, 1.3506361246109, 
0.400035679340363, 1.65728294849396, 5.17583227157593, 6.23331356048584, 
1.60608506202698, 6.12336874008179, 0.46411395072937, 0.205161795020103, 
1.93029391765594, 2.6833176612854, 0.199026927351952, 0.0609574876725674, 
1.12770354747772, 1.49503016471863, 0.299944281578064, 0.302427768707275, 
0.745285212993622, 2.91650176048279, 4.18865776062012, 2.71514081954956, 
1.93356776237488, 1.67894613742828, 1.67655885219574, 3.09425163269043, 
2.87126135826111, 2.42724895477295, 5.48751878738403, 3.4703311920166, 
3.71456289291382, 4.29666662216187, 3.37810254096985, 3.07785415649414, 
1.90873026847839, 3.57397627830505, 0.902793109416962, 3.96058869361877, 
0.35958793759346, 2.9896719455719, 1.81924939155579, 4.22445392608643, 
2.22684979438782, 4.53710412979126)), row.names = c(NA, -95L), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), .Names = c("CellID_2p5", "Y_Coord_2p5Weighting", 
"WallReg_2p5", "piC_1", "piC_2", "piC_3", "piC_4"), vars = "WallReg_2p5", drop = TRUE, indices = list(
    0:4, 5:9, 10:14, 15:19, 20:24, 25:29, 30:34, 35:39, 40:44, 
    45:49, 50:54, 55:59, 60:64, 65:69, 70:74, 75:79, 80:84, 85:89, 
    90:94), group_sizes = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), biggest_group_size = 5L, labels = structure(list(
    WallReg_2p5 = c("African", "Amazonian", "Arctico-Siberian", 
    "Australian", "Chinese", "Eurasian", "Guineo-Congolian", 
    "Indo-Malayan", "Japanese", "Madagascan", "Mexican", "North American", 
    "Novozelandic", "Oriental", "Panamanian", "Papua-Melanesian", 
    "Saharo-Arabian", "South American", "Tibetan")), row.names = c(NA, 
-19L), class = "data.frame", vars = "WallReg_2p5", drop = TRUE, .Names = "WallReg_2p5"))

Я пытаюсь вычислить взвешенные значения всех столбцов piC_ для каждого региона. Процесс для каждого столбца (x) включает в себя 3 шага:

  1. умножить каждую строку в столбце piC_x на соответствующее значение в Y_Coord_2p5Weighting
  2. просуммировать взвешенные значения piC_x внутри каждой группы WallReg_2p5
  3. разделить полученную сумму piC_x на сумму значений Y_Coord_2p5Weighting в той же группе WallReg_2p5

Судя по тому, что я прочитал, data.table работает на больших наборах данных быстрее, чем dplyr, но я готов использовать любой из этих пакетов или даже базовый R. Я попробовал оба варианта, но с data.table получаю неправильные результаты, а скорость dplyr меня беспокоит, когда я применяю его к полному фрейму данных. Вот что я пробовал до сих пор:

dplyr

# Per-region summary (mean, min, max) of the raw piC_1 values, printed for
# reference. The result is deliberately NOT assigned back to `df`: the later
# snippets still need the full table with every piC_ column and the
# Y_Coord_2p5Weighting column, and the summary only has four columns.
df %>% tbl_df() %>%
  group_by(WallReg_2p5) %>%
  summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))

# A tibble: 19 x 4
   WallReg_2p5      meanS   minS  maxS
   <chr>            <dbl>  <dbl> <dbl>
 1 African           8.83  4.70  15.1 
 2 Amazonian        10.1   5.56  15.6 
 3 Arctico-Siberian 13.9   2.95  21.6 
 4 Australian        8.14  3.44  12.3 
 5 Chinese           9.38  3.87  14.3 
 6 Eurasian         10.5   1.86  22.0 
 7 Guineo-Congolian  7.41  0.602 12.3 
 8 Indo-Malayan     13.3   6.22  22.0 
 9 Japanese          5.37  0.996  8.33
10 Madagascan        4.41  0.893 10.4 
11 Mexican           6.03  3.31   8.11
12 North American    5.77  1.25  13.3 
13 Novozelandic     14.1   6.72  20.8 
14 Oriental         15.3  11.5   18.6 
15 Panamanian       13.2   2.95  18.6 
16 Papua-Melanesian  6.35  0.512 13.3 
17 Saharo-Arabian    5.27  0.866 11.2 
18 South American   13.2   6.42  21.8 
19 Tibetan           7.03  1.83  13.6 

# Weighted value of every piC_ column, built in three passes over the same
# set of columns: weight each value, sum the weighted values, then divide by
# the sum of the weights.
weighted <- df %>%
  # step 1: multiply by lat weight
  mutate_at(vars(starts_with("piC_")), funs(. * Y_Coord_2p5Weighting)) %>%
  # step 2: sum the weighted values
  mutate_at(vars(starts_with("piC_")), funs(sum)) %>%
  # step 3: divide weighted values by sum of weights
  mutate_at(vars(starts_with("piC_")), funs(. / sum(Y_Coord_2p5Weighting)))

# Per-region summary of the weighted piC_1 column.
weighted %>%
  tbl_df() %>%
  group_by(WallReg_2p5) %>%
  summarise(
    meanS = mean(piC_1),
    minS = min(piC_1),
    maxS = max(piC_1)
  )

# A tibble: 19 x 4
   WallReg_2p5      meanS  minS  maxS
   <chr>            <dbl> <dbl> <dbl>
 1 African           8.82  8.82  8.82
 2 Amazonian        10.1  10.1  10.1 
 3 Arctico-Siberian 14.5  14.5  14.5 
 4 Australian        8.21  8.21  8.21
 5 Chinese           9.32  9.32  9.32
 6 Eurasian          9.86  9.86  9.86
 7 Guineo-Congolian  7.41  7.41  7.41
 8 Indo-Malayan     13.4  13.4  13.4 
 9 Japanese          5.47  5.47  5.47
10 Madagascan        4.38  4.38  4.38
11 Mexican           6.10  6.10  6.10
12 North American    5.09  5.09  5.09
13 Novozelandic     14.6  14.6  14.6 
14 Oriental         15.2  15.2  15.2 
15 Panamanian       13.2  13.2  13.2 
16 Papua-Melanesian  6.36  6.36  6.36
17 Saharo-Arabian    5.22  5.22  5.22
18 South American   13.2  13.2  13.2 
19 Tibetan           7.01  7.01  7.01

Используя dplyr, я получаю правильные значения. Однако, когда я использую data.table, я получаю неправильные значения. Я основал свой код на вопросе здесь, но ясно, что я делаю что-то не так.

data.table

# Convert the table to a data.table keyed on WallReg_2p5, then print whether
# the result is a data.table and whether it has a key.
df <- df %>%
  group_by(WallReg_2p5) %>%
  as.data.table() %>%
  setkey(WallReg_2p5)
is.data.table(df)
haskey(df)
[1] TRUE
[1] TRUE

## same as above: per-region summary of piC_1 before any weighting
df %>%
  tbl_df() %>%
  group_by(WallReg_2p5) %>%
  summarise(
    meanS = mean(piC_1),
    minS = min(piC_1),
    maxS = max(piC_1)
  )

# A tibble: 19 x 4
   WallReg_2p5      meanS   minS  maxS
   <chr>            <dbl>  <dbl> <dbl>
 1 African           8.83  4.70  15.1 
 2 Amazonian        10.1   5.56  15.6 
 3 Arctico-Siberian 13.9   2.95  21.6 
 4 Australian        8.14  3.44  12.3 
 5 Chinese           9.38  3.87  14.3 
 6 Eurasian         10.5   1.86  22.0 
 7 Guineo-Congolian  7.41  0.602 12.3 
 8 Indo-Malayan     13.3   6.22  22.0 
 9 Japanese          5.37  0.996  8.33
10 Madagascan        4.41  0.893 10.4 
11 Mexican           6.03  3.31   8.11
12 North American    5.77  1.25  13.3 
13 Novozelandic     14.1   6.72  20.8 
14 Oriental         15.3  11.5   18.6 
15 Panamanian       13.2   2.95  18.6 
16 Papua-Melanesian  6.35  0.512 13.3 
17 Saharo-Arabian    5.27  0.866 11.2 
18 South American   13.2   6.42  21.8 
19 Tibetan           7.03  1.83  13.6 

# https://stackoverflow.com/q/28123098/1710632
# Apply the three weighting steps to every piC_ column in place with set().
#
# NOTE(review): every step below works on the WHOLE column. The value passed
# to set() is built from the full vectors df[[j]] and
# df[["Y_Coord_2p5Weighting"]], and both sum() calls run over all rows, so
# nothing here is computed per WallReg_2p5 group. That matches the summary
# printed after this loop, where every region shows the same number.
#
# NOTE(review): a grouped form would presumably look like the following --
# TODO confirm against the data.table documentation:
#   cols <- grep("piC_", colnames(df), value = TRUE)
#   df[, (cols) := lapply(.SD, function(x)
#          sum(x * Y_Coord_2p5Weighting) / sum(Y_Coord_2p5Weighting)),
#      by = WallReg_2p5, .SDcols = cols]
indx <- grep("piC_", colnames(df))
for (j in indx) {
  set(df, i = NULL, j = j, value = df[[j]]*df[["Y_Coord_2p5Weighting"]]) ## multiply by weights
  set(df, i = NULL, j = j, value = sum(df[[j]])) ## sum the weighted values
  set(df, i = NULL, j = j, value = df[[j]]/sum(df[["Y_Coord_2p5Weighting"]])) ## divide by sum of weights
}
## wrong values: every region ends up with the same number
df %>%
  tbl_df() %>%
  group_by(WallReg_2p5) %>%
  summarise(
    meanS = mean(piC_1),
    minS = min(piC_1),
    maxS = max(piC_1)
  )

# A tibble: 19 x 4
   WallReg_2p5      meanS  minS  maxS
   <chr>            <dbl> <dbl> <dbl>
 1 African           9.27  9.27  9.27
 2 Amazonian         9.27  9.27  9.27
 3 Arctico-Siberian  9.27  9.27  9.27
 4 Australian        9.27  9.27  9.27
 5 Chinese           9.27  9.27  9.27
 6 Eurasian          9.27  9.27  9.27
 7 Guineo-Congolian  9.27  9.27  9.27
 8 Indo-Malayan      9.27  9.27  9.27
 9 Japanese          9.27  9.27  9.27
10 Madagascan        9.27  9.27  9.27
11 Mexican           9.27  9.27  9.27
12 North American    9.27  9.27  9.27
13 Novozelandic      9.27  9.27  9.27
14 Oriental          9.27  9.27  9.27
15 Panamanian        9.27  9.27  9.27
16 Papua-Melanesian  9.27  9.27  9.27
17 Saharo-Arabian    9.27  9.27  9.27
18 South American    9.27  9.27  9.27
19 Tibetan           9.27  9.27  9.27

В справке ?set() сказано, что эта функция не может выполнять операции по группам, но я подумал, что, поскольку я уже определил свои группы, этот подход будет работать. Я никогда раньше не использовал data.table, поэтому буду очень признателен за любые рекомендации.

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...