Я пытаюсь подвести итог по определенному столбцу, но у меня возникают проблемы с тем, что он не возвращает разные значения.
Ниже я представил, как я читаю данные и код, который я использую:
library(data.table, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
# Set path of major source folder for raw transaction data
in_directory <- "C:/Users/NAME/Documents/Raw Data/"
# List names of sub-folders (currently grouped by first two characters of CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ", "IA-IL", "IM-KZ", "LA-MI", "MJ-MS",
"MT-NV", "NW-OH", "OI-PZ", "QA-TN", "TO-UZ",
"VA-WA", "WB-ZZ")
# Set location for output
out_directory <- "C:/Users/NAME/Documents/YTD Master/"
out_filename <- "TEST2.csv"
# Set beginning and end of date range to be collected - year-month-day format
date_range <- interval(as.Date("2018-01-01"), as.Date("2018-05-31"))
# Enable or disable filtering of raw files to only grab items bought within certain months to save space.
# If false, all files will be scanned for unique items, which will take longer and be a larger file.
date_filter <- TRUE
## CODE ##
starttime <- Sys.time()
mastertable <- NULL
for (j in 1:length(in_subfolders)) {
subfolder <- in_subfolders[j]
sub_directory <- paste0(in_directory, subfolder, "/")
in_filenames <- dir(sub_directory, pattern =".txt")
for (i in 1:length(in_filenames)) {
# Default value provided for when fast filtering is disabled.
read_this_file <- TRUE
# To fast filter the data, we choose to include or exclude an entire file based on the date of its first line.
# WARNING: This is only a valid method if filtering by entire months, since that is the amount of data housed in each file.
if (date_filter) {
temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses=c(CUSTOMER_TIER = "character"),
na.strings = "", nrows = 1)
temptable[, INVOICE_DT := as.Date(INVOICE_DT)]
# If date matches, set read flag to TRUE. If date does not match, set read flag to FALSE.
read_this_file <- temptable[, INVOICE_DT] %within% date_range
if (read_this_file) {
print(paste0("Reading in ", in_filenames[i]))
temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses=c(CUSTOMER_TIER = "character"),
na.strings = "")
temptable <- temptable[, lapply(.SD, sum), by = INV_ITEM_ID,
.SDcols = c("Ext Sale")]
# Combine into full list
mastertable <- rbindlist(list(mastertable, temptable), use.names = TRUE)
# Release unneeded memory
# Save Final table
print("Saving master table")
fwrite(mastertable, paste0(out_directory, out_filename))
Код ниже, который включен выше, является основным фокусом, который я считаю
temptable <- temptable[, lapply(.SD, sum), by = INV_ITEM_ID,
.SDcols = c("Ext Sale")]
После этого я ожидаю, что все продажи будут объединены по INV_ITEM_ID.
Проблема в том, что он дает мне около 5 миллионов строк с всего лишь 900 000 различных значений.
Колонка продаж не суммирует, как я хочу.
Ниже приведено подмножество данных, которые я использую.
dput (головка (DataTable, 30))
structure(list(INVOICE_DT = c("2016-01-29", "2016-01-29", "2016-01-29",
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29",
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29",
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29",
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29",
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29",
"2016-01-29", "2016-01-29"), BRANCH_CODE = c("AK001", "AK001",
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001",
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001",
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001",
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001"
), INVOICE_NO = c("AK0015522", "AK0015529", "AK0015515", "AK0015529",
"AK0015514", "AK0015513", "AK0015519", "AK0015519", "AK0015520",
"AK0015520", "AK0015520", "AK0015522", "AK0015520", "AK0015520",
"AK0015520", "AK0015518", "AK0015525", "AK0015520", "AK0015519",
"AK0015520", "AK0015525", "AK0015518", "AK0015529", "AK0015519",
"AK0015525", "AK0015519", "AK0015519", "AK0015522", "AK0015522",
"AK0015529"), INV_SEQ_NO = c(6L, 6L, 3L, 3L, 3L, 3L, 148L, 149L,
91L, 45L, 162L, 34L, 125L, 39L, 159L, 56L, 64L, 127L, 160L, 93L,
45L, 34L, 5L, 138L, 46L, 38L, 161L, 35L, 28L, 4L), INV_ITEM_ID = c("11239190",
"11239190", "55692", "400050-131280", "0700535", "0800758", "69970",
"0445050", "0819092", "0115160", "0115160", "0703316", "55692",
"0115157", "0115157", "0115157", "440704-131280", "55618", "0703318",
"0205428", "0702119", "0115157", "465278", "465278", "0702121",
"15166", "0703315", "0703315", "0715718", "0243945"), ITEM_DESCR = c("Au-Ve-Co# 20898",
"Au-Ve-Co# 20898", "MM HSS T TAP 16X1.5", "3/16\"\"\"\"H05703-100R Hs",
"18/2 BLK SOOW", "3x1x1/4 180G FW", "BF SPEC SER HV1-6C", "1/4MP External Seat",
"1-1/2xNHConditonDisc", "HCS 7/16-14x1 3/4 YZ8", "HCS 7/16-14x1 3/4 YZ8",
"25AMP MINI AUTO FUSE", "MM HSS T TAP 16X1.5", "HCS 7/16-14x1.25 YZ8",
"HCS 7/16-14x1.25 YZ8", "HCS 7/16-14x1.25 YZ8", "BPOLY MELBW 1/4TX1/8",
"MM HSS P TAP 5 X 0.8", "MAXI-BLADE 20 AMP", "1-1/2\"\"\"\" T3 VF SURF",
"16ga Red Wire/PPF", "HCS 7/16-14x1.25 YZ8", "80I 45 ELBOW 1/4 BLK",
"80I 45 ELBOW 1/4 BLK", "16ga Blk Wire/PPF", "HCS 7/16-14x3.25 YZ8",
"20AMP MINI AUTO FUSE", "20AMP MINI AUTO FUSE", "3/0-3/8\"\"\"\"HvyWalCprLug",
"97053 Pink Marker"), STD_ITEM = c("Y", "Y", "Y", "Y", "Y", "Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"), PRIVATE_LABEL = c("N",
"N", "Y", "N", "N", "N", "N", "N", "N", "N", "N", "N", "Y", "N",
"N", "N", "N", "Y", "N", "N", "N", "N", "Y", "Y", "N", "N", "N",
"N", "N", "N"), CATEGORY_PATH1 = c("08.5-Fleet & Automotive",
"08.5-Fleet & Automotive", "07-Cutting Tools & Metalworking",
"08-Hydraulics & Pneumatics", "09-Electrical", "05-Abrasives",
"08-Hydraulics & Pneumatics", "07.3-Plumbing", "05-Abrasives",
"01-Fasteners", "01-Fasteners", "08.5-Fleet & Automotive", "07-Cutting Tools & Metalworking",
"01-Fasteners", "01-Fasteners", "01-Fasteners", "08-Hydraulics & Pneumatics",
"07-Cutting Tools & Metalworking", "08.5-Fleet & Automotive",
"05-Abrasives", "08.5-Fleet & Automotive", "01-Fasteners", "07.3-Plumbing",
"07.3-Plumbing", "08.5-Fleet & Automotive", "01-Fasteners", "08.5-Fleet & Automotive",
"08.5-Fleet & Automotive", "08.5-Fleet & Automotive", "11-Chemicals & Paints"
), CATEGORY_PATH2 = c("03-Batteries & Accessories", "03-Batteries & Accessories",
"04-Threading and Tapping", "02-Hose and Hose Reels", "05-Electrical Wire and Accessories",
"03-Coated and Non-Woven Abrasives", "07-Valves & Accessories",
"04-Valves & Valve Accessories", "03-Coated and Non-Woven Abrasives",
"01-Bolts", "01-Bolts", "02-Fleet Electrical Products", "04-Threading and Tapping",
"01-Bolts", "01-Bolts", "01-Bolts", "01-Fittings", "04-Threading and Tapping",
"02-Fleet Electrical Products", "03-Coated and Non-Woven Abrasives",
"02-Fleet Electrical Products", "01-Bolts", "01-Pipe Fittings",
"01-Pipe Fittings", "02-Fleet Electrical Products", "01-Bolts",
"02-Fleet Electrical Products", "02-Fleet Electrical Products",
"02-Fleet Electrical Products", "11-Paints, Marking, and Accessories"
), CUST_ID = c("AK0010001", "AK0010001", "AK0010001", "AK0010001",
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001",
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001",
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001",
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001",
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001",
"AK0010001"), CUSTOMER_TIER = c("Tier 3", "Tier 3", "Tier 3",
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3",
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3",
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3",
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3"),
IS_VENDING = c("No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No"), SALE_PRICE = c(10.53, 10.53, 23.0175,
2.0251, 0.4755, 6.4725, 15.8235, 1.468, 1.0275, 0.3425, 0.3425,
0.3726, 23.0175, 0.2513, 0.2513, 0.2513, 2.7412, 5.7675,
2.226, 1.5579, 0.1418, 0.2513, 12.0835, 12.0835, 0.1297,
0.625, 0.3726, 0.3726, 3.514, 4.1112), TOTAL_COST = c(6.877449,
6.877449, 9.399562, 0.924873, 0.245128, 2.953287, 6.227787,
0.703494, 0.52035, 0.083454, 0.083454, 0.134285, 9.399562,
0.062686, 0.062686, 0.062686, 1.3493, 2.386274, 0.77503,
0.524996, 0.051595, 0.062686, 4.47782, 4.47782, 0.051628,
0.164998, 0.125893, 0.125893, 2.183983, 1.469989), POS_COST = c(6.877449,
6.877449, 9.399562, 0.924873, 0.245128, 2.953287, 6.227787,
0.703494, 0.52035, 0.083454, 0.083454, 0.134285, 9.399562,
0.062686, 0.062686, 0.062686, 1.3493, 2.386274, 0.77503,
0.524996, 0.051595, 0.062686, 4.47782, 4.47782, 0.051628,
0.164998, 0.125893, 0.125893, 2.183983, 1.469989), CE100 = c(6.55,
6.55, 9.25, 0.95, 0.23785, 2.81, 5.93127, 1.17, 0.508, 0.074946,
0.074946, 0.12, 9.25, 0.058626, 0.058626, 0.058626, 1.31,
2.35, 0.61, 0.5, 0.05172, 0.058626, 4.2844, 4.2844, 0.05172,
0.12255, 0.12, 0.12, 2.08, 1.43), CE110 = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), CE120 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), CE200 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), CORP_PRICE = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), QTY_SOLD = c(15L,
10L, 6L, 50L, 250L, 10L, 10L, 20L, 50L, 25L, 25L, 50L, 6L,
25L, 25L, 25L, 10L, 6L, 10L, 30L, 100L, 25L, 15L, 10L, 100L,
20L, 50L, 50L, 4L, 36L), PACKSLIP_WHSL = c(NA, NA, 30.69,
2.63, 0.7315, 8.63, 20.55, 1.96, 1.37, 0.685, 0.685, 0.5323,
30.69, 0.5025, 0.5025, 0.5025, 3.56, 7.69, 3.18, 1.82, 0.2145,
0.5025, 18.59, 18.59, 0.1961, 1.25, 0.5323, 0.5323, 5.02,
5.71), PRICING_GROUP = c("PGB", "PGB", "PGB", "PGB", "PGB",
"PGB", "PGG", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB",
"PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB",
"PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB"), PGG_MIN_PRICE = c(257.9043375,
171.936225, 184.14, 131.5, 175.56, 86.3, 154.125, 39.2, 68.5,
15.07, 15.07, 26.615, 184.14, 11.055, 11.055, 11.055, 35.6,
46.14, 31.8, 54.6, 21.45, 11.055, 253.7535, 185.9, 19.61,
22, 26.615, 26.615, 20.08, 205.56), PGY_MIN_PRICE = c(206.32347,
137.54898, 171.2502, 128.87, 170.07375, 80.259, 147.96, 36.456,
63.705, 13.3575, 13.3575, 24.75195, 171.2502, 9.79875, 9.79875,
9.79875, 33.108, 45.2172, 29.574, 50.778, 19.9485, 9.79875,
245.388, 163.592, 18.2373, 19.5, 24.75195, 24.75195, 18.6744,
191.1708), PGR_MIN_PRICE = c(171.936225, 114.62415, 162.0432,
124.925, 166.41625, 75.944, 141.795, 35.672, 60.28, 9.9325,
9.9325, 24.21965, 162.0432, 7.28625, 7.28625, 7.28625, 32.396,
43.833, 28.938, 48.048, 19.5195, 7.28625, 239.811, 159.874,
17.8451, 14.5, 24.21965, 24.21965, 18.2728, 180.8928), `Ext Sale` = c(157.95,
105.3, 138.105, 101.255, 118.875, 64.725, 158.235, 29.36,
51.375, 8.5625, 8.5625, 18.63, 138.105, 6.2825, 6.2825, 6.2825,
27.412, 34.605, 22.26, 46.737, 14.18, 6.2825, 181.2525, 120.835,
12.97, 12.5, 18.63, 18.63, 14.056, 148.0032), `Ext Total Cost` = c(103.161735,
68.77449, 56.397372, 46.24365, 61.282, 29.53287, 62.27787,
14.06988, 26.0175, 2.08635, 2.08635, 6.71425, 56.397372,
1.56715, 1.56715, 1.56715, 13.493, 14.317644, 7.7503, 15.74988,
5.1595, 1.56715, 67.1673, 44.7782, 5.1628, 3.29996, 6.29465,
6.29465, 8.735932, 52.919604), cust_part = c("AK001000111239190",
"AK001000111239190", "AK001000155692", "AK0010001400050-131280",
"AK00100010700535", "AK00100010800758", "AK001000169970",
"AK00100010445050", "AK00100010819092", "AK00100010115160",
"AK00100010115160", "AK00100010703316", "AK001000155692",
"AK00100010115157", "AK00100010115157", "AK00100010115157",
"AK0010001440704-131280", "AK001000155618", "AK00100010703318",
"AK00100010205428", "AK00100010702119", "AK00100010115157",
"AK0010001465278", "AK0010001465278", "AK00100010702121",
"AK001000115166", "AK00100010703315", "AK00100010703315",
"AK00100010715718", "AK00100010243945")), .Names = c("INVOICE_DT",
"POS_COST", "CE100", "CE110", "CE120", "CE200", "CORP_PRICE",
"PGY_MIN_PRICE", "PGR_MIN_PRICE", "Ext Sale", "Ext Total Cost",
"cust_part"), .internal.selfref = <pointer: (nil)>, row.names = c(NA,
30L), class = c("data.table", "data.frame"))