Решение, максимально близкое к вашему исходному подходу (с использованием map/reduce)
from functools import reduce
import csv
def mapper(f):
    """Select the customer, product and cost fields from one input line.

    Args:
        f: One raw line of text holding five comma-separated fields, in the
            order customer, transaction, date, product, cost.

    Returns:
        A ``(customer, product, cost)`` tuple of strings, or ``None`` when
        ``f`` is empty.

    Raises:
        ValueError: If the line does not split into exactly five fields.
    """
    # A guard clause rather than a loop: each line is processed exactly once.
    if not f:
        return None
    # Only trailing whitespace (the newline) is removed; the individual
    # fields are returned exactly as they appear between the commas.
    customer, _, _, product, cost = f.rstrip().split(',')
    return customer, product, cost
def reducer(acc, v):
    """Fold one mapped record into the nested product/customer mapping.

    Intended as the function argument of ``functools.reduce``.

    Args:
        acc: Accumulator dict of the form
            ``{product: {customer: [cost, ...]}}``. It is mutated in place.
        v: A ``(customer, product, cost)`` triple.

    Returns:
        The same ``acc`` object, with ``cost`` appended to the list stored
        under ``acc[product][customer]``.
    """
    customer, product, cost = v
    # setdefault creates the missing inner dict / list on first sight of a
    # product or customer, then hands back the stored container.
    acc.setdefault(product, {}).setdefault(customer, []).append(cost)
    return acc
def create_row(results, product):
    """Build the statistics row for a single product.

    Args:
        results: Nested dict of the form
            ``{product: {customer: [cost, ...]}}`` where each cost is a
            value accepted by ``float()``.
        product: Key of the product in ``results`` to summarise.

    Returns:
        A dict with the keys ``"Product"``, ``"CustomerCount"`` (number of
        distinct customers recorded for the product) and ``"TotalRevenue"``
        (sum of all costs, rounded to two decimal places).

    Raises:
        KeyError: If ``product`` is not present in ``results``.
    """
    customers = results[product]
    total = sum(float(cost) for costs in customers.values() for cost in costs)
    return {
        "Product": product,
        "CustomerCount": len(customers),
        # Round so binary floating-point noise (e.g. 0.30000000000000004)
        # does not end up in the report.
        "TotalRevenue": round(total, 2),
    }
# Read the input file and aggregate it into a nested dictionary.
with open('test.csv') as ifile:
    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    next(ifile, None)
    # Blank lines are filtered out because mapper cannot unpack them.
    data = map(mapper, (line for line in ifile if line.strip()))
    # Generate results as a nested dictionary. map() is lazy, so the
    # reduction has to run while the input file is still open.
    results = reduce(reducer, data, {})

# List of products in alphabetical order
products = sorted(results)

# Show results as CSV File
fieldnames = ["Product", "CustomerCount", "TotalRevenue"]
# newline='' lets the csv module control line endings itself; without it,
# extra blank rows appear on platforms that translate '\n' to '\r\n'.
with open("results.csv", "w", newline='') as ofile:
    writer = csv.DictWriter(ofile, fieldnames=fieldnames)
    writer.writeheader()
    rows = (create_row(results, product) for product in products)
    writer.writerows(rows)
Тест
Входной файл
Customer, Transaction, Date, Product, Cost
X,1,02/02,A,10.99
X,1,02/02,B,4.99
X,2,04/02,A,9.99
Y,4,10/02,C,0.99
Y,5,03/03,D,13.99
Z,7,03/04,D,13.99
Z,9,07/05,B,5.99
Z,9,07/05,A,11.99
Выходной файл
Product, CustomerCount, TotalRevenue
A,2,32.97
B,2,10.98
C,1,0.99
D,2,27.98
Примечание: промежуточный словарь результатов (results) получился таким:
{'A': {'X': ['10.99', '9.99'], 'Z': ['11.99']},
'B': {'X': ['4.99'], 'Z': ['5.99']},
'C': {'Y': ['0.99']},
'D': {'Y': ['13.99'], 'Z': ['13.99']}}
Альтернативный подход с использованием groupby из модуля itertools
from itertools import groupby
def keyfunc(row):
    """Return the product column of a parsed row (sort and group key)."""
    return row[3]


with open('test.csv') as ifile, open('results.csv', 'w') as ofile:
    # Skip input file header; the default keeps an empty file from raising
    # StopIteration.
    next(ifile, None)

    # Input data as a list of lists. Blank lines are dropped because they
    # would have no product column to sort or group on.
    data = [line.rstrip().split(',') for line in ifile if line.strip()]

    # groupby only merges adjacent items, so the rows must first be sorted
    # by the same key that is used for grouping.
    data.sort(key=keyfunc)

    # Write header
    ofile.write('Product, CustomerCount, TotalRevenue' + '\n')

    # Process by grouping on the product field
    for product, g in groupby(data, keyfunc):
        # Materialise the group: it is iterated twice below, and a groupby
        # group can only be consumed once.
        g = list(g)
        customers = {x[0] for x in g}  # distinct customers in this group
        total_revenue = sum(float(x[4]) for x in g)
        ofile.write(f'{product},{len(customers)},{total_revenue:.2f}\n')