В Python:
import pandas as pd
# Creating dataframe for Table A
tableA = [(100, 'chocolate, sprinkles'), (101, 'chocolate, filled'), (102, 'glazed')]
labels = ['product', 'tags']
df_A = pd.DataFrame.from_records(tableA, columns=labels)
# Creating dataframe for Table B
tableB = [('A', 100), ('A', 101), ('B', 101), ('C', 100), ('C', 102), ('B', 101), ('A', 100), ('C', 102)]
labels = ['customer', 'product']
df_B = pd.DataFrame.from_records(tableB, columns=labels)
new_df = pd.merge(df_A, df_B, how='inner', on='product')
new_df = (new_df.set_index(new_df.columns.drop('tags', 1)
.tolist()).tags.str.split(', ', expand=True).stack().reset_index()
.rename(columns={0: 'tags'}).loc[:, new_df.columns])
final_df = new_df.pivot_table(values='tags', index=['customer'], columns=['tags'],
aggfunc='size')
final_df.fillna(0, inplace=True)
final_df = final_df.astype(int)
print(final_df)
Вывод:
tags chocolate filled glazed sprinkles
customer
A 3 1 0 2
B 2 2 0 0
C 1 0 2 1
Использование R:
library(tidyr)
library(dplyr)
library(reshape2)
library(data.table) ## or library(reshape2)
# Creating the tables
# Table A: one row per product; all of its tags are packed into a single
# comma-separated string.
tableA <- data.frame(
  "product" = c(100, 101, 102),
  "tags" = c("chocolate, sprinkles", "chocolate, filled", "glazed")
)

# One row per (product, tag). The separator is given explicitly: the
# default of separate_rows() splits on any run of non-alphanumeric
# characters, which would also break a multi-word tag apart.
newA <- separate_rows(tableA, "tags", sep = ", ")

# Table B: one row per purchase (customer, product). Repeated rows are
# repeated purchases, so each of them has to be counted.
tableB <- data.frame(
  "customer" = c("A", "A", "B", "C", "C", "B", "A", "C"),
  "product" = c(100, 101, 101, 100, 102, 101, 100, 102)
)

# Attach every tag of the purchased product to each purchase.
joinData <- merge(newA, tableB, by = "product")

# Count purchases per customer and tag. The aggregation is stated
# explicitly instead of relying on dcast() falling back to length() when
# fun.aggregate is missing; customer/tag pairs that never occur become 0.
final_df <- dcast(
  as.data.table(joinData),
  customer ~ tags,
  fun.aggregate = length,
  value.var = "product"
)
final_df
Вывод:
> final_df
customer chocolate filled glazed sprinkles
1: A 3 1 0 2
2: B 2 2 0 0
3: C 1 0 2 1