Я предполагаю, что 'desc' не содержит нескольких значений 'mfr'.
Solution1:
import itertools

import numpy as np
import pandas as pd


def padding(lst, mx):
    """Pad *lst* in place with NaN until it holds *mx* items.

    Lists that already hold *mx* or more items are left untouched.
    The same list object is returned so the function can be used with
    ``Series.apply``.
    """
    missing = max(mx - len(lst), 0)
    lst.extend(itertools.repeat(np.nan, missing))
    return lst


def label_groups(pairs):
    """Return one group number per pair; pairs sharing a value share a number.

    Pairs are linked transitively: ``[A, B]`` and ``[B, C]`` end up in the
    same group.  Groups are numbered from 0 in the sorted order of their
    sorted member tuples, so the numbering does not depend on set/hash
    ordering.  Values must be hashable and mutually comparable.

    The merging step is adapted from the solution by @mimomu
    (/3066200/obedinit-spiski-kotorye-imeyt-obschie-elementy).
    """
    components = [set(pair) for pair in pairs]
    for value in set(itertools.chain.from_iterable(pairs)):
        # Fuse every component that contains `value` into a single one;
        # afterwards exactly one component holds `value`.
        touching = [comp for comp in components if value in comp]
        components = [comp for comp in components if value not in comp]
        components.append(set(itertools.chain.from_iterable(touching)))
    ordered = sorted(tuple(sorted(comp)) for comp in components)
    group_of = {
        value: number
        for number, members in enumerate(ordered)
        for value in members
    }
    # Both members of a pair live in the same component, so either one
    # identifies the group of the row the pair came from.
    return [group_of[pair[0]] for pair in pairs]


# original dataframe
df = pd.DataFrame({
    'id': range(5),
    'desc': ('This is text', 'John Doe ABC', 'John Doe', 'Something JKL', 'Something more'),
    'mfr': ('ABC', 'DEF', 'DEF', 'GHI', 'JKL'),
})
# untouched copy, used for the final merge
ori = df.copy()
# max number of words used in any 'desc'
max_len = max(df.desc.apply(lambda x: len(x.split(' '))))
# unique 'mfr' values
uniq_mfr = df.mfr.unique().tolist()

# split each description into words, NaN-padded to a common length
df['desc'] = df.desc.apply(lambda x: x.split(' ')).apply(padding, args=(max_len,))

# each word position makes 1 column; words that are not 'mfr' values become NaN
word_cols = []
for i in range(max_len):
    newcol = 'desc{}'.format(i)
    word_cols.append(newcol)
    df[newcol] = df.desc.apply(lambda x, i=i: x[i])
    df.loc[~df[newcol].isin(uniq_mfr), newcol] = np.nan

# merge the created columns into 1 by keeping the 'mfr' words only
# (joined in plain Python instead of summing object columns)
df['desc'] = [
    ''.join(word for word in row if isinstance(word, str)) or np.nan
    for row in df[word_cols].itertuples(index=False, name=None)
]

# create [ABC, ABC] type of column by pairing the two columns (desc & mfr)
df = df[['id', 'desc', 'mfr']].copy()
# assign the result back: an in-place fillna on a column selection is not
# guaranteed to reach the parent frame
df['desc'] = df['desc'].fillna(df['mfr'])
df['desc'] = [[x, y] for x, y in zip(df.desc.tolist(), df.mfr.tolist())]
df = df[['id', 'desc']].copy()

# BELOW IS COMMON WITH SOLUTION2
# label every row with the group of its [desc, mfr] pair; the label is
# looked up per row, so no sorting/positional alignment is required
df['group'] = label_groups(df.desc.tolist())

# merge with the original
df = df.drop('desc', axis=1).merge(ori, on='id', how='left')
df = df[['id', 'desc', 'mfr', 'group']]
Solution2 (вторая половина общая с Solution1):
import itertools

import numpy as np
import pandas as pd


def find_mfr(words, mfr_values):
    """Return the first item of *words* found in *mfr_values*, else NaN.

    The scan stops at the first hit, so exactly one value is produced per
    description no matter where the manufacturer word is positioned.
    """
    return next((word for word in words if word in mfr_values), np.nan)


def label_groups(pairs):
    """Return one group number per pair; pairs sharing a value share a number.

    Pairs are linked transitively: ``[A, B]`` and ``[B, C]`` end up in the
    same group.  Groups are numbered from 0 in the sorted order of their
    sorted member tuples, so the numbering does not depend on set/hash
    ordering.  Values must be hashable and mutually comparable.

    The merging step is adapted from the solution by @mimomu
    (/3066200/obedinit-spiski-kotorye-imeyt-obschie-elementy).
    """
    components = [set(pair) for pair in pairs]
    for value in set(itertools.chain.from_iterable(pairs)):
        # Fuse every component that contains `value` into a single one;
        # afterwards exactly one component holds `value`.
        touching = [comp for comp in components if value in comp]
        components = [comp for comp in components if value not in comp]
        components.append(set(itertools.chain.from_iterable(touching)))
    ordered = sorted(tuple(sorted(comp)) for comp in components)
    group_of = {
        value: number
        for number, members in enumerate(ordered)
        for value in members
    }
    # Both members of a pair live in the same component, so either one
    # identifies the group of the row the pair came from.
    return [group_of[pair[0]] for pair in pairs]


# original dataframe
df = pd.DataFrame({
    'id': range(5),
    'desc': ('This is text', 'John Doe ABC', 'John Doe', 'Something JKL', 'Something more'),
    'mfr': ('ABC', 'DEF', 'DEF', 'GHI', 'JKL'),
})
# untouched copy, used for the final merge
ori = df.copy()
# unique 'mfr' values
uniq_mfr = df.mfr.unique().tolist()
# make desc entries as lists of words
df['desc'] = df.desc.apply(lambda x: x.split(' '))
# pick up the mfr value mentioned in each desc, otherwise NaN (one per row)
mfr_in_descs = [find_mfr(ds, uniq_mfr) for ds in df.desc]

# create column whose element is like [ABC, ABC]
df['desc'] = mfr_in_descs
# assign the result back: an in-place fillna on a column selection is not
# guaranteed to reach the parent frame
df['desc'] = df['desc'].fillna(df['mfr'])
df['desc'] = [[x, y] for x, y in zip(df.desc.tolist(), df.mfr.tolist())]
df = df[['id', 'desc']].copy()

# BELOW IS COMMON WITH SOLUTION1
# label every row with the group of its [desc, mfr] pair; the label is
# looked up per row, so no sorting/positional alignment is required
df['group'] = label_groups(df.desc.tolist())

# merge with the original
df = df.drop('desc', axis=1).merge(ori, on='id', how='left')
df = df[['id', 'desc', 'mfr', 'group']]
Оба приведённых выше решения дают один и тот же результат df:
id desc mfr group
0 0 This is text ABC 0
1 1 John Doe ABC DEF 0
2 2 John Doe DEF 0
3 3 Something JKL GHI 1
4 4 Something more JKL 1