В этом ответе также решается проблема нескольких значений
в одной строке столбца OrderID(s).
Полный код одним блоком (без промежуточного вывода) приведён в конце.
# imports
import pandas as pd
import numpy as np
# create sample dataframe
df_orig = \
pd.DataFrame({'OrderID(s)':['0001, 0007, 0002', '0008', '0009, 0005, 0003',],
'Lead Order': ['00011', '00022', '00033']})
df_orig
OrderID(s) Lead Order
0 0001, 0007, 0002 00011
1 0008 00022
2 0009, 0005, 0003 00033
-
# force df values to strings
# this makes splitting of multiple
# values in OrderID(s) easier
df_orig = df_orig.astype(str)
# series created from data within df_orig['OrderID(s)'] column
# remove spaces and split by commas
split_col = df_orig['OrderID(s)'].str.replace(' ', '').str.split(",")
print(split_col)
0 [0001, 0007, 0002]
1 [0008]
2 [0009, 0005, 0003]
Name: OrderID(s), dtype: object
-
# find length of each split_col row (how many OrderIDs in each row).
# these values will be used to duplicate rows in the
# df_orig dataframe with the numpy repeat function
repeats = split_col.str.len().values
print(repeats)
[3 1 3]
-
# flatten the per-row lists held in split_col into one 1-D array of
# order ids. its length equals the sum of repeats, i.e. the number of
# rows of the df_stack_ids dataframe built in the next step
orderid_col = np.concatenate(split_col.values)
print(orderid_col)
['0001' '0007' '0002' '0008' '0009' '0005' '0003']
-
# use pandas iloc and numpy repeat to make a dataframe in which each
# df_orig row is duplicated once per order id it holds, so every
# order id ends up next to its common Lead Order value
# (the per-row counts come from repeats above).
# np.arange gives row *positions*, which is what iloc expects;
# df_orig.index.values holds *labels* and only matches positions
# when the index is the default 0..n-1 range
df_stack_ids = df_orig.iloc[np.repeat(np.arange(len(df_orig)), repeats)]. \
reset_index(drop=True)
df_stack_ids
OrderID(s) Lead Order
0 0001, 0007, 0002 00011
1 0001, 0007, 0002 00011
2 0001, 0007, 0002 00011
3 0008 00022
4 0009, 0005, 0003 00033
5 0009, 0005, 0003 00033
6 0009, 0005, 0003 00033
-
# add the orderid_col to dataframe
df_stack_ids['OrderID'] = orderid_col
df_stack_ids
OrderID(s) Lead Order OrderID
0 0001, 0007, 0002 00011 0001
1 0001, 0007, 0002 00011 0007
2 0001, 0007, 0002 00011 0002
3 0008 00022 0008
4 0009, 0005, 0003 00033 0009
5 0009, 0005, 0003 00033 0005
6 0009, 0005, 0003 00033 0003
-
# get rid of the original OrderID(s) column
df_stack_ids = df_stack_ids[['OrderID', 'Lead Order']]
# this may be enough to answer the question
# because each order id has a corresponding
# lead order
df_stack_ids
OrderID Lead Order
0 0001 00011
1 0007 00011
2 0002 00011
3 0008 00022
4 0009 00033
5 0005 00033
6 0003 00033
-
# to find matches for a specific list of order ids,
# continue...
# sort the OrderID column for easy reference and
# reset index
df_stack_ids = df_stack_ids.sort_values(by=['OrderID'])
df_stack_ids.index = range(len(df_stack_ids))
# create sample dataframe with a few order ids for lookup
df_find_lead = pd.DataFrame({'OrderID': ['0001', '0002', '0005']})
# force to string type for matching with df_stack_ids values
# when merging
df_find_lead = df_find_lead.astype(str)
df_find_lead
OrderID
0 0001
1 0002
2 0005
-
# merge values from df_stack_ids['Lead Order'] column
df_found_lead = pd.merge(df_find_lead, df_stack_ids,
on=['OrderID'], how='inner')
df_found_lead
OrderID Lead Order
0 0001 00011
1 0002 00011
2 0005 00033
-
# if all original order data is formatted as numbers,
# convert result dataframe back to integers
df_found_lead.astype(int)
OrderID Lead Order
0 1 11
1 2 11
2 5 33
Полный код:
import pandas as pd
import numpy as np


def stack_order_ids(df, ids_col='OrderID(s)', id_col='OrderID'):
    """Return a dataframe with one row per individual order id.

    Each cell of ``ids_col`` may hold several comma-separated ids. Every
    input row is repeated once per id it contains, and the single id is
    stored in a new ``id_col`` column placed first. ``ids_col`` itself is
    dropped; all other columns are kept. All values are converted to
    strings, and the result has a fresh 0..n-1 index.

    Rows are selected by position, so the index of ``df`` may be anything
    (filtered, shuffled, non-integer), not just the default range.
    """
    df = df.astype(str)
    # literal (non-regex) removal of spaces, then split on commas
    split_col = df[ids_col].str.replace(' ', '', regex=False).str.split(',')
    repeats = split_col.str.len().values
    # iloc needs positions, not index labels
    row_positions = np.repeat(np.arange(len(df)), repeats)
    stacked = df.iloc[row_positions].reset_index(drop=True)
    stacked[id_col] = np.concatenate(split_col.values)
    other_cols = [col for col in df.columns if col not in (ids_col, id_col)]
    return stacked[[id_col] + other_cols]


# sample data: several order ids may share one lead order
df_orig = pd.DataFrame({
    'OrderID(s)': ['0001, 0007, 0002', '0008', '0009, 0005, 0003'],
    'Lead Order': ['00011', '00022', '00033'],
})

# one row per order id, sorted by id for easy reference
df_stack_ids = stack_order_ids(df_orig)
df_stack_ids = df_stack_ids.sort_values(by=['OrderID']).reset_index(drop=True)

# order ids to look up; forced to strings so they match df_stack_ids
df_find_lead = pd.DataFrame({'OrderID': ['0001', '0002', '0005']})
df_find_lead = df_find_lead.astype(str)

# attach the lead order to every requested order id
df_found_lead = pd.merge(df_find_lead, df_stack_ids, on=['OrderID'], how='inner')

# if all original order data is numeric, convert the result to integers;
# astype returns a new frame, so keep it under its own name
df_found_lead_int = df_found_lead.astype(int)
df_found_lead_int