Если нужно сложить вместе столбцы, часть значений в которых равна NaN,
потребуется дополнительная логика:
def add_cols_w_nan(df, col_list, space_char, new_col_name):
    """Concatenate several columns row-wise as strings, skipping missing values.

    Every non-missing value is converted to text with ``astype(str)`` and the
    pieces of each row are joined with ``space_char``. Missing values
    (NaN / None) contribute neither text nor a separator, so no leading,
    trailing or doubled separators appear because of them.

    Examples:
        'Mr.'  + NaN  + 'Smith' becomes 'Mr. Smith'
        'Mrs.' + 'J.' + 'Smith' becomes 'Mrs. J. Smith'
        NaN    + 'J.' + 'Smith' becomes 'J. Smith'

    Args:
        df: pd.DataFrame
            DataFrame whose columns are combined. It is modified in place:
            the column ``new_col_name`` is added (or overwritten).
        col_list: ORDERED list of column names, eg. ['first_name',
            'middle_name', 'last_name']. The columns are joined in this order.
        space_char: str
            Separator inserted between consecutive non-missing values. It is
            used literally (never as a regular expression) and occurrences of
            it inside the values themselves are left untouched.
        new_col_name: str
            Name of the column that receives the joined strings.

    Returns:
        The same ``df`` object, with ``new_col_name`` holding the joined
        string for each row, or NaN when the joined string is empty (for
        instance when every listed column is missing in that row).
    """
    selected = df[col_list]

    # Stringify every cell up front; the mask below decides which cells are
    # actually used, so the textual form of a missing value never leaks out.
    text_rows = selected.astype(str).to_numpy().tolist()
    present_rows = selected.notna().to_numpy().tolist()

    joined = []
    for texts, present in zip(text_rows, present_rows):
        pieces = [text for text, keep in zip(texts, present) if keep]
        # An empty result is reported as NaN rather than as ''.
        joined.append(space_char.join(pieces) or np.nan)

    # Positional assignment keeps this correct even for non-unique indexes.
    df[new_col_name] = joined
    return df
Пример данных:
import pandas as pd
import numpy as np
# Sample frame mixing strings, a number, None and NaN so that leading, middle,
# trailing and fully-missing rows are all represented.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = pd.DataFrame({
    'Address 1': ['AAA', 'ABC', np.nan, np.nan, np.nan],
    'Address 2': ['foo', 'bar', 'baz', None, np.nan],
    'Address 3': [np.nan, np.nan, 17, np.nan, np.nan],
    'city': [np.nan, 'here', 'there', 'anywhere', np.nan],
    'state': ['NY', 'TX', 'WA', 'MI', np.nan],
})
#  Address 1 Address 2  Address 3      city state
#0       AAA       foo        NaN       NaN    NY
#1       ABC       bar        NaN      here    TX
#2       NaN       baz       17.0     there    WA
#3       NaN      None        NaN  anywhere    MI
#4       NaN       NaN        NaN       NaN   NaN

# Join the address parts with ', ' into a new 'full_address' column.
df = add_cols_w_nan(
    df,
    col_list=['Address 1', 'Address 2', 'Address 3', 'city', 'state'],
    space_char=', ',
    new_col_name='full_address',
)

df.full_address.tolist()
#['AAA, foo, NY',
# 'ABC, bar, here, TX',
# 'baz, 17.0, there, WA',
# 'anywhere, MI',
# nan]