Это более чистый, но все еще медленный код для ~ 500К строк на 25 столбцов. Это все еще занимает часы. Если у кого-то есть быстрый ответ, пожалуйста, поделитесь.
df_6 =[[]]
def occ_code_growths(df, df_6):
df_occ_unique = df.occ_code.unique()
# print(df_occ_unique)
df_st_unique = df.st.unique()
# print(df_st_unique)
df_year_unique = df.year.unique()
# print(df_yr_unique)
df_6 = pd.DataFrame({ 'idx_row': [], 'tot_growth': [], 'median_growth': [], 'code': [], 'st': [], 'yr': []})
# for i in range(len(df)):
# print('i',i)
for code in range(len(df_occ_unique)):
# print('code',code)
for st in range(len(df_st_unique)):
# print('st',st)
try:
idx_emp_min = df[(df['occ_code'] == df_occ_unique[code]) & (df['st'] == df_st_unique[st]) ]['tot_emp'].values.astype(int).min()
idx_median_min = df[(df['occ_code'] == df_occ_unique[code]) & (df['st'] == df_st_unique[st]) ]['a_median'].values.astype(int).min()
except:
print('Error with', tot_emp_growth, a_median_growth, df_occ_unique[code], df_st_unique[st], df_year_unique[yr].astype(int) )
for yr in range(len(df_year_unique)):
# print('yr',yr)
try:
idx_emp = df[(df['occ_code'] == df_occ_unique[code]) & (df['st'] == df_st_unique[st]) \
& (df['year'] == df_year_unique[yr]) ]['tot_emp'].values.astype(int)
idx_median = df[(df['occ_code'] == df_occ_unique[code]) & (df['st'] == df_st_unique[st]) \
& (df['year'] == df_year_unique[yr]) ]['a_median'].values.astype(int)
idx_row = df[(df['occ_code'] == df_occ_unique[code]) & (df['st'] == df_st_unique[st]) \
& (df['year'] == df_year_unique[yr]) ].index.values.astype(int)
except:
print('Error with', tot_emp_growth, a_median_growth, df_occ_unique[code], df_st_unique[st], df_year_unique[yr].astype(int) )
try:
tot_emp_growth = float((((idx_emp - idx_emp_min) / idx_emp_min) * 100)[0])
a_median_growth = float((((idx_median - idx_median_min) / idx_median_min) * 100)[0])
df_6 = df_6.append({'idx_row': idx_row, 'tot_growth': tot_emp_growth, \
'median_growth': a_median_growth, 'code': df_occ_unique[code], \
'st': df_st_unique[st], 'yr': df_year_unique[yr].astype(int) }, ignore_index=True)
except:
print('Error with', tot_emp_growth, a_median_growth, df_occ_unique[code], df_st_unique[st], df_year_unique[yr].astype(int) )
# print(df_6)
# if i % 200 == 0 :
# print(df.index[i])
return(df, df_6)
df_5, df_7 = occ_code_growths(df, df_6)