Я вроде отвечаю на мои вопросы в комментариях. Вот решение для вас. Я сделал это на Python3, а ваш тег - Python 2, поэтому вам может потребоваться изменить часть кода.
Вот весь код с самого начала.
import numpy as np
import pandas as pd
df1 = pd.DataFrame(10*(2+np.random.randn(500, 3)), columns=list('ABC'))
df2 = pd.DataFrame(10*(2+np.random.randn(500, 3)), columns=list('CDA'))
df1.drop(df1.index[[1, 4, 284, 354, 498]], inplace=True)
df2.drop(df2.index[[0, 98, 159]], inplace=True)
df1.loc[2, 'B'] = np.nan
df1.loc[5, 'C'] = np.nan
df1.loc[3, 'A'] = np.nan
df2.loc[5, 'C'] = np.nan
df2.loc[1, 'D'] = np.nan
df2.loc[2, 'A'] = np.nan
df1.head(10)
A B C
0 20.438695 18.114421 20.445370
2 12.789906 NaN 3.988319
3 NaN 11.026463 9.421921
5 19.919580 7.462012 NaN
6 21.290647 23.952295 -10.354758
7 8.447708 14.710224 25.499204
8 16.603850 24.862611 20.354342
9 6.088232 15.066117 22.906491
10 19.621493 15.877428 15.149765
11 2.052592 9.031476 19.531663
df2.head(10)
C D A
1 15.159127 NaN 29.432163
2 23.449304 8.393440 NaN
3 6.057011 32.881258 21.033391
4 31.162671 4.128745 23.264304
5 NaN 32.796018 11.171984
6 32.019817 14.603303 33.106655
7 17.566806 26.804403 12.421421
8 30.121336 46.520462 41.934098
9 13.498463 30.170049 24.221281
10 19.554489 28.238385 26.284620
# Merge the dataframes
merged_df = df1.merge(df2, right_index = True, left_index = True, how = 'outer')
# Create dataframes for each formula
final_df1 = pd.DataFrame(index = merged_df.index) # for this (1 - df1 / df2)
final_df2 = pd.DataFrame(index = merged_df.index) # for this ln(df1 / df2)
for column in merged_df.columns:
# Get initial letter from the column names
i = column[0]
# filter only the columns that start with the same letter
df_test = merged_df[merged_df.columns[merged_df.columns.str.startswith(i)]]
#If only one column, add that to the dataframes
if df_test.shape[1] == 1:
final_df1 = final_df1.merge(df_test, right_index = True, left_index = True, how = 'outer')
final_df2 = final_df2.merge(df_test, right_index = True, left_index = True, how = 'outer')
#If two columns do the calculations
else:
final_df1.loc[:,i] = np.where((df_test[df_test.columns[0]].isnull())|(df_test[df_test.columns[1]].isnull()),
np.nan, (1- df_test[df_test.columns[0]]) / df_test[df_test.columns[1]])
final_df2.loc[:,i] = np.where((df_test[df_test.columns[0]].isnull())|(df_test[df_test.columns[1]].isnull()),
np.nan, np.log(df_test[df_test.columns[0]] / df_test[df_test.columns[1]]))
#Adjust the names of the columns
final_df1.columns = final_df1.columns.str[0]
final_df2.columns = final_df2.columns.str[0]
print(final_df1.head(10), final_df2.head(10))
Даю это:
simple_ret_df.head(6)
A B C D
0 NaN 18.114421 NaN NaN
1 NaN NaN NaN NaN
2 NaN NaN -0.127437 8.393440
3 NaN 11.026463 -1.390442 32.881258
4 NaN NaN NaN 4.128745
5 -1.693484 7.462012 NaN 32.796018
log_ret_df.head(6)
A B C D
0 NaN 18.114421 NaN NaN
1 NaN NaN NaN NaN
2 NaN NaN -1.771471 8.393440
3 NaN 11.026463 0.441823 32.881258
4 NaN NaN NaN 4.128745
5 0.578294 7.462012 NaN 32.796018