Рассмотрим работу с подмножеством ваших данных, например:
# Example: pairwise Euclidean distances between the rows of a small sample.
# Each inner list is one row: [Actual_Data, '8,8', '6,6', '7,7'].
df_data = [
    [888888, 3, 0, 0],
    [677767, 0, 2, 1],
    [212341212, 0, 0, 0],
    [141414141414, 0, 0, 0],
    [1112224, 0, 0, 0],
]

# Creating the data.
# The frame is built from `df_data`, the list defined just above.
df = pd.DataFrame(
    data=df_data,
    columns=['Actual_Data', '8,8', '6,6', '7,7'],
    dtype=np.float64,
)
# Which looks like
#     Actual_Data  8,8  6,6  7,7
# 0  8.888880e+05  3.0  0.0  0.0
# 1  6.777670e+05  0.0  2.0  1.0
# 2  2.123412e+08  0.0  0.0  0.0
# 3  1.414141e+11  0.0  0.0  0.0
# 4  1.112224e+06  0.0  0.0  0.0

# Computing the distance matrix.
# For every row, build the list of its 2-norm distances to each row of `df`
# (looked up by index label); the result is a Series holding one list per row.
dist_matrix = df.apply(
    lambda row: [
        np.linalg.norm(row.values - df.loc[[_id], :].values, 2)
        for _id in df.index.values
    ],
    axis=1,
)
# Which looks like
# 0 [0.0, 211121.00003315636, 211452324.0, 141413252526.0, 223336.000020149]
# 1 [211121.00003315636, 0.0, 211663445.0, 141413463647.0, 434457.0000057543]
# 2 [211452324.0, 211663445.0, 0.0, 141201800202.0, 211228988.0]
# 3 [141413252526.0, 141413463647.0, 141201800202.0, 0.0, 141413029190.0]
# 4 [223336.000020149, 434457.0000057543, 211228988.0, 141413029190.0, 0.0]

# Reformatting the above into a readable square DataFrame whose rows and
# columns are both labelled with the index of `df`.
dist_matrix = pd.DataFrame(
    data=dist_matrix.values.tolist(),
    columns=df.index.tolist(),
    index=df.index.tolist(),
)
# Which gives you
#               0             1             2             3             4
# 0  0.000000e+00  2.111210e+05  2.114523e+08  1.414133e+11  2.233360e+05
# 1  2.111210e+05  0.000000e+00  2.116634e+08  1.414135e+11  4.344570e+05
# 2  2.114523e+08  2.116634e+08  0.000000e+00  1.412018e+11  2.112290e+08
# 3  1.414133e+11  1.414135e+11  1.412018e+11  0.000000e+00  1.414130e+11
# 4  2.233360e+05  4.344570e+05  2.112290e+08  1.414130e+11  0.000000e+00
Обновление: как указано в комментариях, проблема заключается в переполнении памяти (memory overflow), поэтому задачу нужно решать по частям (партиями).
# Batched computation of the pairwise Euclidean distance matrix of `df`.
# The rows are processed in chunks, every chunk is written to its own CSV
# file, and the files are read back and concatenated into `res`.

# Collecting the data
# df = ....

# Set this number to a lower value if you get the same `memory` errors.
batch = 200  # number of rows (users) processed per chunk
if batch < 1:
    raise ValueError("batch must be a positive number of rows per chunk")

n_rows = df.shape[0]
# np.array_split takes the NUMBER OF SECTIONS, not the section size, so turn
# the per-chunk row count into a section count (ceil division). At least one
# section is requested because array_split rejects zero sections.
n_chunks = max(1, (n_rows + batch - 1) // batch)

# Pull the raw values and labels out once instead of doing a label lookup
# for every pair of rows.
values = df.to_numpy()
labels = df.index.values

# To be conservative, let's write the intermediate results to files.
dffname = []
for ifile, _slice in enumerate(np.array_split(np.arange(n_rows), n_chunks)):
    # Distances from each row of this chunk to every row of the data frame;
    # one vectorised norm per row keeps the temporary at the size of `values`.
    distances = [np.linalg.norm(values - values[pos], axis=1) for pos in _slice]
    tmp_df = pd.DataFrame(distances, index=labels[_slice], columns=labels)
    # You can change it from csv to any other file type.
    fname = f"{ifile+1}.csv"
    tmp_df.to_csv(fname)
    dffname.append(fname)

# Reading back the DataFrames and stacking the chunks in their original order.
dflist = [pd.read_csv(f, dtype=np.float64, index_col=0) for f in dffname]
res = pd.concat(dflist)