Я пытаюсь применить рабочий скрипт (code2
на input-2
), который преобразует запись строки фрейма данных в отдельные строки в моем наборе данных (output-1
), который выглядит как input-2
. Странно, я получаю ValueError: ошибка нульмерных массивов. Я гуглил и искал StackOverflow, но безуспешно.
Ошибка
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-310-36a8923ddf69> in <module>
----> 1 explode(df3, ['longitude','altitude', 'speed'], fill_value='',preserve_index=True)
<ipython-input-308-31db0d7d3bc1> in explode(df, lst_cols, fill_value, preserve_index)
19 index=idx)
20 .assign(**{col:np.concatenate(df.loc[lens>0, col].values)
---> 21 for col in lst_cols}))
22 # append those rows that have empty lists
23 if (lens == 0).any():
<ipython-input-308-31db0d7d3bc1> in <dictcomp>(.0)
19 index=idx)
20 .assign(**{col:np.concatenate(df.loc[lens>0, col].values)
---> 21 for col in lst_cols}))
22 # append those rows that have empty lists
23 if (lens == 0).any():
ValueError: zero-dimensional arrays cannot be concatenated
Json файл (soverflowq. json)
{'longitude': [24.64977040886879, 24.65014273300767, 24.6501], 'altitude': [41.2, 34.2, 24.6501],'url': 'https://www.endomondo.com/users/10921916/workouts/392337037', 'userId': 10921916, 'speed': [9.0792, 13.284, 24.6501]}
{'longitude': [22.44977040886879, 27.65014273300767, 24.6501], 'altitude': [38.4, 39.0, 24.6501],'url': 'https://www.endomondo.com/users/10921915/workouts/392337038', 'userId': 10921915, 'speed': [9.0792, 13.284, 24.6501]}
{'longitude': [24.64977040886879, 24.65014273300767, 24.6501], 'altitude': [41.2, 34.2, 24.6501],'url': 'https://www.endomondo.com/users/10921916/workouts/392337037', 'userId': 1092116, 'speed': [9.0792, 13.284, 24.6501]}
{'longitude': [22.44977040886879, 27.65014273300767, 24.6501], 'altitude': [38.4, 39.0, 24.6501],'url': 'https://www.endomondo.com/users/10921915/workouts/392337038', 'userId': 1092191, 'speed': [9.0792, 13.284, 24.6501]}
Код-1
import pandas as pd
import re
# read a file line-by-line into a list
with open('soverflowq.json') as f:
data = f.readlines()
# split each line into one column
data = [re.split(r'\s+(?=\d+$)', l) for l in data]
# constructing dataframe
df = pd.DataFrame(data, columns=['all_cols'])
# split further and rename the cols
df[['new0','new1', 'new2', 'new3', 'new4', 'new5', 'new6', 'new7', 'new8', 'new9']] = df['all_cols'].str.split(r"\, '|\,'|\':",expand=True)
# remove [, ], ', }, \n
cols_to_check = ['new0','new1','new3','new5', 'new9']
df[cols_to_check] = df[cols_to_check].replace({'\'':''}, regex=True)
df[cols_to_check] = df[cols_to_check].replace({'{':''}, regex=True)
df[cols_to_check] = df[cols_to_check].replace({'}':''}, regex=True)
df[cols_to_check] = df[cols_to_check].replace({'\n':''}, regex=True)
df[cols_to_check] = df[cols_to_check].replace({'\n':''}, regex=True)
#df[cols_to_check] = df[cols_to_check].replace({'\[':''}, regex=True)
#df[cols_to_check] = df[cols_to_check].replace({'\]':''}, regex=True)
#drop cols
df1= df.drop(columns=['all_cols', 'new0', 'new2', 'new4', 'new6', 'new8'])
# rename cols
df1.columns = ['longitude', 'altitude', 'url', 'userId', 'speed']
# reorder cols
column_titles =['userId', 'longitude', 'altitude', 'speed', 'url']
df2 = df1.reindex(columns=column_titles)
df3= df2.drop(columns=['url'])
print(df3)
Выход-1
userId longitude \
0 10921916 [24.64977040886879, 24.65014273300767, 24.6501]
1 10921915 [22.44977040886879, 27.65014273300767, 24.6501]
2 1092116 [24.64977040886879, 24.65014273300767, 24.6501]
3 1092191 [22.44977040886879, 27.65014273300767, 24.6501]
altitude speed
0 [41.2, 34.2, 24.6501] [9.0792, 13.284, 24.6501]
1 [38.4, 39.0, 24.6501] [9.0792, 13.284, 24.6501]
2 [41.2, 34.2, 24.6501] [9.0792, 13.284, 24.6501]
3 [38.4, 39.0, 24.6501] [9.0792, 13.284, 24.6501]
Код-2
def explode(df, lst_cols, fill_value='', preserve_index=False):
# make sure `lst_cols` is list-alike
if (lst_cols is not None
and len(lst_cols) > 0
and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
lst_cols = [lst_cols]
# all columns except `lst_cols`
idx_cols = df.columns.difference(lst_cols)
# calculate lengths of lists
lens = df[lst_cols[0]].str.len()
# preserve original index values
idx = np.repeat(df.index.values, lens)
# create "exploded" DF
res = (pd.DataFrame({
col:np.repeat(df[col].values, lens)
for col in idx_cols},
index=idx)
.assign(**{col:np.concatenate((df.loc[lens>0, col].values))
for col in lst_cols}))
# append those rows that have empty lists
if (lens == 0).any():
# at least one list in cells is empty
res = (res.append(df.loc[lens==0, idx_cols], sort=False)
.fillna(fill_value))
# revert the original index order
res = res.sort_index()
# reset index if requested
if not preserve_index:
res = res.reset_index(drop=True)
return res
Вход-2
df = pd.DataFrame({
'userid': {0: 10, 1: 2, 2: 12, 3: 13},
'longitude': {0: [24.64977040886879, 24.64977040886879, 324.64977040886879],
1: [24.64977040886879,424.64977040886879,524.64977040886879],
2: [124.64977040886879,224.64977040886879 ,224.64977040886879],
3: [124.64977040886879,324.64977040886879,424.64977040886879]},
'altitude': {0: [24.64977040886879, 24.64977040886879, 324.64977040886879],
1: [24.64977040886879,424.64977040886879,524.64977040886879],
2: [124.64977040886879,224.64977040886879 ,224.64977040886879],
3: [124.64977040886879,324.64977040886879,424.64977040886879]},
'speed': {0: [24.64977040886879, 24.64977040886879, 324.64977040886879],
1: [24.64977040886879,424.64977040886879,524.64977040886879],
2: [124.64977040886879,224.64977040886879 ,224.64977040886879],
3: [124.64977040886879,324.64977040886879,424.64977040886879]},
})
explode(df, ['longitude','altitude', 'speed'], fill_value='',preserve_index=True)
Ouput-2
userid longitude altitude speed
0 10 24.64977 24.64977 24.64977
0 10 24.64977 24.64977 24.64977
0 10 324.64977 324.64977 324.64977
1 2 24.64977 24.64977 24.64977
1 2 424.64977 424.64977 424.64977
1 2 524.64977 524.64977 524.64977
2 12 124.64977 124.64977 124.64977
2 12 224.64977 224.64977 224.64977
2 12 224.64977 224.64977 224.64977
3 13 124.64977 124.64977 124.64977
3 13 324.64977 324.64977 324.64977
3 13 424.64977 424.64977 424.64977