Я написал приведённый ниже код. Он работает, но, возможно, не идеален с точки зрения элегантности кода.
# Regex and clean-up rules for the primary layout: a 12px span that starts
# with "Race" and ends with digits immediately followed by "<br />".
_PRIMARY_PATTERN = r"(<span style='font-size: 12px;'>Race).*?\d+(<br />)"
# Replacements are applied strictly in order; the order matters.
# NOTE(review): as written, (' ', '|') runs before ('<br />', ''), so the
# '<br />' literal can no longer match, and the second (' ', '') pair is a
# no-op. The two ' ' literals may originally have been HTML entities or
# non-breaking spaces lost in a copy/paste -- confirm against the original.
_PRIMARY_REPLACEMENTS = [
    ("<span style='font-size: 12px;'>", ''),
    (' ', '|'), (' ', ''),
    ('<span>', ''), ('<br />', ''), ('<strong>', ''), ('</strong>',''),
    ('</span>', '|'), ('||', '|'),
]
_PRIMARY_COLUMNS = [
    'RACE_NUM', 'Race_Grade', 'Race_Distance',
    'Race_PrizeMoney1', 'Race_PrizeMoney2',
]

# Regex and clean-up rules for the fallback layout: the block ends with
# "</span><br /><br /></span><table>" instead of digits + "<br />".
_FALLBACK_PATTERN = r"(font-size: 12px;\'>Race).*?(</span><br /><br /></span><table>)"
_FALLBACK_REPLACEMENTS = [
    ("font-size: 12px;\'>", ''), ("<span style='", ''), ('<table>', ''),
    (' ', '|'), (' ', ''),
    ('<span>', ''), ('<br />', ''), ('<strong>', ''), ('</strong>',''),
    ('</span>', '|'), ('||', '|'), ('><span style=\'font-size: 12px;\'>', '|'),
]
_FALLBACK_COLUMNS = [
    'RACE_NUM', 'Race_Grade', 'Race_Distance',
    'Race_Other', 'Race_PrizeMoney1',
]

# Column layout returned when neither pattern matches anything.
_EMPTY_COLUMNS = [
    'RACE_NUM', 'Race_Grade', 'Race_Distance', 'Race_Other',
    'RACE_Detail', 'Race_PrizeMoney1', 'Race_PrizeMoney2',
]


def _extract_race_rows(rawString, pattern, replacements):
    """Return the cleaned text of every case-insensitive match of *pattern*."""
    compiled = re.compile(pattern, re.IGNORECASE)
    rows = []
    for match in compiled.finditer(rawString):
        text = match.group(0)
        # Sequential replacement: later rules see the output of earlier ones.
        for old, new in replacements:
            text = text.replace(old, new)
        rows.append(text)
    return rows


def _rows_to_frame(rows, field_columns, missing_column):
    """Split '|'-delimited rows into *field_columns* and add the NaN columns."""
    df = pd.DataFrame(rows, columns=['temp'])
    # The split must yield exactly len(field_columns) fields; otherwise the
    # assignment fails inside pandas and the error propagates to the caller,
    # exactly as in the previous implementation.
    df[list(field_columns)] = df['temp'].str.split('|', expand=True)
    df[missing_column] = np.nan

    # Separate "number :: detail" when the split gives exactly two parts.
    # An explicit width check replaces the former bare ``except:`` so that
    # unrelated errors are no longer silently swallowed.
    parts = df['RACE_NUM'].str.split(' :: ', expand=True)
    if parts.shape[1] == 2:
        df[['RACE_NUM', 'RACE_Detail']] = parts
    else:
        df['RACE_Detail'] = np.nan

    return df.drop(['temp'], axis=1)


def getRaceInfo(rawString: str) -> "pd.DataFrame":
    """Extract race header fields from an HTML fragment into a DataFrame.

    Two markup layouts are tried in order. The primary layout fills
    ``Race_PrizeMoney1``/``Race_PrizeMoney2`` and leaves ``Race_Other`` as
    NaN; the fallback layout fills ``Race_Other``/``Race_PrizeMoney1`` and
    leaves ``Race_PrizeMoney2`` as NaN. ``RACE_Detail`` is taken from the
    part of ``RACE_NUM`` after ``' :: '`` when that split yields exactly two
    parts, otherwise it is NaN.

    Args:
        rawString: HTML text to scan.

    Returns:
        One row per regex match. When neither layout matches, an empty
        DataFrame with the seven expected columns. Column order differs
        between the three cases and is kept as in the original code.

    Raises:
        An error from pandas (ValueError in current versions -- confirm for
        the version in use) when a matched block does not split into
        exactly five '|'-separated fields.
    """
    rows = _extract_race_rows(rawString, _PRIMARY_PATTERN, _PRIMARY_REPLACEMENTS)
    if rows:
        return _rows_to_frame(rows, _PRIMARY_COLUMNS, 'Race_Other')

    rows = _extract_race_rows(rawString, _FALLBACK_PATTERN, _FALLBACK_REPLACEMENTS)
    if rows:
        return _rows_to_frame(rows, _FALLBACK_COLUMNS, 'Race_PrizeMoney2')

    return pd.DataFrame(columns=_EMPTY_COLUMNS)