У меня длинный xml со следующей структурой:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<SoccerFeed timestamp="20180212T2228+0000">
<Game id="3323" home_team_id="83" home_team_name="CFC" away_team_id="335" away_team_name="WCFC" game_date="22180212T2000+0000" competition_id="18" competition_name="DCF" season_id="2011" season_name="Season 2017/2018" matchday="27" period_1_start="22180212T2000+0000" period_2_start="22180212T2105+0000" Status="FullTime">
<Event id="1182774006" event_id="10223" type_id="37" period_id="14" min="0" sec="0" player_id="1212" team_id="83" outcome="1" x="45.7" y="59.0" timestamp="2018-02-12 21:52:45" last_modified="2018-02-12 21:52:48" version="15184723628801">
<Q id="16517621865" qualifier_id="302"/>
</Event>
<Event id="120438223891" event_id="8726" type_id="30" period_id="14" min="0" sec="0" player_id="" team_id="335" outcome="0" x="" y="" timestamp="2018-02-12 21:52:46" last_modified="2018-02-12 21:52:46" version="15128472366490">
<Q id="125850245236" qualifier_id="57" value="1.0"/>
<Q id="1775059222" qualifier_id="209"/>
</Event>
</Game>
</SoccerFeed>
Файл обычно содержит 1500+ узлов событий.
Мой текущий код анализирует файл за 20-21 секунду, но я чувствую, что есть некоторые очевидные улучшения производительности, которые мне не хватает.Любые советы будут оценены.
Я уверен, что есть лучшие способы получить доступ ко всем атрибутам и их значениям без необходимости определять каждый из них по отдельности.
Текущий код
import xml.etree.ElementTree as ET
import pandas as pd
# define the location of your file
file_name = "possessions.xml"
# parse the xml and conver to a tree and root
tree = ET.parse(file_name)
root = tree.getroot()
## get the main game info from the single 'Game' node
gameinfo = root.findall('Game')
gameinfo = gameinfo[0]
game_id = gameinfo.get('id')
home_team_id = gameinfo.get('home_team_id')
home_team_name = gameinfo.get('home_team_name')
away_team_id = gameinfo.get('away_team_id')
away_team_name = gameinfo.get('away_team_name')
competition_id = gameinfo.get('competition_id')
competition_name = gameinfo.get('competition_name')
season_id = gameinfo.get('season_id')
## create an empty dataframe to store the per event information
catcher = pd.DataFrame()
# loop through each event node and grab the information
for i in root.iter('Event'):
# get the info from the Event node main chunk
id = i.get('id')
event_id = i.get('event_id')
type_id = i.get('type_id')
period_id = i.get('period_id')
min = i.get('min')
sec = i.get('sec')
player_id = i.get('player_id')
team_id = i.get('team_id')
x = i.get('x')
y = i.get('y')
possession_id = i.get('possession_id')
sequence_id = i.get('sequence_id')
# combine the event main node data into a dataframe
Edata = pd.DataFrame(
{'id': id,
'event_id': event_id,
'type_id': type_id,
'period_id': period_id,
'min': min,
'sec': sec,
'player_id': player_id,
'team_id': team_id,
'x': x,
'y': y,
'sequence_id': sequence_id,
'possession_id':possession_id
}, index = [0])
# find all of the Q information for that file
Qs = i.findall("./Q")
# create some empty lists to append the results to
qualifier_id = []
Q_value = []
# loop through all of the Qs and grab the info
for child in Qs:
qualifier_id.append(child.get('qualifier_id'))
Q_value.append(child.get('value'))
# create a dataframe from the two lists
Qdata = pd.DataFrame(
{'qualifier_id': qualifier_id,
'value': Q_value
})
# transform the dataframe, 1st row to the headers,
# drop the extra row, reset the index
Qdata = Qdata.T
Qdata.columns = Qdata.iloc[0]
Qdata = Qdata.reindex(Qdata.index.drop('qualifier_id'))
Qdata = Qdata.reset_index(drop=True)
# combine the event info and qualifer info
result = pd.concat([Qdata, Edata], axis=1)
# bind to the catcher dataframe
catcher = catcher.append(result)
# Add all of the the game information to each row
catcher['competition_id'] = competition_id
catcher['game_id'] = game_id
catcher['home_team_id'] = home_team_id
catcher['home_team_name'] = home_team_name
catcher['away_team_id'] = away_team_id
catcher['away_team_name'] = away_team_name
catcher['competition_id'] = competition_id
catcher['competition_name'] = competition_name
catcher['season_id'] = season_id
catcher['competition_id'] = competition_id