Повышение производительности разбора XML на данные - PullRequest
0 голосов
/ 17 ноября 2018

У меня длинный xml со следующей структурой:

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<SoccerFeed timestamp="20180212T2228+0000">
 <Game id="3323" home_team_id="83" home_team_name="CFC" away_team_id="335" away_team_name="WCFC" game_date="22180212T2000+0000" competition_id="18" competition_name="DCF" season_id="2011" season_name="Season 2017/2018" matchday="27" period_1_start="22180212T2000+0000" period_2_start="22180212T2105+0000" Status="FullTime">
    <Event id="1182774006" event_id="10223" type_id="37" period_id="14" min="0" sec="0" player_id="1212" team_id="83" outcome="1" x="45.7" y="59.0" timestamp="2018-02-12 21:52:45" last_modified="2018-02-12 21:52:48" version="15184723628801">
   <Q id="16517621865" qualifier_id="302"/>
  </Event>
  <Event id="120438223891" event_id="8726" type_id="30" period_id="14" min="0" sec="0" player_id="" team_id="335" outcome="0" x="" y="" timestamp="2018-02-12 21:52:46" last_modified="2018-02-12 21:52:46" version="15128472366490">
   <Q id="125850245236" qualifier_id="57" value="1.0"/>
   <Q id="1775059222" qualifier_id="209"/>
  </Event>
 </Game>
</SoccerFeed>

Файл обычно содержит 1500+ узлов событий.

Мой текущий код анализирует файл за 20-21 секунду, но я чувствую, что есть некоторые очевидные улучшения производительности, которые мне не хватает.Любые советы будут оценены.

Я уверен, что есть лучшие способы получить доступ ко всем атрибутам и их значениям без необходимости определять каждый из них по отдельности.

Текущий код

import xml.etree.ElementTree as ET
import pandas as pd

# define the location of your file
file_name = "possessions.xml"

# parse the xml and conver to a tree and root
tree = ET.parse(file_name)
root = tree.getroot()

## get the main game info from the single 'Game' node
gameinfo = root.findall('Game')
gameinfo = gameinfo[0]
game_id = gameinfo.get('id')
home_team_id = gameinfo.get('home_team_id')
home_team_name = gameinfo.get('home_team_name')
away_team_id = gameinfo.get('away_team_id')
away_team_name = gameinfo.get('away_team_name')
competition_id = gameinfo.get('competition_id')
competition_name = gameinfo.get('competition_name')
season_id = gameinfo.get('season_id')

## create an empty dataframe to store the per event information
catcher = pd.DataFrame()

# loop through each event node and grab the information
for i in root.iter('Event'):

    # get the info from the Event node main chunk
    id = i.get('id')
    event_id = i.get('event_id')
    type_id = i.get('type_id')
    period_id = i.get('period_id')
    min = i.get('min')
    sec = i.get('sec')
    player_id = i.get('player_id')
    team_id = i.get('team_id')
    x = i.get('x')
    y = i.get('y')
    possession_id = i.get('possession_id')
    sequence_id = i.get('sequence_id')

    # combine the event main node data into a dataframe
    Edata = pd.DataFrame(
        {'id': id,
         'event_id': event_id,
         'type_id': type_id,
         'period_id': period_id,
         'min': min,
         'sec': sec,
         'player_id': player_id,
         'team_id': team_id,
         'x': x,
         'y': y,
         'sequence_id': sequence_id,
         'possession_id':possession_id
        }, index = [0])

    # find all of the Q information for that file
    Qs = i.findall("./Q")

    # create some empty lists to append the results to
    qualifier_id = []
    Q_value = []

    # loop through all of the Qs and grab the info
    for child in Qs:
        qualifier_id.append(child.get('qualifier_id'))
        Q_value.append(child.get('value'))

    # create a dataframe from the two lists
    Qdata = pd.DataFrame(
    {'qualifier_id': qualifier_id,
     'value': Q_value
    })

    # transform the dataframe, 1st row to the headers,
    # drop the extra row, reset the index
    Qdata = Qdata.T
    Qdata.columns = Qdata.iloc[0]
    Qdata = Qdata.reindex(Qdata.index.drop('qualifier_id'))
    Qdata = Qdata.reset_index(drop=True)

    # combine the event info and qualifer info
    result = pd.concat([Qdata, Edata], axis=1)

    # bind to the catcher dataframe
    catcher = catcher.append(result)

# Add all of the the game information to each row
catcher['competition_id'] = competition_id
catcher['game_id'] = game_id
catcher['home_team_id'] = home_team_id
catcher['home_team_name'] = home_team_name
catcher['away_team_id'] = away_team_id
catcher['away_team_name'] = away_team_name
catcher['competition_id'] = competition_id
catcher['competition_name'] = competition_name
catcher['season_id'] = season_id
catcher['competition_id'] = competition_id
...