Объединение (concat) нескольких DataFrame, полученных через pd.read_html - PullRequest
0 голосов
/ 11 ноября 2019

Мой заголовок не очень понятен, поэтому кратко опишу ситуацию.

Я собираю данные с сайта, который по сути представляет собой таблицу, но в данном случае каждая строка оформлена как отдельный элемент таблицы, а каждый нечётный элемент таблицы бесполезен, поэтому я его исключаю.

Поэтому мне нужно объединить отдельные фреймы данных, полученные из каждого элемента таблицы с помощью read_html().

Ниже мой код:

import pandas as pd

all_table = ["""<table cellpadding="0" cellspacing="0" cols="8" width="100%">
<tbody><tr height="10px">
<td align="right" colspan="9">
<font color="#D5D5D5">.</font>
</td>
</tr>
<tr height="30px" valign="middle" width="100%">
<td class="size-12" colspan="8" width="100%">
<strong>Shipment Status</strong>
</td>
</tr>
<tr valign="bottom" width="100%">
<td align="center" class="size-10" width="10%">
<strong>Station</strong>
</td>
<td align="center" class="size-10" width="10%">
<strong>Flight No.</strong>
</td>
<td align="center" class="size-10" width="25%">
<strong>Status</strong>
</td>
<td align="center" class="size-10" width="15%">
<strong>Date</strong>
</td>
<td align="center" class="size-10" width="9%">
<strong>Time</strong>
</td>
<td align="center" class="size-10" width="8%">
<strong>Pcs</strong>
</td>
<td align="center" class="size-10" width="8%">
<strong>Wgt</strong>
</td>
<td align="center" class="size-10" width="15%">
<strong>ULD - Battery - Temp</strong>
</td>
</tr>
<tr bgcolor="#F0F0F0" class="result-row">
<td align="center" class="size-10" width="10%">KIX</td>
<td align="center" class="size-10" width="10%">
<center>-</center>
</td>
<td align="center" class="size-10" width="25%">Shipment Received</td>
<td align="center" class="size-10" width="15%">11 Oct 2019</td>
<td align="center" class="size-10" width="9%"> 22:45</td>
<td align="center" class="size-10" width="8%">34</td>
<td align="center" class="size-10" width="8%">411.3</td>
<td align="center" class="size-10" width="15%"></td>
</tr>
</tbody></table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
</table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
<tbody><tr bgcolor="#FFFFFF" class="result-row">
<td align="center" class="size-10" width="10%">KIX</td>
<td align="center" class="size-10" width="10%">
<center>-</center>
</td>
<td align="center" class="size-10" width="25%">Freight On Hand</td>
<td align="center" class="size-10" width="15%">11 Oct 2019</td>
<td align="center" class="size-10" width="9%"> 22:45</td>
<td align="center" class="size-10" width="8%">34</td>
<td align="center" class="size-10" width="8%">411.3</td>
<td align="center" class="size-10" width="15%"></td>
</tr>
</tbody></table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
</table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
<tbody><tr bgcolor="#F0F0F0" class="result-row">
<td align="center" class="size-10" width="10%">KIX</td>
<td align="center" class="size-10" width="10%">SQ0621</td>
<td align="center" class="size-10" width="25%">Flight  Departed</td>
<td align="center" class="size-10" width="15%">13 Oct 2019</td>
<td align="center" class="size-10" width="9%"> 17:18</td>
<td align="center" class="size-10" width="8%">34</td>
<td align="center" class="size-10" width="8%">411.3</td>
<td align="center" class="size-10" width="15%"></td>
</tr>
</tbody></table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
</table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
<tbody><tr bgcolor="#FFFFFF" class="result-row">
<td align="center" class="size-10" width="10%">SIN</td>
<td align="center" class="size-10" width="10%">SQ0621</td>
<td align="center" class="size-10" width="25%">Flight Arrived</td>
<td align="center" class="size-10" width="15%">13 Oct 2019</td>
<td align="center" class="size-10" width="9%"> 23:02</td>
<td align="center" class="size-10" width="8%">34</td>
<td align="center" class="size-10" width="8%">411.3</td>
<td align="center" class="size-10" width="15%"></td>
</tr>
</tbody></table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
</table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
<tbody><tr bgcolor="#F0F0F0" class="result-row">
<td align="center" class="size-10" width="10%">SIN</td>
<td align="center" class="size-10" width="10%">SQ0621</td>
<td align="center" class="size-10" width="25%">Flight Arrived</td>
<td align="center" class="size-10" width="15%">13 Oct 2019</td>
<td align="center" class="size-10" width="9%"> 23:02</td>
<td align="center" class="size-10" width="8%">34</td>
<td align="center" class="size-10" width="8%">411.3</td>
<td align="center" class="size-10" width="15%"></td>
</tr>
</tbody></table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
</table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
<tbody><tr bgcolor="#FFFFFF" class="result-row">
<td align="center" class="size-10" width="10%">SIN</td>
<td align="center" class="size-10" width="10%">SQ0621</td>
<td align="center" class="size-10" width="25%">Shipment Checked Into Warehouse</td>
<td align="center" class="size-10" width="15%">14 Oct 2019</td>
<td align="center" class="size-10" width="9%"> 02:57</td>
<td align="center" class="size-10" width="8%">34</td>
<td align="center" class="size-10" width="8%">411.3</td>
<td align="center" class="size-10" width="15%"></td>
</tr>
</tbody></table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
</table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
<tbody><tr bgcolor="#F0F0F0" class="result-row">
<td align="center" class="size-10" width="10%">SIN</td>
<td align="center" class="size-10" width="10%">SQ0422</td>
<td align="center" class="size-10" width="25%">Flight  Departed</td>
<td align="center" class="size-10" width="15%">14 Oct 2019</td>
<td align="center" class="size-10" width="9%"> 07:39</td>
<td align="center" class="size-10" width="8%">34</td>
<td align="center" class="size-10" width="8%">411.3</td>
<td align="center" class="size-10" width="15%"></td>
</tr>
</tbody></table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
</table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
<tbody><tr bgcolor="#FFFFFF" class="result-row">
<td align="center" class="size-10" width="10%">BOM</td>
<td align="center" class="size-10" width="10%">SQ0422</td>
<td align="center" class="size-10" width="25%">Flight Arrived</td>
<td align="center" class="size-10" width="15%">14 Oct 2019</td>
<td align="center" class="size-10" width="9%"> 10:12</td>
<td align="center" class="size-10" width="8%">34</td>
<td align="center" class="size-10" width="8%">411.3</td>
<td align="center" class="size-10" width="15%"></td>
</tr>
</tbody></table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
</table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
<tbody><tr bgcolor="#F0F0F0" class="result-row">
<td align="center" class="size-10" width="10%">BOM</td>
<td align="center" class="size-10" width="10%">SQ0422</td>
<td align="center" class="size-10" width="25%">Flight Arrived</td>
<td align="center" class="size-10" width="15%">14 Oct 2019</td>
<td align="center" class="size-10" width="9%"> 10:30</td>
<td align="center" class="size-10" width="8%">34</td>
<td align="center" class="size-10" width="8%">411.3</td>
<td align="center" class="size-10" width="15%"></td>
</tr>
</tbody></table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
</table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
<tbody><tr bgcolor="#FFFFFF" class="result-row">
<td align="center" class="size-10" width="10%">BOM</td>
<td align="center" class="size-10" width="10%">SQ0422</td>
<td align="center" class="size-10" width="25%">Shipment Checked Into Warehouse</td>
<td align="center" class="size-10" width="15%">14 Oct 2019</td>
<td align="center" class="size-10" width="9%"> 14:10</td>
<td align="center" class="size-10" width="8%">34</td>
<td align="center" class="size-10" width="8%">411.3</td>
<td align="center" class="size-10" width="15%"></td>
</tr>
</tbody></table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
</table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
<tbody><tr bgcolor="#F0F0F0" class="result-row">
<td align="center" class="size-10" width="10%">BOM</td>
<td align="center" class="size-10" width="10%">
<center>-</center>
</td>
<td align="center" class="size-10" width="25%">Shipment Ready for Pick-up</td>
<td align="center" class="size-10" width="15%">14 Oct 2019</td>
<td align="center" class="size-10" width="9%"> 14:21</td>
<td align="center" class="size-10" width="8%">34</td>
<td align="center" class="size-10" width="8%">411.3</td>
<td align="center" class="size-10" width="15%"></td>
</tr>
</tbody></table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
</table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
<tbody><tr bgcolor="#FFFFFF" class="result-row">
<td align="center" class="size-10" width="10%">BOM</td>
<td align="center" class="size-10" width="10%">
<center>-</center>
</td>
<td align="center" class="size-10" width="25%">Document Delivered</td>
<td align="center" class="size-10" width="15%">14 Oct 2019</td>
<td align="center" class="size-10" width="9%"> 17:15</td>
<td align="center" class="size-10" width="8%">34</td>
<td align="center" class="size-10" width="8%">411.3</td>
<td align="center" class="size-10" width="15%"></td>
</tr>
</tbody></table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
</table>, <table cellpadding="0" cellspacing="0" cols="8" width="100%">
<tbody><tr bgcolor="#F0F0F0" class="result-row">
<td align="center" class="size-10" width="10%">BOM</td>
<td align="center" class="size-10" width="10%">
<center>-</center>
</td>
<td align="center" class="size-10" width="25%">Shipment Delivered</td>
<td align="center" class="size-10" width="15%">14 Oct 2019</td>
<td align="center" class="size-10" width="9%"> 17:15</td>
<td align="center" class="size-10" width="8%">34</td>
<td align="center" class="size-10" width="8%">411.3</td>
<td align="center" class="size-10" width="15%"></td>
</tr>
</tbody></table>"""]

# Accumulator that is meant to end up holding the rows of every useful table.
final_delivary = pd.DataFrame()

# Manual position counter; it is incremented once per iteration, so it always
# equals the loop variable `i` (which is otherwise unused).
# NOTE(review): as pasted here, all_table is a one-element list holding a single
# string, so this loop runs once; presumably the real all_table was a sequence
# of per-table elements — confirm.
a = 0
for i in range(len(all_table)):
    print("-"*150)
    # Only even positions are processed; odd positions are skipped as useless.
    if a % 2 == 0:
        print(a)
        # print(all_table[a])
        tmp_table = all_table[a]
        # pd.read_html returns a list of DataFrames, one per <table> it finds.
        tmp_df = pd.read_html(str(tmp_table))
        print("tmp_df = \n", tmp_df)
        print("type of tmp_df = ", type(tmp_df))
        print("#"*75)
        # Take the first parsed table from that list.
        tmp_df2 = pd.DataFrame(tmp_df[0])
        print("tmp_df2 = \n", tmp_df2)
        print("type of tmp_df2 = ", type(tmp_df2))
        print("@"*75)
        print("final_delivary = \n", final_delivary)
        print("type of final_delivary = ", type(final_delivary))
        # NOTE(review): pd.concat returns a NEW DataFrame and does not modify its
        # inputs. The result is not assigned here, so final_delivary is never
        # updated and is still the empty frame when printed after the loop.
        pd.concat([final_delivary, tmp_df2], axis=0)
    else:
        print("nope")
    a+=1

# Prints the accumulator; with the discarded concat result above it is empty.
print("final_delivary = ", final_delivary)

При объединении отдельных фреймов данных в основной фрейм я столкнулся с проблемой: в результате получается пустой фрейм. Пожалуйста, помогите мне с этим.

1 Ответ

0 голосов
/ 11 ноября 2019

Попробуйте это

from io import StringIO

from bs4 import BeautifulSoup as bs
import pandas as pd

# Placeholder: put the page's HTML (the markup containing all the <table>
# elements) into this string.
all_table = '''
                html content
            '''

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "no parser specified"
# warning).
soup = bs(all_table, "html.parser")
tables = soup.find_all("table")

# Only the tables at even positions (0, 2, 4, ...) are wanted; the ones in
# between are skipped. read_html returns a list of DataFrames, so take the
# first one for each table. The markup is wrapped in StringIO because passing
# literal HTML strings to read_html is deprecated in recent pandas versions.
frames = [pd.read_html(StringIO(str(table)))[0] for table in tables[::2]]

# Concatenate once at the end instead of on every iteration: pd.concat returns
# a new DataFrame each time, so concatenating inside the loop re-copies the
# accumulated rows over and over. ignore_index=True gives the combined rows a
# single running index instead of each table's index restarting at 0.
# pd.concat raises on an empty list, so fall back to an empty DataFrame when
# no tables were found.
finalDf = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...