Я программирую скрипт Python 2.7 для импорта нескольких локальных файлов HTML (каждый файл данных ~ 3 МБ).Я использую пакет BeautifulSoup4, но у меня возникает ошибка памяти при импорте более 15 файлов HTML.Мой код:
from bs4 import BeautifulSoup
import os
import xlwt
# ============= FIND THE FIRST HTML FILE ======= #
directory ='C:\Users\khoipt\Desktop\data_extraction\Calib_SMP_100\HTML\HTML1'
flag = 0
for filename in os.listdir(directory):
if filename.endswith('.HTML'):
fname_1 = os.path.join(directory,filename)
print('Filename: {}'.format(fname_1))
flag = 1
break
if flag == 0:
print('There is no HTML file')
fname_1_open = open(fname_1)
soup = BeautifulSoup(fname_1_open, "html.parser")
fname_1_open.close()
tables = soup.find_all("table")
# ================================================================
# ========= DECLARE EXCEL FILE ==================================
# Code for create a excel file
# ================================================================
# =========== FIND THE NUMBER OF HTML FILE ========
# Find the number of HTML files and print it to cnt_total_html
# print('The number of HTML files is: ', cnt_total_html)
# =================================================
# ====== Find the number of titles in HTML file =================
# Find the number of titles in HTML file and print it to counter_title
# Each title can contain many tables
# ================================================================
# ==== FIND THE NUMBER OF TABLES IN HTML FILE =======================
# Find the number of tables in HTML file and print it to counter_table
# ====================================================================
# =================================================================
print('Please select the desired title: ')
num = input()
print('The desired title is: ', table_name[num])
print('Please select the order of table: ')
num_o = input()
print('The desired order of table is: ', num_o)
print('Please select the order of line: ')
num_line = input()
print('The desired order of line is: ', num_line)
# ====================================================================
# ================ FIND AND PRINT THE DESIRED LINE TO EXCEL FILE ===========
cnt_total_html_all = cnt_total_html #cnt_total_html is the total html files
Matrix_table = [[0 for x in range(counter_table)] for y in range(cnt_total_html)]
line_no = [None]*counter_title
i = 0
j = 0
k = 0
m = 0
n = 0
flag_cnt_html = 1
x = 2
y = 2
flag_table = 0
flag_line = 0
flag_header = 0
cnt_line = 0
# Seperate the whole HTML files into 10-files block
while cnt_total_html > 10:
cnt_total_html = cnt_total_html - 10
flag_cnt_html += 1
print('So flag_cnt_html la: ',flag_cnt_html)
###################################################
for i in range(flag_cnt_html):
if (cnt_total_html_all - i*10) > 10:
TABLE_sheet=worksheet['TABLE']
for k in range(0+i*10,9+i*10):
f = open(fopen[k])
soup = BeautifulSoup(f, "html.parser")
Matrix_table[k] = soup.find_all("table") #Find all table in the HTML file
f.close()
table_no = [None]*counter_table
table_no = []
m = 0
f = open(fopen[k])
line_num = 0
lines = f.readlines()
cnt = 0
for line in lines:
line_num = 1 + line_num
if line.find("table") >= 0:
table_no.append(line_num) #find line number of each table
cnt += 1
m += 1
f.close
start_no = Matrix[k][num] #find line number of the desired title
stop_no = Matrix[k][num+1] #find line number of the sibling desired title
m = 0
flag_table = 0
cnt_line = 0
for m in range(cnt):
if table_no[m] in range(start_no,stop_no):
i1 = (m - 1)/2
mytable = Matrix_table[k][i1]
flag_table += 1
if flag_table == num_o*2: #Check if the order of table in HTML file is the desired table
if flag_header == 0:
rows = mytable.findChildren(['th','tr'])
cells = rows[0].findChildren('td')
y = 2
for cell in cells:
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
flag_header = 1
rows = mytable.findChildren(['th','tr'])
for row in rows[1:]:
cells = row.findChildren('td')
y = 2
cnt_line += 1
if cnt_line == num_line: #Check if the order of line in the table is the desired one
for cell in cells:
TABLE_sheet.cell(row=2,column=1).value = 'HTML file name' #Write to excel file
TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
worksheet.save(filepath)
soup.decompose()
gc.collect()
else:
gc.collect()
TABLE_sheet=worksheet['TABLE']
for k in range(0+i*10-1, cnt_total_html_all):
f2 = open(fopen[k])
soup2 = BeautifulSoup(f2, "html.parser")
Matrix_table[k] = soup2.find_all("table")
f2.close()
table_no = []
m = 0
line_num = 0
f2 = open(fopen[k])
lines = f2.readlines()
cnt = 0
for line in lines:
line_num = 1 + line_num
if line.find("table") >= 0:
table_no.append(line_num)
cnt += 1
m += 1
f2.close()
start_no = Matrix[k][num]
stop_no = Matrix[k][num+1]
m = 0
flag_table = 0
cnt_line = 0
for m in range(cnt):
if table_no[m] in range(start_no,stop_no):
i1 = (m-1)/2
mytable = Matrix_table[k][i1]
flag_table += 1
if flag_table == num_o*2:
if flag_header == 0:
rows = mytable.findChildren(['th','tr'])
cells = rows[0].findChildren('td')
y = 2
for cell in cells:
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
flag_header = 1
rows = mytable.findChildren(['th','tr'])
for row in rows[1:]:
cells = row.findChildren('td')
y = 2
cnt_line += 1
if cnt_line == num_line:
for cell in cells:
TABLE_sheet.cell(row=1,column=1).value = table_name[num]
TABLE_sheet.cell(row=2,column=1).value = 'HTML file name'
TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
worksheet.save(filepath)
soup2.decompose()
gc.collect()
# ================================================================
Мой HTML-файл экспортируется с оборудования и не похож на традиционные HTML-файлы.Он содержит только описания таблиц.К сожалению, я не могу изменить структуру экспортированного HTML-файла.
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt">
Duration: 0.02 (s)
<br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><a href="#TOC" xmlns:msxsl="urn:schemas-microsoft-com:xslt">Back to top
</a><a name="22" xmlns:msxsl="urn:schemas-microsoft-com:xslt"></a><HR COLOR="#6699CC" SIZE="2" xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>LTE_B1_ESC_APT</b></big></big><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>Status: </b></big><big><b><span style="color:#00CC00">PASS</span></b></big></UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Run Info</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Switchpoint Data</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Temp (C)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RFMode</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Technology</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RF Mode</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Band</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Device</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Instance</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Calibration Mode</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">Streaming not supported</td>
<td align="center">22</td>
<td align="center">34</td>
<td align="center">LTE</td>
<td align="center">34</td>
<td align="center">1</td>
<td align="center">0</td>
<td align="center">0</td>
<td align="center">Composite</td>
<td align="center">2.17</td>
</tr>
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Segment Lengths</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Tx Lin Tx Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>TxRx Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RF Config Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Tuning Seg Len (ms)</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">4</td>
<td align="center">4</td>
<td align="center">60</td>
<td align="center">20</td>
<td><span style="color:white">-</span></td>
</tr>
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">MEASUREMENTS: Tx Linearizer Sweep</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Channel</b></td>
<td bgcolor="#FFFFCC" align="center"><b>PA State</b></td>
<td bgcolor="#FFFFCC" align="center"><b>TxAGC</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Vcc</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Icq</b></td>
<td bgcolor="#6699CC" align="center"><b>Power</b></td>
<td bgcolor="#6699CC" align="center"><b>HDET</b></td>
<td bgcolor="#6699CC" align="center"><b>LPM HDET</b></td>
<td bgcolor="#6699CC" align="center"><b>DeltaPwr</b></td>
<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Min</b></td>
<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Max</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">18300</td>
<td align="center">1</td>
<td align="center">68</td>
<td align="center">3000</td>
<td align="center">235</td>
<td align="center">26.7</td>
<td align="center">20180</td>
<td align="center">-</td>
<td align="center">0</td>
<td align="center">0</td>
<td align="center">0</td>
<td><span style="color:white">-</span></td>
</tr>
<tr>
<td align="center">18300</td>
<td align="center">1</td>
<td align="center">67</td>
<td align="center">3000</td>
<td align="center">235</td>
<td align="center">26.3</td>
<td align="center">18072</td>
<td align="center">-</td>
<td align="center">0.4</td>
<td align="center">-1</td>
<td align="center">8</td>
<td><span style="color:white">-</span></td>
</tr>
Ошибка при обработке более 15 HTML-файлов:
Exception MemoryError: MemoryError() in <generator object prepare_markup at 0x7CC15A80> ignored
Traceback (most recent call last):
File ".\html_parsing_Nov_01_max15_1_submit_to_SOF.py", line 297, in <module>
soup2 = BeautifulSoup(f2, "html.parser")
File "C:\Python27\lib\site-packages\beautifulsoup4-4.6.3-py2.7.egg\bs4\__init__.py", line 282, in __init__
self._feed()
File "C:\Python27\lib\site-packages\beautifulsoup4-4.6.3-py2.7.egg\bs4\__init__.py", line 343, in _feed
self.builder.feed(self.markup)
MemoryError
Я проверил это, когда у меня только чтопроанализировав HTML-файл с помощью BS4 без таблицы find_all, код может обработать более 15 файлов.Я также пытаюсь использовать BS4 только для первого файла, а затем нахожу номер строки нужного содержимого, однако я не могу найти номер строки этих строк, используя BS4.
Не могли бы вы взглянутьи дать мне свои комментарии, как это возможно?Большое спасибо.