Извлечение значений из длинной строки на основе строк с разделителями - PullRequest
2 голоса
/ 10 марта 2020

Если у меня есть приведённая ниже строка, как я могу извлечь из неё, например, значение 0.01, расположенное между <td>WSI_05</td> <td> и </td>?

<html xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:msxsl="urn:schemas-microsoft-com:xslt"> <head> <META http-equiv="Content-Type" content="text/html"> </head> <body style="margin:0px 0px 0px 0px;overflow:auto;background:#FFFFFF;"> <table style="font-family:Arial,Verdana,Times;font-size:12px;text-align:left;width:100;border-collapse:collapse;padding:3px 3px 3px 3px"> <tr style="text-align:center;font-weight:bold;background:#9CBCE2"> <td>1</td> </tr> <tr> <td> <table style="font-family:Arial,Verdana,Times;font-size:12px;text-align:left;width:100;border-spacing:0px; padding:3px 3px 3px 3px"> <tr> <td>Wweigh_mWSI</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>FID</td> <td>0</td> </tr> <tr> <td>BAS34S_ID</td> <td>1</td> </tr> <tr bgcolor="#D4E4F3"> <td>ROUT_AREA</td> <td>28</td> </tr> <tr> <td>COUNTRY_NR</td> <td>304</td> </tr> <tr bgcolor="#D4E4F3"> <td>AVL6190_KM</td> <td>0.00002</td> </tr> <tr> <td>TOTWWD95_K</td> <td>0</td> </tr> <tr bgcolor="#D4E4F3"> <td>WTA_95</td> <td>0</td> </tr> <tr> <td>WSI_01</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_02</td> <td>0.01</td> </tr> <tr> <td>WSI_03</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_04</td> <td>0.01</td> </tr> <tr> <td>WSI_05</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_06</td> <td>0.01</td> </tr> <tr> <td>WSI_07</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_08</td> <td>0.01</td> </tr> <tr> <td>WSI_09</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_10</td> <td>0.01</td> </tr> <tr> <td>WSI_11</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_12</td> <td>0.01</td> </tr> <tr> <td>Avg_mWSI</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>GeoM_mWSI</td> <td>0.01</td> </tr> <tr> <td>orig_WSI</td> <td>0.01</td> </tr> </table> </td> </tr> </table> </body> </html>

Ответы [ 4 ]

2 голоса
/ 10 марта 2020

Вы можете использовать bs4.BeautifulSoup:

from bs4 import BeautifulSoup

# Parse the HTML string (expected in `my_html`) with Python's built-in parser.
soup = BeautifulSoup(my_html, 'html.parser')

# Text content of every <td> element, in document order.
all_td = [cell.text for cell in soup.find_all('td')]

# Pair each cell with the cell that follows it and keep only the pairs whose
# first cell is a WSI_* label, giving (label, value) tuples.
[
    (label, value)
    for label, value in zip(all_td, all_td[1:])
    if label.startswith('WSI_')
]

Вывод:

[('WSI_01', '0.01'),
 ('WSI_02', '0.01'),
 ('WSI_03', '0.01'),
 ('WSI_04', '0.01'),
 ('WSI_05', '0.01'),
 ('WSI_06', '0.01'),
 ('WSI_07', '0.01'),
 ('WSI_08', '0.01'),
 ('WSI_09', '0.01'),
 ('WSI_10', '0.01'),
 ('WSI_11', '0.01'),
 ('WSI_12', '0.01')]
1 голос
/ 10 марта 2020
import re

# Collect every pair of adjacent <td> cells as (label, value) tuples.
# `data` is the HTML string to search; [^<]* keeps each capture inside a
# single cell and skips the whitespace between the two cells.
print(re.findall(r'<td>([^<]*)</td>[^<]*<td>([^<]*)</td>', data))

Вывод:

[('Wweigh_mWSI', '0.01'), ('FID', '0'), ('BAS34S_ID', '1'), ('ROUT_AREA', '28'), ('COUNTRY_NR', '304'), ('AVL6190_KM', '0.00002'), ('TOTWWD95_K', '0'), ('WTA_95', '0'), ('WSI_01', '0.01'), ('WSI_02', '0.01'), ('WSI_03', '0.01'), ('WSI_04', '0.01'), ('WSI_05', '0.01'), ('WSI_06', '0.01'), ('WSI_07', '0.01'), ('WSI_08', '0.01'), ('WSI_09', '0.01'), ('WSI_10', '0.01'), ('WSI_11', '0.01'), ('WSI_12', '0.01'), ('Avg_mWSI', '0.01'), ('GeoM_mWSI', '0.01'), ('orig_WSI', '0.01')]

1 голос
/ 10 марта 2020

Использование BeautifulSoup

Пример:

from bs4 import BeautifulSoup

html = """<html xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:msxsl="urn:schemas-microsoft-com:xslt"> <head> <META http-equiv="Content-Type" content="text/html"> </head> <body style="margin:0px 0px 0px 0px;overflow:auto;background:#FFFFFF;"> <table style="font-family:Arial,Verdana,Times;font-size:12px;text-align:left;width:100;border-collapse:collapse;padding:3px 3px 3px 3px"> <tr style="text-align:center;font-weight:bold;background:#9CBCE2"> <td>1</td> </tr> <tr> <td> <table style="font-family:Arial,Verdana,Times;font-size:12px;text-align:left;width:100;border-spacing:0px; padding:3px 3px 3px 3px"> <tr> <td>Wweigh_mWSI</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>FID</td> <td>0</td> </tr> <tr> <td>BAS34S_ID</td> <td>1</td> </tr> <tr bgcolor="#D4E4F3"> <td>ROUT_AREA</td> <td>28</td> </tr> <tr> <td>COUNTRY_NR</td> <td>304</td> </tr> <tr bgcolor="#D4E4F3"> <td>AVL6190_KM</td> <td>0.00002</td> </tr> <tr> <td>TOTWWD95_K</td> <td>0</td> </tr> <tr bgcolor="#D4E4F3"> <td>WTA_95</td> <td>0</td> </tr> <tr> <td>WSI_01</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_02</td> <td>0.01</td> </tr> <tr> <td>WSI_03</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_04</td> <td>0.01</td> </tr> <tr> <td>WSI_05</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_06</td> <td>0.01</td> </tr> <tr> <td>WSI_07</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_08</td> <td>0.01</td> </tr> <tr> <td>WSI_09</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_10</td> <td>0.01</td> </tr> <tr> <td>WSI_11</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_12</td> <td>0.01</td> </tr> <tr> <td>Avg_mWSI</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>GeoM_mWSI</td> <td>0.01</td> </tr> <tr> <td>orig_WSI</td> <td>0.01</td> </tr> </table> </td> </tr> </table> </body> </html>"""
soup = BeautifulSoup(html, "html.parser")
# Find the <td> whose text is exactly 'WSI_05', then move to the next <td>
# in document order, which holds its value. `string=` and `find_next` are the
# current Beautiful Soup 4 spellings of the legacy `text=` / `findNext`.
label_cell = soup.find('td', string='WSI_05')
print(label_cell.find_next('td').text)
# --> 0.01
1 голос
/ 10 марта 2020
import re

data = '<html xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:msxsl="urn:schemas-microsoft-com:xslt"> <head> <META http-equiv="Content-Type" content="text/html"> </head> <body style="margin:0px 0px 0px 0px;overflow:auto;background:#FFFFFF;"> <table style="font-family:Arial,Verdana,Times;font-size:12px;text-align:left;width:100;border-collapse:collapse;padding:3px 3px 3px 3px"> <tr style="text-align:center;font-weight:bold;background:#9CBCE2"> <td>1</td> </tr> <tr> <td> <table style="font-family:Arial,Verdana,Times;font-size:12px;text-align:left;width:100;border-spacing:0px; padding:3px 3px 3px 3px"> <tr> <td>Wweigh_mWSI</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>FID</td> <td>0</td> </tr> <tr> <td>BAS34S_ID</td> <td>1</td> </tr> <tr bgcolor="#D4E4F3"> <td>ROUT_AREA</td> <td>28</td> </tr> <tr> <td>COUNTRY_NR</td> <td>304</td> </tr> <tr bgcolor="#D4E4F3"> <td>AVL6190_KM</td> <td>0.00002</td> </tr> <tr> <td>TOTWWD95_K</td> <td>0</td> </tr> <tr bgcolor="#D4E4F3"> <td>WTA_95</td> <td>0</td> </tr> <tr> <td>WSI_01</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_02</td> <td>0.01</td> </tr> <tr> <td>WSI_03</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_04</td> <td>0.01</td> </tr> <tr> <td>WSI_05</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_06</td> <td>0.01</td> </tr> <tr> <td>WSI_07</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_08</td> <td>0.01</td> </tr> <tr> <td>WSI_09</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_10</td> <td>0.01</td> </tr> <tr> <td>WSI_11</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>WSI_12</td> <td>0.01</td> </tr> <tr> <td>Avg_mWSI</td> <td>0.01</td> </tr> <tr bgcolor="#D4E4F3"> <td>GeoM_mWSI</td> <td>0.01</td> </tr> <tr> <td>orig_WSI</td> <td>0.01</td> </tr> </table> </td> </tr> </table> </body> </html>'

# Non-greedy capture of the cell that directly follows the WSI_05 label cell.
wsi_05_pattern = re.compile('<td>WSI_05</td> <td>(.*?)</td>')
results = wsi_05_pattern.findall(data)
print(results)

Вывод:

['0.01']
...