Хорошо, вот моя попытка решить эту задачу (библиотека json используется только для красивого вывода словаря):
import json
from bs4 import BeautifulSoup
import requests
# Download the article and locate its infobox table.
url = "https://en.wikipedia.org/wiki/ABC_Studios"
r = requests.get(url)
soup = BeautifulSoup(r.text, "lxml")
tbl = soup.find("table", {"class": "infobox vcard"})
list_of_table_rows = tbl.find_all("tr")

# Map each row's header text to the flattened text of its data cell.
info = {}
for tr in list_of_table_rows:
    th = tr.find("th")
    td = tr.find("td")
    # A row needs both a header and a data cell to form a key/value pair;
    # without this guard a row that has <th> but no <td> raises
    # AttributeError when td is walked below.
    if th is None or td is None:
        continue
    parts = []
    for elem in td.descendants:
        if isinstance(elem, str):
            # Text node: keep its content without surrounding whitespace.
            parts.append(elem.strip())
        elif elem.name == "br":
            # A <br/> tag becomes an explicit line break in the value.
            parts.append("\n")
    innerText = "".join(parts)
    info[th.text] = innerText

# json is used only to pretty-print the resulting dictionary.
print(json.dumps(info, indent=1))
Код заменяет теги <br/> на \n, что даёт:
{
"Trading name": "ABC Studios",
"Type": "Subsidiary\nLimited liability company",
"Industry": "Television production",
"Predecessor": "Touchstone Television",
"Founded": "March\u00a021, 1985; 33 years ago(1985-03-21)",
"Headquarters": "Burbank, California,U.S.",
"Area served": "Worldwide",
"Key people": "Patrick Moran (President)",
"Parent": "ABC Entertainment Group\n(Disney\u2013ABC Television Group)",
"Website": "abcstudios.go.com"
}
Вы можете изменить код, если хотите получать списки вместо строк, содержащих символы \n:
# Split the collected cell text on line breaks; store a plain string when
# there is a single piece and a list of strings otherwise.
pieces = innerText.split("\n")
info[th.text] = pieces if len(pieces) >= 2 else pieces[0]
Что дает:
{
"Trading name": "ABC Studios",
"Type": [
"Subsidiary",
"Limited liability company"
],
"Industry": "Television production",
"Predecessor": "Touchstone Television",
"Founded": "March\u00a021, 1985; 33 years ago(1985-03-21)",
"Headquarters": "Burbank, California,U.S.",
"Area served": "Worldwide",
"Key people": "Patrick Moran (President)",
"Parent": [
"ABC Entertainment Group",
"(Disney\u2013ABC Television Group)"
],
"Website": "abcstudios.go.com"
}