Код (Python 3.6+, используются f-строки):
import urllib.parse
from collections import namedtuple
from datetime import datetime
import bs4
import requests
# Immutable record describing one hosting company row of the statistics table.
# The field names are listed in the same order the script fills them in.
HostingCompany = namedtuple(
    'HostingCompany',
    'name country websites usage usage_by_top update_time',
)
class MyIpLink:
    """A piece of link text together with its absolute URL.

    Attributes:
        text: The stripped text content of the tag passed to the constructor.
        url: The anchor's ``href`` resolved against ``url_base``, or ``None``
            when no anchor (or no ``href`` attribute) is available.
    """

    # Base against which relative ``href`` values are resolved.
    url_base = 'https://myip.ms'

    def __init__(self, tag: 'bs4.element.Tag', *, is_anchor=False):
        """Build the link from ``tag``.

        Args:
            tag: Either a container holding an ``<a>`` element or, when
                ``is_anchor`` is true, the ``<a>`` element itself.
            is_anchor: Treat ``tag`` as the anchor instead of searching
                inside it for the first ``<a>``.
        """
        # Only search for a nested anchor when `tag` is not the anchor itself.
        a_tag = tag if is_anchor else tag.find('a')
        self.text = tag.text.strip()

        # `find` returns None when there is no anchor, and an anchor may lack
        # `href`; record a missing URL instead of failing with an opaque
        # TypeError/KeyError.
        href = a_tag.get('href') if a_tag is not None else None
        if href is None:
            self.url = None
        else:
            self.url = urllib.parse.urljoin(self.url_base, href)

    def __repr__(self):
        return f'{self.__class__.__name__}(text={self.text!r}, url={self.url!r})'
# Download the myip.ms hosting statistics page and turn each table row into a
# HostingCompany record.
url = 'https://myip.ms/browse/web_hosting/World_Web_Hosting_Global_Statistics.html'

# requests applies no timeout by default, so a stalled server would hang the
# script indefinitely.
response = requests.get(url, timeout=30)
# Raise on 4xx/5xx responses instead of silently parsing an error page.
response.raise_for_status()
html = response.text
soup = bs4.BeautifulSoup(html, 'html.parser')

rows = soup.select('#web_hosting_tbl > tbody > tr')[::2]  # skips "more info" rows

companies = []
for row in rows:
    tds = row.find_all('td')
    name = MyIpLink(tds[1])
    country = MyIpLink(tds[2])
    # The websites cell may hold several anchors; wrap each one separately.
    websites = [MyIpLink(a, is_anchor=True) for a in tds[3].find_all('a')]
    usage = MyIpLink(tds[4])
    usage_by_top = MyIpLink(tds[5])
    update_time = datetime.strptime(tds[6].text.strip(), '%d %b %Y, %H:%M')
    company = HostingCompany(name, country, websites, usage, usage_by_top, update_time)
    companies.append(company)

import pprint

pprint.pprint(companies)

# Show a few fields of the first record; skipped when no rows were matched so
# an empty result does not end in an IndexError.
if companies:
    print(companies[0].name.text)
    print(companies[0].name.url)
    print(companies[0].country.text)
Вывод:
[HostingCompany(name=MyIpLink(text='Godaddy.com, LLC', url='https://myip.ms/view/web_hosting/2433/Godaddy_com_LLC.html'), country=MyIpLink(text='USA', url='https://myip.ms/view/best_hosting/USA/Best_Hosting_in_USA.html'), websites=[MyIpLink(text='www.godaddy.com', url='https://myip.ms/go.php?1229687315_ITg7Im93dCkWE0kNAhQSEh0FUeHq5Q==')], usage=MyIpLink(text='512,701 sites', url='https://myip.ms/browse/sites/1/ownerID/2433/ownerIDii/2433'), usage_by_top=MyIpLink(text='951 sites', url='https://myip.ms/browse/sites/1/rankii/100000/ownerID/2433/ownerIDii/2433'), update_time=datetime.datetime(2018, 5, 2, 5, 17)),
HostingCompany(name=MyIpLink(text='Cloudflare, Inc', url='https://myip.ms/view/web_hosting/4638/Cloudflare_Inc.html'), country=MyIpLink(text='USA', url='https://myip.ms/view/best_hosting/USA/Best_Hosting_in_USA.html'), websites=[MyIpLink(text='www.cloudflare.com', url='https://myip.ms/go.php?840626136_OiEsK2ROSxAdGl4QGhYJG+Tp6fnrv/f49w==')], usage=MyIpLink(text='488,119 sites', url='https://myip.ms/browse/sites/1/ownerID/4638/ownerIDii/4638'), usage_by_top=MyIpLink(text='16,160 sites', url='https://myip.ms/browse/sites/1/rankii/100000/ownerID/4638/ownerIDii/4638'), update_time=datetime.datetime(2018, 5, 2, 5, 10)),
HostingCompany(name=MyIpLink(text='Amazon.com, Inc', url='https://myip.ms/view/web_hosting/615/Amazon_com_Inc.html'), country=MyIpLink(text='USA', url='https://myip.ms/view/best_hosting/USA/Best_Hosting_in_USA.html'), websites=[MyIpLink(text='www.amazonaws.com', url='https://myip.ms/go.php?990446041_JyYhKGFxThMQHUMRHhcDExHj8vul7f75')], usage=MyIpLink(text='453,230 sites', url='https://myip.ms/browse/sites/1/ownerID/615/ownerIDii/615'), usage_by_top=MyIpLink(text='9,557 sites', url='https://myip.ms/browse/sites/1/rankii/100000/ownerID/615/ownerIDii/615'), update_time=datetime.datetime(2018, 5, 2, 5, 4)),
...
]
Godaddy.com, LLC
https://myip.ms/view/web_hosting/2433/Godaddy_com_LLC.html
USA
Вечером обновлю ответ и добавлю некоторые пояснения. Удачи!