Можно просто воспользоваться функцией read_html() из pandas.
Под капотом она разбирает теги <table> в HTML с помощью парсера (lxml или BeautifulSoup).
import pandas as pd
# Page whose first HTML <table> holds the per-state figures.
url = 'https://www.theguardian.com/world/ng-interactive/2020/apr/13/coronavirus-map-us-latest-covid-19-cases-state-by-state'

# read_html returns a list of DataFrames (one per <table> found); only the
# first one is used here.
tables = pd.read_html(url)

# Drop every column that contains at least one missing value.
df = tables[0].dropna(axis=1)

# Materialize the column as a plain list so it prints as a Python list.
state_names = list(df['State/territory'])
print(state_names)
Вывод:
['New York', 'New Jersey', 'Massachusetts', 'Michigan', 'California', 'Pennsylvania', 'Illinois', 'Florida', 'Louisiana', 'Texas', 'Georgia', 'Connecticut', 'Washington', 'Maryland', 'Indiana', 'Colorado', 'Ohio', 'Virginia', 'Tennessee', 'North Carolina', 'Missouri', 'Alabama', 'Arizona', 'Wisconsin', 'South Carolina', 'Rhode Island', 'Mississippi', 'Nevada', 'Utah', 'Kentucky', 'Oklahoma', 'District of Columbia', 'Delaware', 'Iowa', 'Minnesota', 'Oregon', 'Arkansas', 'Kansas', 'Idaho', 'New Mexico', 'South Dakota', 'New Hampshire', 'Puerto Rico', 'Nebraska', 'Maine', 'Vermont', 'West Virginia', 'Hawaii', 'Montana', 'North Dakota', 'Alaska', 'Wyoming', 'Guam', 'Virgin Islands']