You can use BeautifulSoup to check the <div> tag that holds the number of pages, and then it looks like you can just iterate over them. There may well be a better way to do this, but I just added another try/except to deal with the case where additional pages are found:
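In isolation, the pagination check is just a lookup of that div. A minimal sketch (the Year=1996 URL is only an example, and I'm assuming the site renders its page links inside a div with class 'pagenumbers', the last token being the highest page number; if the div is missing, find() returns None and this line raises, which is exactly what the try/except in the full script catches):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # some sites reject the default requests user agent
url = 'https://aviation-safety.net/database/dblist.php?Year=1996'  # example year URL

soup = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
pages = soup.find('div', {'class': 'pagenumbers'}).text.strip().split(' ')[-1]
print(int(pages))  # total number of result pages for that year

The full script: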
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

mainurl = "https://aviation-safety.net/database/"

def getAndParseURL(mainurl):
    result = requests.get(mainurl, headers=headers)
    soup = BeautifulSoup(result.content, 'html.parser')
    datatable = soup.find_all('a', href=True)
    return datatable

datatable = getAndParseURL(mainurl)

# go through the content and grab the year URLs
links = []
for link in datatable:
    if 'Year' in link['href']:
        url = link['href']
        links.append(mainurl + url)

# quick sanity check that the links made it into the dataframe
df = pd.DataFrame(links, columns=['url'])
print(df.head(10))

# empty dataframe for the results, and a list for urls that didn't pull a table
results_df = pd.DataFrame()
no_table = []

# loop through the URLs retrieved previously and append each table to results_df
for x in df['url']:
    # check for additional pages
    try:
        html = requests.get(x, headers=headers)
        soup = BeautifulSoup(html.text, 'html.parser')
        # the last token in the pagenumbers div is the highest page number
        pages = soup.find('div', {'class': 'pagenumbers'}).text.strip().split(' ')[-1]
        for page in range(1, int(pages) + 1):
            page_x = x + '&lang=&page=%s' % page
            try:
                html = requests.get(page_x, headers=headers).text  # <----- added headers
                # read_html parses the <table> tags and returns a list of dataframes;
                # the one we want is at position 0 (StringIO avoids the literal-string
                # deprecation warning on newer pandas)
                table = pd.read_html(StringIO(html))[0]
                results_df = pd.concat([results_df, table], sort=True).reset_index(drop=True)
                print('Processed: %s' % page_x)
            except:
                print('No table found: %s' % page_x)
                no_table.append(page_x)
    except:
        # no pagenumbers div, so the year's results fit on a single page
        try:
            html = requests.get(x, headers=headers).text  # <----- added headers
            table = pd.read_html(StringIO(html))[0]
            results_df = pd.concat([results_df, table], sort=True).reset_index(drop=True)
            print('Processed: %s' % x)
        except:
            print('No table found: %s' % x)
            no_table.append(x)

# keep only the columns of interest
results_df = results_df[['date', 'type', 'registration', 'operator', 'fat.', 'location', 'cat']]
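When the loop finishes, results_df holds every table that was found and no_table lists the URLs that didn't yield one, so you can retry those by hand. One thing to watch: DataFrame.append was removed in pandas 2.0, which is why the loop builds the frame with pd.concat; on older pandas, results_df = results_df.append(table, sort=True) does the same thing. To sanity-check and persist the output (the file name is just an example):

print(results_df.shape)  # rows x columns scraped
print(no_table)          # URLs to retry manually
results_df.to_csv('asn_database.csv', index=False)  # example output path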