I am trying to scrape the links from https://www.usyouthsoccer.org/clubs/club-directory/. Initially the code broke on the 30th link, so I tried to handle the exception with urllib's HTTPError. Now the script simply stops at the 30th link. I checked that particular URL and it is indeed a dead link. I just want to skip past it and keep going through the loop, but I'm having trouble working around it. Any suggestions would be appreciated...
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
import pandas as pd
from urllib.request import Request, urlopen
from urllib.error import HTTPError
executable_path = {"executable_path": "chromedriver"}
browser = Browser("chrome", **executable_path, headless=True)
url = 'https://www.usyouthsoccer.org/clubs/club-directory/'
zipcode_input = 'CT_Main_0$txtLocation'
search_button = '//*[@id="CT_Main_0_btnSearch"]'
dropdown = '//*[@id="CT_Main_0_drpMiles"]/option[5]'
zip_codes = [64015]
team_df = pd.DataFrame()
for x in zip_codes:
    try:
        print(f'\n{x}\n')
        url = 'https://www.usyouthsoccer.org/clubs/club-directory/'
        browser.visit(url)
        browser.fill(zipcode_input, x)
        browser.find_by_xpath(dropdown).click()
        browser.find_by_xpath(search_button).click()
        html = browser.html
        soup = bs(html, 'html.parser')
        dallas_urls = soup.find_all(class_="more")

        counter = 1
        for url in dallas_urls:
            print(f'Link {counter} of {len(dallas_urls)}')
            counter += 1
            back_url = url['href']
            front_url = 'https://www.usyouthsoccer.org'
            total_url = front_url + back_url
            browser.visit(total_url)
            my_html = pd.read_html(total_url)

            details_pd = pd.DataFrame(my_html[0])
            details_pd.columns = ['Cols', 'Vals']
            df = details_pd.T
            df.columns = df.iloc[0]
            df.drop('Cols', inplace=True)

            contacts_pd = pd.DataFrame(my_html[1])
            if len(contacts_pd.index) == 1:
                df['Contact_Title'] = contacts_pd.iloc[0, 0]
                df['Contact_Name'] = contacts_pd.iloc[0, 1]
                df['Contact_Email'] = contacts_pd.iloc[0, 2]
            elif len(contacts_pd.index) == 2:
                df['Contact_Title'] = contacts_pd.iloc[0, 0]
                df['Contact_Name'] = contacts_pd.iloc[0, 1]
                df['Contact_Email'] = contacts_pd.iloc[0, 2]
                df['Contact_Title2'] = contacts_pd.iloc[1, 0]
                df['Contact_Name2'] = contacts_pd.iloc[1, 1]
                df['Contact_Email2'] = contacts_pd.iloc[1, 2]
            elif len(contacts_pd.index) == 3:
                df['Contact_Title'] = contacts_pd.iloc[0, 0]
                df['Contact_Name'] = contacts_pd.iloc[0, 1]
                df['Contact_Email'] = contacts_pd.iloc[0, 2]
                df['Contact_Title2'] = contacts_pd.iloc[1, 0]
                df['Contact_Name2'] = contacts_pd.iloc[1, 1]
                df['Contact_Email2'] = contacts_pd.iloc[1, 2]
                df['Contact_Title3'] = contacts_pd.iloc[2, 0]
                df['Contact_Name3'] = contacts_pd.iloc[2, 1]
                df['Contact_Email3'] = contacts_pd.iloc[2, 2]

            team_df = pd.concat([team_df, df])
    except HTTPError as err:
        continue
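
What I have in mind (but can't quite get working) is handling the error per link rather than per zip code, so one dead club page doesn't abort the rest of the results. Here is a rough sketch of what I mean, assuming the HTTPError is actually raised by pd.read_html(total_url) when it hits the dead page:

# Rough sketch, not working code: the try/except is moved inside the per-link loop,
# assuming pd.read_html(total_url) is what raises the HTTPError on the dead page.
for url in dallas_urls:
    back_url = url['href']
    total_url = 'https://www.usyouthsoccer.org' + back_url
    try:
        browser.visit(total_url)
        my_html = pd.read_html(total_url)
    except HTTPError:
        print(f'Skipping bad link: {total_url}')
        continue  # skip only this one club page; the loop keeps going
    # ... build df from my_html[0] and my_html[1] exactly as above ...
    team_df = pd.concat([team_df, df])

The idea is that `continue` would then only skip the single bad club page instead of jumping straight to the next zip code. Is that the right way to structure it?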