Итак, найденный мной обходной путь приведён ниже. Я всё ещё получаю дубликаты идентификаторов игр в итоговом наборе данных, но, по крайней мере, я перебираю весь нужный набор (все годы и недели) и получаю все ссылки. Затем, в самом конце, я удаляю дубликаты.
import requests, re, json
from bs4 import BeautifulSoup
import csv
import pandas as pd
# Seasons and regular-season weeks to scrape (kept as strings because they are
# substituted directly into the scoreboard URL).
years = ['2015', '2016', '2017', '2018']
weeks = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14']

# URL template for one weekly scoreboard page.
SCOREBOARD_URL = (
    'https://www.espn.com/college-football/scoreboard/_/year/{year}'
    '/seasontype/2/week/{week}'
)
# Name of the JavaScript variable whose value is the scoreboard JSON payload.
SCOREBOARD_MARKER = 'window.espn.scoreboardData'
# Destination of the de-duplicated result.
OUTPUT_CSV = r'all_years_deduped.csv'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30


def parse_scoreboard_json(script_text):
    """Extract the scoreboard payload from the text of a ``<script>`` tag.

    The payload is the first JSON object that follows the
    ``window.espn.scoreboardData`` marker.  ``raw_decode`` is used instead of
    a regular expression so that a ``};`` inside a JSON string cannot
    truncate the payload, and so that trailing JavaScript after the object
    is ignored.

    Args:
        script_text: Raw text content of a script element.

    Returns:
        The decoded JSON value (a dict for a well-formed payload), or
        ``None`` when the marker is absent or no valid JSON object follows it.
    """
    marker_at = script_text.find(SCOREBOARD_MARKER)
    if marker_at == -1:
        return None
    brace_at = script_text.find('{', marker_at)
    if brace_at == -1:
        return None
    try:
        scoreboard, _end = json.JSONDecoder().raw_decode(script_text, brace_at)
    except json.JSONDecodeError:
        return None
    return scoreboard


def extract_gamecast_links(scoreboard):
    """Collect ``(event name, Gamecast href)`` pairs from a scoreboard payload.

    Args:
        scoreboard: Decoded scoreboard dict; its ``'events'`` entry is a list
            of dicts, each with a ``'name'`` and a list of ``'links'``.

    Returns:
        A list of ``(name, href)`` tuples in payload order.  Events without a
        link whose text is ``'Gamecast'`` are skipped, so a link from a
        previous event is never attributed to them.  If an event has several
        Gamecast links, the last one is used.
    """
    pairs = []
    for event in scoreboard.get('events', []):
        gamecast = None  # reset per event so a stale link cannot leak through
        for link in event.get('links', []):
            if link.get('text') == 'Gamecast':
                gamecast = link['href']
        if gamecast is not None:
            pairs.append((event['name'], gamecast))
    return pairs


def build_links_frame(rows):
    """Build the de-duplicated ``Teams``/``Link`` table.

    Args:
        rows: Iterable of ``(name, href)`` pairs, possibly with repeats.

    Returns:
        A DataFrame with columns ``['Teams', 'Link']`` holding the first
        occurrence of every distinct row, with a contiguous 0-based index.
    """
    frame = pd.DataFrame(list(rows), columns=['Teams', 'Link'])
    return frame.drop_duplicates(keep='first').reset_index(drop=True)


def fetch_week_links(year, week):
    """Download one weekly scoreboard page and return its Gamecast links.

    Args:
        year: Season year as a string, e.g. ``'2017'``.
        week: Week number as a string, e.g. ``'3'``.

    Returns:
        A list of ``(name, href)`` tuples; empty when the page has no
        ``<head>`` or no parsable scoreboard payload (best-effort: such a
        page contributes no rows rather than aborting the whole run).

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = SCOREBOARD_URL.format(year=year, week=week)
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, 'html.parser')
    head = soup.find('head')
    if head is None:
        return []
    pairs = []
    for script in head.find_all('script'):
        scoreboard = parse_scoreboard_json(script.text)
        if scoreboard is not None:
            pairs.extend(extract_gamecast_links(scoreboard))
    return pairs


def main():
    """Scrape every configured year/week and write the de-duplicated CSV."""
    # Accumulate plain tuples and build the DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0, and re-appending a
    # cumulative dict on every iteration is what produced the duplicates.
    rows = []
    for year in years:
        for week in weeks:
            rows.extend(fetch_week_links(year, week))
    # Save dataframe with all links to csv for future use
    data_test = build_links_frame(rows)
    data_test.to_csv(OUTPUT_CSV)


if __name__ == '__main__':
    main()