Сложность в том, что сайт использует cookie-файлы Google Analytics, а модуль requests сам их не получает, поэтому подставить их в заголовки запросов не удаётся. Однако эти cookie можно получить, войдя в систему с помощью Selenium. Получив таким образом файлы cookie, вы сможете использовать их с модулем requests
для перебора страниц.
Я не до конца разобрался, как обходить всплывающую рекламу (поэтому иногда скрипт сработает сразу, а иногда его придётся запустить ещё раз), но, похоже, после успешного первоначального входа в систему всё работает. Поскольку нужно переходить по ссылке каждого игрока, обход всего списка из 375 игроков за 2010–2020 гг. занимает около 2–3 минут:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
# Start a Chrome session driven by a local chromedriver binary.
# NOTE(review): recent Selenium 4 releases expect the driver path to be passed
# through a Service object instead of positionally -- confirm against the
# Selenium version actually installed.
driver = webdriver.Chrome("C:/chromedriver_win32/chromedriver.exe")

# Use Selenium to login and get all cookies
loginURL = 'https://www.spotrac.com/signin/'
username = 'xxxxxx'
password = 'xxxxxx'

driver.get(loginURL)

# Dismiss the cookie-consent banner if it shows up within 5 s
# (polled every 0.25 s); its absence is not an error.
try:
    accept_cookie = WebDriverWait(driver, 5, 0.25).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, '.cookie-alert-accept'))
    )
except TimeoutException:
    print("no alert")
else:
    accept_cookie.click()
    print("Cookies accepted")

# Close the popup (element with class "cls-btn") if it appears within 5 s.
try:
    popup = WebDriverWait(driver, 5, 0.25).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, '.cls-btn'))
    )
except TimeoutException:
    print("timed out")
else:
    popup.click()

# Fixed pause before touching the form; presumably gives the page time to
# settle after the overlays are dismissed.
time.sleep(5)

# find_element(By.NAME, ...) is used because the find_element_by_* helpers
# were removed in Selenium 4.3; this spelling also works on Selenium 3.
driver.find_element(By.NAME, "email").send_keys(username)
driver.find_element(By.NAME, "password").send_keys(password)
submit = WebDriverWait(driver, 100).until(
    EC.visibility_of_element_located((By.XPATH, '//*[@id="contactForm"]/div[2]/input'))
)
submit.click()
print('Logged in!')
# Now that the browser session is logged in, walk the season breakdown pages
# with the same driver and collect one profile link per player.
#
# The mapping is created once, before the loop, so that players accumulate
# across all seasons. Re-creating it inside the loop would keep only the
# players of the last season visited (2010).
playerDict = {}
for seas in range(2020, 2009, -1):
    print(seas)
    url = 'https://www.spotrac.com/nba/contracts/breakdown/%s/' % seas
    driver.get(url)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for cell in soup.find_all('td', {'class': 'player'}):
        anchor = cell.find('a')
        # Skip cells that carry no usable link instead of crashing on them.
        if anchor is None or not anchor.get('href'):
            continue
        # Keep the first link seen for a name; later seasons do not overwrite it.
        playerDict.setdefault(anchor.text, anchor['href'])
# Matches a season span such as "2014-2015" inside a contract heading.
_SEASON_SPAN = re.compile(r"\d\d\d\d-\d\d\d\d")


def _contract_label(dfs, idx):
    """Return the 'Contract Years' label for the summary table at ``dfs[idx]``.

    The label is read from the first cell of the table two positions earlier.
    When that table does not exist, its first cell is not text, or the text
    holds no ``YYYY-YYYY`` span, the label falls back to 'Current Contract'.
    """
    # Without this guard idx - 2 would go negative and silently pick a table
    # from the END of the list.
    if idx < 2:
        return 'Current Contract'
    try:
        heading = dfs[idx - 2].iloc[0, 0]
        year = _SEASON_SPAN.findall(heading)[0]
    except (IndexError, TypeError):
        # IndexError: empty table or no season span; TypeError: non-text cell.
        return 'Current Contract'
    return year + ' ' + heading.split(year)[-1]


def _contract_rows(dfs, player):
    """Return one single-row DataFrame per contract summary table in ``dfs``.

    A contract summary is recognised purely by shape: 2 columns by 5 rows.
    Each one is transposed so the first column becomes the header, and is
    tagged with the player name and the contract label.
    """
    rows = []
    for idx, table in enumerate(dfs):
        if len(table.columns) != 2 or len(table) != 5:
            continue
        temp_df = table.T
        temp_df.columns = temp_df.iloc[0]
        # Normalise the spelling variants of the average-salary header.
        temp_df = temp_df.rename(columns={'Average Salary:':'Avg. Salary:','Avg Salary:':'Avg. Salary:'})
        temp_df['Player'] = player
        temp_df['Contract Years'] = _contract_label(dfs, idx)
        # Row 0 is the header row that was just promoted to column names.
        rows.append(temp_df.iloc[1:])
    return rows


# Visit every player page and gather the contract rows. Frames are collected
# in a list and concatenated once: DataFrame.append was removed in pandas 2.0
# and repeated appends copy the whole frame each time.
frames = []
try:
    for count, (player, link) in enumerate(playerDict.items(), start=1):
        driver.get(link)
        try:
            dfs = pd.read_html(driver.page_source)
        except ValueError:
            # read_html raises ValueError when the page contains no tables.
            dfs = []
        frames.extend(_contract_rows(dfs, player))
        print('%03d of %d - %s data aquired...' % (count, len(playerDict), player))
finally:
    # quit() ends the whole browser session, even if a page failed to load.
    driver.quit()

results = pd.concat(frames, sort=False, ignore_index=True) if frames else pd.DataFrame()
Вывод:
# Print the first 25 scraped contract rows as one plain-text table.
preview = results.head(25)
print(preview.to_string())
0 Contract: Signing Bonus: Avg. Salary: Signed Using: Free Agent: Player Contract Years
0 2 yr(s) / $48,500,000 - $24,250,000 Bird 2016 / UFA Kobe Bryant 2014-2015
1 3 yr(s) / $83,547,447 - $27,849,149 Bird 0 / Kobe Bryant 2011-2013
2 7 yr(s) / $136,434,375 - $19,490,625 NaN 2011 / UFA Kobe Bryant 2004-2010
3 5 yr(s) / $56,255,000 - $11,251,000 NaN 2004 / UFA Kobe Bryant 1999-2003
4 3 yr(s) / $3,501,240 - $1,167,080 NaN 0 / Kobe Bryant 1996-1998 Entry Level
5 2 yr(s) / $2,751,688 - $1,375,844 Minimum 2014 / UFA Rashard Lewis 2012-2013
6 1 yr(s) / $13,765,000 - $13,765,000 NaN 0 / Rashard Lewis 2012-2012
7 6 yr(s) / $118,200,000 - $19,700,000 NaN 2013 / UFA Rashard Lewis 2007-2012
8 4 yr(s) / $32,727,273 - $8,181,818 NaN 2007 / UFA Rashard Lewis 2003-2006
9 3 yr(s) / $14,567,141 - $4,855,714 NaN 2003 / UFA Rashard Lewis 2000-2002
10 2 yr(s) / $672,500 - $336,250 NaN 2000 / RFA Rashard Lewis 1998-1999 Entry Level
11 2 yr(s) / $10,850,000 - $5,425,000 Bird 2017 / UFA Tim Duncan 2015-2016
12 3 yr(s) / $30,361,446 - $10,120,482 Bird 2015 / UFA Tim Duncan 2012-2014
13 4 yr(s) / $40,000,000 - $10,000,000 NaN 2012 / UFA Tim Duncan 2010-2011
14 7 yr(s) / $122,007,706 - $17,429,672 NaN 2010 / UFA Tim Duncan 2003-2009
15 3 yr(s) / $31,902,500 - $10,634,167 NaN 2003 / UFA Tim Duncan 2000-2002
16 3 yr(s) / $10,239,080 - $3,413,027 NaN 2000 / UFA Tim Duncan 1997-1999 Entry Level
17 2 yr(s) / $16,500,000 - $8,250,000 Bird 2017 / UFA Kevin Garnett 2015-2016
18 3 yr(s) / $36,000,000 - $12,000,000 Bird 2015 / UFA Kevin Garnett 2012-2014
19 3 yr(s) / $51,300,000 - $17,100,000 NaN 2012 / UFA Kevin Garnett 2009-2011
20 5 yr(s) / $100,000,000 - $20,000,000 NaN 2009 / UFA Kevin Garnett 2004-2008
21 6 yr(s) / $126,016,300 - $21,002,717 NaN 0 / Kevin Garnett 1998-2003
22 3 yr(s) / $5,397,120 - $1,799,040 Rookie 0 / Kevin Garnett 1995-1997 Entry Level
23 1 yr(s) / $1,308,506 - $1,308,506 NaN 2012 / UFA Michael Redd Current Contract
24 6 yr(s) / $90,100,000 - $15,016,667 NaN 2011 / UFA Michael Redd 2005-2010
....