import requests
import pandas as pd
from bs4 import BeautifulSoup
# Labels applied to the <td> cells of the first table on each player page.
# (Presumably the passing table, judging by the stat names -- confirm.)
COLUMNS = [
    'School',
    'Conf',
    'Class',
    'Pos',
    'G',
    'Cmp',
    'Att',
    'Pct',
    'Yds',
    'Y/A',
    'AY/A',
    'TD',
    'Int',
    'Rate',
]

# Labels applied to the <td> cells of the second table on each player page.
# NOTE: 'Yds', 'Avg' and 'TD' each appear three times, so a DataFrame built
# with these labels has duplicate column names.
COLUMNS2 = [
    'School',
    'Conf',
    'Class',
    'Pos',
    'G',
    'Att',
    'Yds',
    'Avg',
    'TD',
    'Rec',
    'Yds',
    'Avg',
    'TD',
    'Plays',
    'Yds',
    'Avg',
    'TD',
]

# Player pages to scrape; the player's name is later derived from the last
# path component of each URL.
urls = [
    'https://www.sports-reference.com/cfb/players/russell-wilson-1.html',
    'https://www.sports-reference.com/cfb/players/cam-newton-1.html',
    'https://www.sports-reference.com/cfb/players/peyton-manning-1.html',
]
# Scrape the first two HTML tables of every player page into DataFrames.


def _player_from_url(url):
    """Return a display name such as 'Russell Wilson' derived from *url*.

    The name comes from the sixth '/'-separated component of the URL
    (e.g. 'russell-wilson-1.html'): its first two '-'-separated tokens are
    title-cased and joined with a single space.

    Raises:
        IndexError: if the URL has fewer than six '/'-separated components.
    """
    slug = url.split('/')[5]
    return ' '.join(token.title() for token in slug.split('-')[:2])


def _table_to_frame(table, columns, player):
    """Convert one parsed <table> element into a DataFrame.

    Args:
        table: BeautifulSoup tag of the table to read.
        columns: labels for the data columns. Only the first
            ``len(columns)`` <td> cells of each row are kept, so the slice
            width always agrees with the number of labels.
        player: value stored in the 'Player' column, which is inserted as
            the first column.

    Returns:
        A DataFrame with one row per <tr> of the table. Rows without any
        <td> cells (presumably <th>-only header rows) are kept as empty
        rows on purpose: the rest of the script pairs the two tables by
        row position and filters the empty rows afterwards.
    """
    width = len(columns)
    records = [
        [cell.text for cell in row.find_all("td")[:width]]
        for row in table.find_all("tr")
    ]
    frame = pd.DataFrame(records, columns=columns)
    frame.insert(0, 'Player', player)
    return frame


dataframes = []
dataframes2 = []
for url in urls:
    print(url)
    # A timeout keeps an unresponsive server from hanging the run forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    # Look the tables up once; an IndexError below means the page exposed
    # fewer than two <table> tags to the parser.
    tables = soup.find_all('table')
    player = _player_from_url(url)
    cy_data = _table_to_frame(tables[0], COLUMNS, player)
    dataframes.append(cy_data)
    cy_data2 = _table_to_frame(tables[1], COLUMNS2, player)
    dataframes2.append(cy_data2)
# Stack the per-player frames of each table. reset_index() turns every row's
# position within its player's table into an 'index' column, which is used
# as a join key below.
data = pd.concat(dataframes).reset_index()
# The second-table frames live in dataframes2 (not dataframes).
data2 = pd.concat(dataframes2).reset_index()
# Pair rows of the two tables by player and row position. Overlapping column
# names from the second table get a trailing space as their suffix.
# NOTE(review): this assumes both tables list their rows in the same
# order with the same number of leading header rows -- confirm.
data3 = data.merge(data2, on=['index', 'Player'], suffixes=('',' '))
# Drop rows that carry no data from the first table (e.g. header rows, whose
# cells are all missing), then discard the helper 'index' column.
data3 = data3.loc[data3['School'].notnull()].drop('index', axis=1)
# display is not defined in this file; presumably the IPython/Jupyter
# built-in, so this script is expected to run inside a notebook.
display(data, data2, data3)