Python: анализировать несколько таблиц с веб-страницы и группировать данные в CSV - PullRequest
0 голосов
/ 30 мая 2020

Я совершенно новичок ie в Python и имею, как мне кажется, довольно сложную проблему. Я хотел бы проанализировать две таблицы с веб-сайта примерно на 80 URL-адресов, например, одной из страниц: https://www.sports-reference.com/cfb/players/sam-darnold-1.html

Мне нужна первая таблица «Проходит» и вторая таблица «Спешка и получение» для каждого из 80 URL-адресов (я знаю, как получить первую и вторую таблицу). Но проблема в том, что он мне нужен для всех 80 URL в одном CSV.

Это мой код и то, как выглядят данные:

import requests
import pandas as pd

COLUMNS = ['School', 'Conf', 'Class', 'Pos', 'G', 'Cmp', 'Att', 'Pct', 'Yds','Y/A', 'AY/A', 'TD', 'Int', 'Rate']

urls = ['https://www.sports-reference.com/cfb/players/russell-wilson-1.html',
        'https://www.sports-reference.com/cfb/players/cam-newton-1.html',
        'https://www.sports-reference.com/cfb/players/peyton-manning-1.html']
#scrape elements

dataframes = []
try:
    for url in urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        #print(soup)
        table = soup.find_all('table')[0] # Find the first "table" tag in the page
        rows = table.find_all("tr") 
        cy_data = []
        for row in rows:
            cells = row.find_all("td")
            cells = cells[0:14] 
            cy_data.append([cell.text for cell in cells]) # For each "td" tag, get the text inside it

        dataframes.append(pd.DataFrame(cy_data, columns=COLUMNS).drop(0, axis=0))
except:
    pass

data = pd.concat(dataframes)
data.to_csv('testcsv3.csv', sep=',') ```


+---+--+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+
|   |  |        School        |  Conf   | Class | Pos | G  | Cmp | Att  | Pct  |  Yds  | Y/A  | AY/A | TD  | Int | Rate  |
+---+--+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+
| 1 |  |                      |         |       |     |    |     |      |      |       |      |      |     |     |       |
| 2 |  | North Carolina State | ACC     | FR    | QB  | 11 | 150 |  275 | 54.5 |  1955 |  7.1 |  8.2 |  17 |   1 | 133.9 |
| 3 |  | North Carolina State | ACC     | SO    | QB  | 12 | 224 |  378 | 59.3 |  3027 |    8 |  8.3 |  31 |  11 | 147.8 |
| 4 |  | North Carolina State | ACC     | JR    | QB  | 13 | 308 |  527 | 58.4 |  3563 |  6.8 |  6.6 |  28 |  14 | 127.5 |
| 5 |  | Wisconsin            | Big Ten | SR    | QB  | 14 | 225 |  309 | 72.8 |  3175 | 10.3 | 11.8 |  33 |   4 | 191.8 |
| 6 |  | Overall              |         |       |     |    | 907 | 1489 | 60.9 | 11720 |  7.9 |  8.4 | 109 |  30 | 147.2 |
| 7 |  | North Carolina State |         |       |     |    | 682 | 1180 | 57.8 |  8545 |  7.2 |  7.5 |  76 |  26 | 135.5 |
| 8 |  | Wisconsin            |         |       |     |    | 225 |  309 | 72.8 |  3175 | 10.3 | 11.8 |  33 |   4 | 191.8 |
| 1 |  |                      |         |       |     |    |     |      |      |       |      |      |     |     |       |
| 2 |  | Florida              | SEC     | FR    | QB  |  5 |   5 |   10 |   50 |    40 |    4 |    4 |   0 |   0 |  83.6 |
| 3 |  | Florida              | SEC     | SO    | QB  |  1 |   1 |    2 |   50 |    14 |    7 |    7 |   0 |   0 | 108.8 |
| 4 |  | Auburn               | SEC     | JR    | QB  | 14 | 185 |  280 | 66.1 |  2854 | 10.2 | 11.2 |  30 |   7 |   182 |
| 5 |  | Overall              |         |       |     |    | 191 |  292 | 65.4 |  2908 |   10 | 10.9 |  30 |   7 | 178.2 |
| 6 |  | Florida              |         |       |     |    |   6 |   12 |   50 |    54 |  4.5 |  4.5 |   0 |   0 |  87.8 |
| 7 |  | Auburn               |         |       |     |    | 185 |  280 | 66.1 |  2854 | 10.2 | 11.2 |  30 |   7 |   182 |
+---+--+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+


And this is how I'd like the data to look, note the player name is missing from each grouping which ideally can be added from the sample website/url and I've added the second table which I need help appending:


+---+----------------+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+----------------------+---------+-------+-----+----+-----+-----+-----+----+
|   |                |        School        |  Conf   | Class | Pos | G  | Cmp | Att  | Pct  |  Yds  | Y/A  | AY/A | TD  | Int | Rate  |        School        |  Conf   | Class | Pos | G  | Att | Yds | Avg | TD |
+---+----------------+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+----------------------+---------+-------+-----+----+-----+-----+-----+----+
| 1 |                |                      |         |       |     |    |     |      |      |       |      |      |     |     |       |                      |         |       |     |    |     |     |     |    |
| 2 | Russell Wilson | North Carolina State | ACC     | FR    | QB  | 11 | 150 |  275 | 54.5 |  1955 |  7.1 |  8.2 |  17 |   1 | 133.9 | North Carolina State | ACC     | FR    | QB  | 11 | 150 | 467 | 6.7 |  3 |
| 3 | Russell Wilson | North Carolina State | ACC     | SO    | QB  | 12 | 224 |  378 | 59.3 |  3027 |    8 |  8.3 |  31 |  11 | 147.8 | North Carolina State | ACC     | SO    | QB  | 12 | 129 | 300 | 6.8 |  2 |
| 4 | Russell Wilson | North Carolina State | ACC     | JR    | QB  | 13 | 308 |  527 | 58.4 |  3563 |  6.8 |  6.6 |  28 |  14 | 127.5 | North Carolina State | ACC     | JR    | QB  | 13 | 190 | 560 | 7.1 |  5 |
| 5 | Russell Wilson | Wisconsin            | Big Ten | SR    | QB  | 14 | 225 |  309 | 72.8 |  3175 | 10.3 | 11.8 |  33 |   4 | 191.8 | Wisconsin            | Big Ten | SR    | QB  | 14 | 210 | 671 | 7.3 |  7 |
| 6 | Russell Wilson | Overall              |         |       |     |    | 907 | 1489 | 60.9 | 11720 |  7.9 |  8.4 | 109 |  30 | 147.2 | Overall              |         |       |     |    |     |     |     |    |
| 7 | Russell Wilson | North Carolina State |         |       |     |    | 682 | 1180 | 57.8 |  8545 |  7.2 |  7.5 |  76 |  26 | 135.5 | North Carolina State |         |       |     |    |     |     |     |    |
| 8 | Russell Wilson | Wisconsin            |         |       |     |    | 225 |  309 | 72.8 |  3175 | 10.3 | 11.8 |  33 |   4 | 191.8 | Wisconsin            |         |       |     |    |     |     |     |    |
| 1 |                |                      |         |       |     |    |     |      |      |       |      |      |     |     |       |                      |         |       |     |    |     |     |     |    |
| 2 | Cam Newton     | Florida              | SEC     | FR    | QB  |  5 |   5 |   10 |   50 |    40 |    4 |    4 |   0 |   0 |  83.6 | Florida              | SEC     | FR    | QB  |  5 | 210 | 456 | 7.1 |  2 |
| 3 | Cam Newton     | Florida              | SEC     | SO    | QB  |  1 |   1 |    2 |   50 |    14 |    7 |    7 |   0 |   0 | 108.8 | Florida              | SEC     | SO    | QB  |  1 | 212 | 478 | 4.5 |  5 |
| 4 | Cam Newton     | Auburn               | SEC     | JR    | QB  | 14 | 185 |  280 | 66.1 |  2854 | 10.2 | 11.2 |  30 |   7 |   182 | Auburn               | SEC     | JR    | QB  | 14 | 219 | 481 | 6.7 |  6 |
| 5 | Cam Newton     | Overall              |         |       |     |    | 191 |  292 | 65.4 |  2908 |   10 | 10.9 |  30 |   7 | 178.2 | Overall              |         |       |     |    |     |     | 3.4 |  7 |
| 6 | Cam Newton     | Florida              |         |       |     |    |   6 |   12 |   50 |    54 |  4.5 |  4.5 |   0 |   0 |  87.8 | Florida              |         |       |     |    |     |     |     |    |
| 7 | Cam Newton     | Auburn               |         |       |     |    | 185 |  280 | 66.1 |  2854 | 10.2 | 11.2 |  30 |   7 |   182 | Auburn               |         |       |     |    |     |     |     |    |
+---+----------------+----------------------+---------+-------+-----+----+-----+------+------+-------+------+------+-----+-----+-------+----------------------+---------+-------+-----+----+-----+-----+-----+----+

So basically I'd wanna append the second table (Only the columns mentioned) to the end of the first table and add the player name (read from the URL) to each row




1 Ответ

1 голос
/ 30 мая 2020
import requests
import pandas as pd
from bs4 import BeautifulSoup

COLUMNS = ['School', 'Conf', 'Class', 'Pos', 'G', 'Cmp', 'Att', 'Pct', 'Yds','Y/A', 'AY/A', 'TD', 'Int', 'Rate']
COLUMNS2 = ['School', 'Conf', 'Class', 'Pos', 'G', 'Att', 'Yds','Avg', 'TD', 'Rec', 'Yds', 'Avg', 'TD', 'Plays', 'Yds', 'Avg', 'TD']

urls = ['https://www.sports-reference.com/cfb/players/russell-wilson-1.html',
        'https://www.sports-reference.com/cfb/players/cam-newton-1.html',
        'https://www.sports-reference.com/cfb/players/peyton-manning-1.html']
#scrape elements

dataframes = []
dataframes2 = []
for url in urls:
    a = url
    print(a)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    #print(soup)
    table = soup.find_all('table')[0] # Find the first "table" tag in the page
    rows = table.find_all("tr") 
    cy_data = []
    for row in rows:
        cells = row.find_all("td")
        cells = cells[0:14] 
        cy_data.append([cell.text for cell in cells]) # For each "td" tag, get the text inside it
    cy_data = pd.DataFrame(cy_data, columns=COLUMNS)

    #Create player column in first column and derive the player from the URL
    cy_data.insert(0, 'Player', url)
    cy_data['Player'] = cy_data['Player'].str.split('/').str[5].str.split('-').str[0].str.title() + ' ' + cy_data['Player'].str.split('/').str[5].str.split('-').str[1].str.title()
    dataframes.append(cy_data)

    table2 = soup.find_all('table')[1] # Find the second "table" tag in the page
    rows2 = table2.find_all("tr") 
    cy_data2 = []
    for row2 in rows2:
        cells2 = row2.find_all("td")
        cells2 = cells2[0:14] 
        cy_data2.append([cell.text for cell in cells2]) # For each "td" tag, get the text inside it
    cy_data2 = pd.DataFrame(cy_data2, columns=COLUMNS2)
    cy_data2.insert(0, 'Player', url)
    cy_data2['Player'] = cy_data2['Player'].str.split('/').str[5].str.split('-').str[0].str.title() + ' ' + cy_data2['Player'].str.split('/').str[5].str.split('-').str[1].str.title()
    dataframes2.append(cy_data2)

data = pd.concat(dataframes).reset_index()
data2 = pd.concat(dataframes).reset_index()
data3 = data.merge(data2, on=['index', 'Player'], suffixes=('',' '))
#Filter on None rows
data3 = data3.loc[data3['School'].notnull()].drop('index', axis=1)
display(data, data2, data3)
...