Сократите время очистки с помощью селена и параллельных фьючерсов - PullRequest
0 голосов
/ 11 июля 2020

Я новичок в параллельных фьючерсах и мне действительно нужна помощь в правильной настройке. Я пытаюсь очистить страницы бейсбольной статистики и хотел бы ускорить процесс. У меня есть код, который работает без ошибок; однако кажется, что работает только один экземпляр, и он не совсем использует многопоточность, поскольку время обработки вообще не изменилось с обычного парсинга на многопоточность. Кроме того, я также вижу, что открыт только один браузер (не уверен, что это должно измениться или как это работает в фоновом режиме). Любая помощь будет принята с благодарностью!

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import requests
import concurrent.futures

MAX_THREADS=30
t0 = time.time()
browser = webdriver.Chrome()

player_urls = ['http://www.fangraphs.com/statsplits.aspx?playerid=5677&position=C&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=5677&position=C&season=0&split=0.5',
            'http://www.fangraphs.com/statsplits.aspx?playerid=7077&position=C&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=7077&position=C&season=0&split=0.5',
            'http://www.fangraphs.com/statsplits.aspx?playerid=9256&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=9256&position=OF&season=0&split=0.5',
            'http://www.fangraphs.com/statsplits.aspx?playerid=11270&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=11270&position=OF&season=0&split=0.5',
            'http://www.fangraphs.com/statsplits.aspx?playerid=4087&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=4087&position=OF&season=0&split=0.5',
            'http://www.fangraphs.com/statsplits.aspx?playerid=5297&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=5297&position=OF&season=0&split=0.5',
            'http://www.fangraphs.com/statsplits.aspx?playerid=15640&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=15640&position=OF&season=0&split=0.5',
            'http://www.fangraphs.com/statsplits.aspx?playerid=15496&position=SS&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=15496&position=SS&season=0&split=0.5',
            'http://www.fangraphs.com/statsplits.aspx?playerid=19844&position=3B&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=19844&position=3B&season=0&split=0.5',
            'http://www.fangraphs.com/statsplits.aspx?playerid=13769&position=2B/SS&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=13769&position=2B/SS&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=10950&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=10950&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=15082&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=15082&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=19878&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=19878&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=1904&position=1B&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=1904&position=1B&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=8027&position=1B/DH&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=8027&position=1B/DH&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=9362&position=C&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=9362&position=C&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=9682&position=2B/3B/SS&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=9682&position=2B/3B/SS&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=14106&position=SS&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=14106&position=SS&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=19287&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=19287&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=639&position=3B&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=639&position=3B&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=10294&position=C&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=10294&position=C&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=11338&position=2B/3B&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=11338&position=2B/3B&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=16246&position=1B/DH&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=16246&position=1B/DH&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=14109&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=14109&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=3336&position=2B/3B&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=3336&position=2B/3B&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=7125&position=C&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=7125&position=C&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=6310&position=SS&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=6310&position=SS&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=15937&position=SS&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=15937&position=SS&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=3371&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=3371&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=7476&position=C&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=7476&position=C&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=17678&position=3B&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=17678&position=3B&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=7223&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=7223&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=12649&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=12649&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=15670&position=3B/OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=15670&position=3B/OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=17276&position=C&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=17276&position=C&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=5411&position=3B&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=5411&position=3B&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=13877&position=2B/3B/SS&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=13877&position=2B/3B/SS&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=5305&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=5305&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=2090&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=2090&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=17027&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=17027&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=5133&position=SS&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=5133&position=SS&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=5248&position=2B&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=5248&position=2B&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=14267&position=C&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=14267&position=C&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=8848&position=C&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=8848&position=C&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=15722&position=SS/OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=15722&position=SS/OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=15518&position=SS&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=15518&position=SS&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=6265&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=6265&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=10847&position=SS&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=10847&position=SS&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=1488&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=1488&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=17901&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=17901&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=14942&position=C&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=14942&position=C&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=5386&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=5386&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=1159&position=3B/SS&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=1159&position=3B/SS&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=13132&position=C&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=13132&position=C&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=13414&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=13414&position=OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=14196&position=2B/3B/OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=14196&position=2B/3B/OF&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=12779&position=3B&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=12779&position=3B&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=6444&position=3B&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=6444&position=3B&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=3972&position=SS&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=3972&position=SS&season=0&split=0.5', 'http://www.fangraphs.com/statsplits.aspx?playerid=2918&position=OF&season=0&split=0.6', 'http://www.fangraphs.com/statsplits.aspx?playerid=2918&position=OF&season=0&split=0.5']

def get_stats(url):
    browser.get(url)

    standard = browser.find_element_by_css_selector("#standard").get_attribute('outerHTML')
    soup = BeautifulSoup(standard, 'html.parser')
    dfStandard = pd.read_html(str(soup))[0]

    advanced = browser.find_element_by_css_selector("#advanced").get_attribute('outerHTML')
    soup = BeautifulSoup(advanced, 'html.parser')
    dfAdvanced = pd.read_html(str(soup))[0]

    battedBall = browser.find_element_by_css_selector("#batted-ball").get_attribute('outerHTML')
    soup = BeautifulSoup(battedBall, 'html.parser')
    dfbattedBall = pd.read_html(str(soup))[0]

    dfinitial = pd.merge(dfStandard, dfAdvanced, on=['Season', 'Handedness'], how='inner')
    final = pd.merge(dfinitial, dfbattedBall, on=['Season', 'Handedness'], how='inner')
    final.drop(['AVG_y'], axis=1, inplace=True)
    
def get_pages(urls):
    threads = min(MAX_THREADS, len(player_urls))

    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        executor.map(get_stats, urls)

def main(urls):
    t0 = time.time()
    get_pages(player_urls)
    t1 = time.time()
    print(f"{t1-t0} seconds to download data.")

main(player_urls)

t1 = time.time()
print(t1-t0)
browser.quit()
...