Кто-нибудь знает, что происходит? Я собираюсь выбросить свой компьютер в окно. Красивый суп отлично работает с той же страницей, поэтому я знаю, что это не связь. И я пробовал помещать WebDriverWait
и time.sleep(10)
между каждой строкой - после задержки возникает такая же ошибка.
from bs4 import BeautifulSoup
import requests
import time, os
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
chromedriver = "/usr/bin/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
i = 2
driver = webdriver.Chrome(chromedriver)
driver.get('https://www.boxofficemojo.com/year/2020/?ref_=bo_yl_table_1')
wait = WebDriverWait(driver, 10)
wait
while i < 269:
wait
driver.find_element_by_xpath('//*[@id="table"]/div/table[2]/tbody/tr[{}]/td[2]/a'.format(i)).click()
wait
get_movie_dict(driver.current_url)
wait
i += 1
wait
driver.back()
Я получаю следующие ошибки:
ConnectionRefusedError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/urllib3/connection.py in _new_conn(self)
156 conn = connection.create_connection(
--> 157 (self._dns_host, self.port), self.timeout, **extra_kw
158 )
~/anaconda3/lib/python3.7/site-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
83 if err is not None:
---> 84 raise err
85
~/anaconda3/lib/python3.7/site-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
73 sock.bind(source_address)
---> 74 sock.connect(sa)
75 return sock
ConnectionRefusedError: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
NewConnectionError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
671 headers=headers,
--> 672 chunked=chunked,
673 )
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
375 try:
--> 376 self._validate_conn(conn)
377 except (SocketTimeout, BaseSSLError) as e:
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _validate_conn(self, conn)
993 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
--> 994 conn.connect()
995
~/anaconda3/lib/python3.7/site-packages/urllib3/connection.py in connect(self)
299 # Add certificate verification
--> 300 conn = self._new_conn()
301 hostname = self.host
~/anaconda3/lib/python3.7/site-packages/urllib3/connection.py in _new_conn(self)
168 raise NewConnectionError(
--> 169 self, "Failed to establish a new connection: %s" % e
170 )
NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x7f42d8fec710>: Failed to establish a new connection: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
719 retries = retries.increment(
--> 720 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
721 )
~/anaconda3/lib/python3.7/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
435 if new_retry.is_exhausted():
--> 436 raise MaxRetryError(_pool, url, error or ResponseError(cause))
437
MaxRetryError: HTTPSConnectionPool(host='www.boxofficemojo.comhttps', port=443): Max retries exceeded with url: //www.boxofficemojo.com/release/rl1182631425/?ref_=bo_yld_table_1 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f42d8fec710>: Failed to establish a new connection: [Errno 111] Connection refused'))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-55-0dd26218976b> in <module>
9 driver.find_element_by_xpath('//*[@id="table"]/div/table[2]/tbody/tr[{}]/td[2]/a'.format(i)).click()
10 wait
---> 11 get_movie_dict(driver.current_url)
12 wait
13 i += 1
<ipython-input-45-2533561becb9> in get_movie_dict(link)
19 wait = WebDriverWait(driver, 10)
20 wait
---> 21 response = requests.get(url)
22 wait
23 page = response.text
~/anaconda3/lib/python3.7/site-packages/requests/api.py in get(url, params, **kwargs)
73
74 kwargs.setdefault('allow_redirects', True)
---> 75 return request('get', url, params=params, **kwargs)
76
77
~/anaconda3/lib/python3.7/site-packages/requests/api.py in request(method, url, **kwargs)
58 # cases, and look like a memory leak in others.
59 with sessions.Session() as session:
---> 60 return session.request(method=method, url=url, **kwargs)
61
62
~/anaconda3/lib/python3.7/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
531 }
532 send_kwargs.update(settings)
--> 533 resp = self.send(prep, **send_kwargs)
534
535 return resp
~/anaconda3/lib/python3.7/site-packages/requests/sessions.py in send(self, request, **kwargs)
644
645 # Send the request
--> 646 r = adapter.send(request, **kwargs)
647
648 # Total elapsed time of the request (approximately)
~/anaconda3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
514 raise SSLError(e, request=request)
515
--> 516 raise ConnectionError(e, request=request)
517
518 except ClosedPoolError as e:
ConnectionError: HTTPSConnectionPool(host='www.boxofficemojo.comhttps', port=443): Max retries exceeded with url: //www.boxofficemojo.com/release/rl1182631425/?ref_=bo_yld_table_1 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f42d8fec710>: Failed to establish a new connection: [Errno 111] Connection refused'))
Edit: Добавлена функция get_movie_dict
:
def get_movie_dict(link):
'''
From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
collect
- title
- domestic gross
- runtime
- MPAA rating
- full release date
Return information as a dictionary.
'''
base_url = 'https://www.boxofficemojo.com'
#Create full url to scrape
url = base_url + link
#Request HTML and parse
wait = WebDriverWait(driver, 10)
wait
response = requests.get(url)
wait
page = response.text
soup = BeautifulSoup(page,"lxml")
headers = ['movie_title', 'domestic_total_gross',
'runtime_minutes', 'rating', 'release_date', 'budget']
#Get title
title_string = soup.find('title').text
title = title_string.split('-')[0].strip()
#Get domestic gross
try:
raw_domestic_total_gross = (soup.find(class_='mojo-performance-summary-table')
.find_all('span', class_='money')[0]
.text
)
except:
raw_domestic_total_gross = float("NaN")
if type(raw_domestic_total_gross) == float or type(raw_domestic_total_gross) == 'NoneType':
print('This is NaN')
domestic_total_gross = float("NaN")
else:
domestic_total_gross = money_to_int(raw_domestic_total_gross)
#Get runtime
raw_runtime = get_movie_value(soup,'Running')
if type(raw_runtime) != float and type(raw_runtime) != 'NoneType':
runtime = runtime_to_minutes(raw_runtime)
#Get rating
rating = get_movie_value(soup,'MPAA')
#Get release date
if '-' in get_movie_value(soup, 'Release Date'):
raw_release_date = get_movie_value(soup,'Release Date').split('-')[0]
elif '(' in get_movie_value(soup, 'Release Date'):
raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
else:
raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
release_date = to_date(raw_release_date)
# Get budget alt
raw_budget = get_movie_value(soup,'Budget')
budget = money_to_int(raw_budget)
#Create movie dictionary and return
movie_dict = dict(zip(headers,[title,
domestic_total_gross,
runtime,
rating,
release_date,
budget]))
return movie_dict