Geckodriver: 0.24.0
Firefox: I installed the Buitron buildpack; I don't know how to get the version. I tried heroku run bash and then running Firefox with --version.
Python: 3.6.8
Selenium: 3.141.0
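As a side note, once a Selenium session actually starts, the browser version can be read from the driver's capabilities. This is only a minimal sketch and assumes the Firefox driver comes up at all (which is exactly what fails for me):

from selenium import webdriver

driver = webdriver.Firefox()
# 'browserVersion' is the W3C capability name; older geckodriver/Selenium pairs report 'version' instead
caps = driver.capabilities
print(caps.get('browserVersion') or caps.get('version'))
driver.quit()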
I have been struggling with this scraping code lately. I am trying to write an efficient scraper and deploy it to Heroku, but locally it always takes too long, and on Heroku it throws one error after another.
Right now, the error I want to solve is:
selenium.common.exceptions.WebDriverException: Message: invalid argument: can't kill an exited process
I have also had other errors, such as:
Geckodriver executable needs to be in PATH
Or, from a previous question of mine (as you can see, I also tried with Chrome).
After reading Ryan Mitchell's book on web scraping with Python and searching around, I still haven't found a solution for these errors.
Could someone help me fix this "can't kill an exited process" error, please? Or at least point me to some resource.
The code is below:
import logging
import os
import requests
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# Module-level logger used by the generators and AlfredProxy below
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
TYPE_COLUMN_IDX = 4
IS_HTTPS_COLUMN_IDX = 6
MY_TIMEOUT = 4.5
user_agent_list = [
# Chrome
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
# Goes on...
]
# Replace here with your browser paths and binaries !!!
def get_chromedriver_exec_path():
return 'Your_chromedriver_path_here'
def get_chrome_path():
return 'Your_chrome_path_here'
def get_firefox_exec_path():
return 'Your_firefox_geckodriver_executable_path_here'
def get_firefox_path():
return 'Your_firefox_browser_path'
def get_firefox_binpath():
return 'Your_firefox_binary_path'
def get_url(url, **kwargs):
"""
Visits the url for getting the HTML response
:param url: the url to get the HTML response from
    :param timeout: the max time to wait (default = MY_TIMEOUT, i.e. 4.5 s)
:return: the raw HTML response
"""
timeout = kwargs.pop('timeout', MY_TIMEOUT)
return requests.get(url, timeout=timeout, **kwargs)
def user_agent_generator():
"""
It returns a random user agent string
:return: an user agent
"""
browsers = []
while True:
try:
if not len(browsers):
browsers = user_agent_list[:]
random.shuffle(browsers)
yield browsers.pop()
except Exception as e:
logger.error('UA generator error', exc_info=e)
def proxy_generator():
"""
It returns a generator that when called with next() or
a for structure, will return a new proxy
:return: a proxy generator
"""
proxy_url = 'https://free-proxy-list.net/anonymous-proxy.html'
types = ('elite', 'anonymous')
proxies = set()
while True:
try:
if not len(proxies):
response = get_url(proxy_url)
b = BeautifulSoup(response.content, 'html.parser')
                for bis in b.select('table tbody tr'):
                    elements = bis.find_all('td')
                    # Keep the row only if the proxy type matches and HTTPS is supported
                    wanted_type = any(t in elements[TYPE_COLUMN_IDX].text for t in types)
                    is_https = 'yes' in elements[IS_HTTPS_COLUMN_IDX].text
                    if wanted_type and is_https:
                        # [:2] gets the first two columns, IP and PORT
                        proxy = ':'.join(t.text for t in elements[:2])
                        if proxy != '':
                            proxies.add(proxy)
yield proxies.pop()
except Exception as e:
logger.error('Proxy generator error', exc_info=e)
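# Example usage (sketch): each next() call yields one "ip:port" string scraped from
# free-proxy-list.net, re-fetching the list whenever the local pool runs out.
# proxy_gen = proxy_generator()
# print(next(proxy_gen))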
class AlfredProxy(object):
proxy_gen = proxy_generator()
user_agent_gen = user_agent_generator()
@staticmethod
def get_proxy():
"""
Returns a new proxy
:return: a new proxy
"""
return next(AlfredProxy.proxy_gen)
@staticmethod
def get_user_agent():
"""
Returns a new user agent
:return: a new user agent
"""
return next(AlfredProxy.user_agent_gen)
@staticmethod
def get_chrome_options():
"""
Returns a ChromeOptions object with a new proxy
:return:
"""
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = os.environ.get('GOOGLE_CHROME_SHIM', get_chrome_path()) # '/app/.apt/usr/bin/google-chrome'
chrome_options.add_argument(f'--user-agent={AlfredProxy.get_user_agent()}')
chrome_options.add_argument(f'--proxy-server={AlfredProxy.get_proxy()}')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--headless')
return chrome_options
@staticmethod
def get_firefox_options():
"""
Returns a FirefoxOptions object with a new proxy
:return:
"""
capabilities = webdriver.DesiredCapabilities().FIREFOX
#capabilities['marionette'] = False
options = Options()
options.headless = True
#options.binary_location = get_firefox_path()
os.environ['MOZ_HEADLESS'] = '1'
binary = FirefoxBinary(get_firefox_binpath())
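        # These keyword arguments are unpacked into webdriver.Firefox(**options) in get_webdriver()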
return {
'capabilities': capabilities,
'options': options,
'firefox_binary': binary,
'executable_path': get_firefox_exec_path(),
}
@staticmethod
def get_webdriver(firefox=True):
if firefox:
options = AlfredProxy.get_firefox_options()
user_agent = webdriver.Firefox(**options)
else:
options = AlfredProxy.get_chrome_options()
user_agent = webdriver.Chrome(executable_path=get_chromedriver_exec_path(), options=options)
return user_agent
@staticmethod
def get(url, attempts=0):
"""
Gets the contents of an URL changing the headers of the user-agent
:param url: the url to visit
:param attempts: number of attempts to try
:return: the URL contents
"""
if attempts > 10:
raise Exception()
proxies = {http_prefix: f'{http_prefix}://{proxy}'
for http_prefix, proxy in zip(['http', 'https'], [AlfredProxy.get_proxy()] * 2)}
user_agent = next(AlfredProxy.user_agent_gen)
headers = {'User-Agent': user_agent}
try:
logger.info(f'UA: {user_agent}')
result = get_url(url, timeout=MY_TIMEOUT, headers=headers, proxies=proxies)
if result.status_code != 200:
raise ConnectionError()
return result
except ConnectionError as e:
return AlfredProxy.get(url, attempts + 1)
except Exception as e:
return AlfredProxy.get(url, attempts + 1)
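# ConnectionAttemptsError is raised below but never defined in this snippet;
# this minimal placeholder is an assumption so the module can run as-is.
class ConnectionAttemptsError(Exception):
    def __init__(self, msg=None):
        super().__init__(msg)
        self.msg = msg or []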
def get_url_through_proxy(url, selector=None, num_attempts=0, **kwargs):
"""
    Visits the url to get the HTML response,
    using a proxy and changing the user-agent if necessary
:param url: the url
:return: the raw HTML response
"""
if num_attempts == 5:
        raise ConnectionAttemptsError(msg=['Too many attempts through the proxy'])
user_agent_driver = AlfredProxy.get_webdriver()
try:
if selector:
wait = WebDriverWait(user_agent_driver, MY_TIMEOUT)
user_agent_driver.get(url)
condition = EC.presence_of_element_located((By.CSS_SELECTOR, selector))
element = wait.until(condition)
else:
user_agent_driver.implicitly_wait(MY_TIMEOUT)
user_agent_driver.get(url)
return user_agent_driver.page_source
except TimeoutException:
logger.error('Timeout error')
return get_url_through_proxy(url, selector, num_attempts=num_attempts+1)
    except Exception as e:
        logger.error(f"Proxy error: {type(e)}")
        return get_url_through_proxy(url, selector, num_attempts=num_attempts + 1)
    finally:
        # Always shut the browser down so leftover driver processes don't pile up between retries
        user_agent_driver.quit()
# Some test code
if __name__ == '__main__':
get_url_through_proxy('https://www.amazon.es/dp/B07HR7RRWQ/ref=gbps_tit_s-5_c44f_2b9ee16f', '#centerCol')
print('-'*40)
get_url_through_proxy('https://www.amazon.es/dp/B07HR7RRWQ/ref=gbps_tit_s-5_c44f_2b9ee16f')