Geckodriver : 0.24.0
Firefox : я установил buildpack от buitron и не знаю, как узнать версию. Я пытался выполнить `heroku run bash`, а затем `firefox --version`.
Python : 3.6.8
Selenium : 3.141.0
В последнее время я бьюсь над этим кодом для веб-скрапинга. Я пытаюсь написать эффективный скрапер и развернуть его на Heroku, но локально он всегда работает слишком долго, а на Heroku выдаёт одну ошибку за другой.
Ошибка, которую я хочу исправить прямо сейчас:
selenium.common.exceptions.WebDriverException: Message: invalid argument: can't kill an exited process
У меня также были другие ошибки, такие как:
Geckodriver executable needs to be in PATH
Или ошибки из моего предыдущего вопроса (как видите, я пробовал и с Chrome).
Даже прочитав книгу Райана Митчелла о веб-скрапинге на Python, я так и не нашёл решения этих ошибок.
Может кто-нибудь помочь мне исправить ошибку «can't kill an exited process»? Или хотя бы подсказать какой-нибудь ресурс по этой теме.
Код ниже:
import os
import random

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# User-agent strings consumed by user_agent_generator().
user_agent_list = [
    # Chrome
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    # Goes on...
]
# Replace here with your browser paths and binaries !!!
def get_chromedriver_exec_path() -> str:
    """
    Returns the path passed to webdriver.Chrome as ``executable_path``.

    The value is a placeholder and must be replaced with the real
    chromedriver location for the target environment.

    :return: the chromedriver executable path
    """
    return 'Your_chromedriver_path_here'
def get_chrome_path() -> str:
    """
    Returns the Chrome binary location used as a fallback when the
    GOOGLE_CHROME_SHIM environment variable is not set.

    The value is a placeholder and must be replaced with the real
    browser location for the target environment.

    :return: the Chrome browser path
    """
    return 'Your_chrome_path_here'
def get_firefox_exec_path() -> str:
    """
    Returns the path passed to webdriver.Firefox as ``executable_path``
    (the geckodriver executable).

    The value is a placeholder and must be replaced with the real
    geckodriver location for the target environment.

    :return: the geckodriver executable path
    """
    return 'Your_firefox_geckodriver_executable_path_here'
def get_firefox_path() -> str:
    """
    Returns the Firefox browser location.

    The value is a placeholder and must be replaced with the real
    browser location for the target environment.

    :return: the Firefox browser path
    """
    return 'Your_firefox_browser_path'
def get_firefox_binpath() -> str:
    """
    Returns the path handed to FirefoxBinary when building the
    Firefox webdriver options.

    The value is a placeholder and must be replaced with the real
    Firefox binary location for the target environment.

    :return: the Firefox binary path
    """
    return 'Your_firefox_binary_path'
def get_url(url, **kwargs):
    """
    Visits the url with a plain HTTP GET request

    :param url: the url to request
    :param timeout: the max time to wait, passed to requests.get
        (defaults to MY_TIMEOUT when not given)
    :param kwargs: any other keyword arguments, forwarded unchanged
        to requests.get (e.g. headers, proxies)
    :return: the response object returned by requests.get
    """
    # Pop the timeout so it is not passed twice through **kwargs.
    timeout = kwargs.pop('timeout', MY_TIMEOUT)
    return requests.get(url, timeout=timeout, **kwargs)
def user_agent_generator():
    """
    It returns a generator that yields user agent strings in random order

    A shuffled copy of user_agent_list is consumed one entry at a time;
    once it is exhausted a freshly shuffled copy is taken, so the
    generator never runs dry as long as user_agent_list is not empty.

    Any error (for instance an empty user_agent_list) is logged and
    ends the generator instead of looping without yielding.

    :return: an user agent generator
    """
    try:
        browsers = []
        while True:
            if not browsers:
                # Work on a copy so the module-level list is never mutated.
                browsers = user_agent_list[:]
                random.shuffle(browsers)
            yield browsers.pop()
    except Exception as e:
        logger.error('UA generator error', exc_info=e)
def proxy_generator():
    """
    It returns a generator that when called with next() or
    a for structure, will return a new proxy

    The proxy list page is downloaded with get_url() and parsed with
    BeautifulSoup. A table row is kept only when its type column
    mentions one of the accepted anonymity types and its HTTPS column
    contains 'yes'. Kept rows are stored as 'IP:PORT' strings and
    handed out one by one; the page is downloaded again once the pool
    is empty.

    Any error (download failure, empty pool after parsing, ...) is
    logged and ends the generator instead of looping without yielding.

    :return: a proxy generator
    """
    # NOTE: placeholder - must be set to the URL of the proxy list page.
    proxy_url = ''
    types = ('elite', 'anonymous')
    proxies = set()
    try:
        while True:
            if not proxies:
                response = get_url(proxy_url)
                soup = BeautifulSoup(response.content, 'html.parser')
                for row in soup.select('table tbody tr'):
                    cells = row.find_all('td')
                    if len(cells) < 2:
                        continue
                    try:
                        type_text = cells[TYPE_COLUMN_IDX].text
                        https_text = cells[IS_HTTPS_COLUMN_IDX].text
                    except IndexError:
                        # Row is too short to carry the filter columns.
                        continue
                    if not any(kind in type_text for kind in types):
                        continue
                    if 'yes' not in https_text:
                        continue
                    # The two first columns hold the IP and the PORT.
                    proxy = ':'.join(cell.text for cell in cells[:2])
                    if proxy != '':
                        proxies.add(proxy)
            yield proxies.pop()
    except Exception as e:
        logger.error('Proxy generator error', exc_info=e)
class AlfredProxy(object):
    """
    Namespace grouping the proxy / user-agent rotation helpers and the
    webdriver factories. Every method is static and is called through
    the class itself (e.g. ``AlfredProxy.get_webdriver()``).
    """

    # Generators shared by every caller: each next() advances them.
    proxy_gen = proxy_generator()
    user_agent_gen = user_agent_generator()

    @staticmethod
    def get_proxy():
        """
        Returns a new proxy

        :return: the next value produced by the shared proxy generator
        """
        return next(AlfredProxy.proxy_gen)

    @staticmethod
    def get_user_agent():
        """
        Returns a new user agent

        :return: the next value produced by the shared user agent generator
        """
        return next(AlfredProxy.user_agent_gen)

    @staticmethod
    def get_chrome_options():
        """
        Returns a ChromeOptions object pointing at the Chrome binary

        The binary location is read from the GOOGLE_CHROME_SHIM
        environment variable, falling back to get_chrome_path().

        :return: a webdriver.ChromeOptions instance
        """
        # NOTE(review): no proxy is configured on these options.
        chrome_options = webdriver.ChromeOptions()
        # e.g. '/app/.apt/usr/bin/google-chrome'
        chrome_options.binary_location = os.environ.get('GOOGLE_CHROME_SHIM', get_chrome_path())
        return chrome_options

    @staticmethod
    def get_firefox_options():
        """
        Returns the keyword arguments used to build a headless Firefox driver

        As a side effect the MOZ_HEADLESS environment variable is set to '1'.

        :return: a dict with the 'capabilities', 'options',
            'firefox_binary' and 'executable_path' arguments
            for webdriver.Firefox
        """
        # NOTE(review): no proxy is configured on these options.
        # Copy so the shared DesiredCapabilities.FIREFOX dict is never mutated.
        capabilities = webdriver.DesiredCapabilities.FIREFOX.copy()
        options = Options()
        options.headless = True
        os.environ['MOZ_HEADLESS'] = '1'
        binary = FirefoxBinary(get_firefox_binpath())
        return {
            'capabilities': capabilities,
            'options': options,
            'firefox_binary': binary,
            'executable_path': get_firefox_exec_path(),
        }

    @staticmethod
    def get_webdriver(firefox=True):
        """
        Builds a new webdriver instance

        The caller owns the returned driver and must call quit() on it.

        :param firefox: True (default) for Firefox, False for Chrome
        :return: the new webdriver
        """
        if firefox:
            options = AlfredProxy.get_firefox_options()
            driver = webdriver.Firefox(**options)
        else:
            options = AlfredProxy.get_chrome_options()
            driver = webdriver.Chrome(executable_path=get_chromedriver_exec_path(), options=options)
        return driver

    @staticmethod
    def get(url, attempts=0):
        """
        Gets the contents of an URL changing the proxy and the headers
        of the user-agent on every attempt

        :param url: the url to visit
        :param attempts: number of attempts already made
        :return: the response returned by get_url (status code 200)
        :raises Exception: when more than 10 attempts have been made
        """
        last_error = None
        while attempts <= 10:
            # The same proxy is used for both the http and https schemes.
            proxy = AlfredProxy.get_proxy()
            proxies = {scheme: f'{scheme}://{proxy}' for scheme in ('http', 'https')}
            user_agent = AlfredProxy.get_user_agent()
            headers = {'User-Agent': user_agent}
            try:
                logger.info(f'UA: {user_agent}')
                result = get_url(url, timeout=MY_TIMEOUT, headers=headers, proxies=proxies)
                if result.status_code == 200:
                    return result
                last_error = ConnectionError(f'Unexpected status code {result.status_code}')
            except Exception as e:
                # Any failure just triggers a retry with a new proxy / user agent.
                last_error = e
            attempts += 1
        raise Exception(f'Too many attempts getting {url}') from last_error
def get_url_through_proxy(url, selector=None, num_attempts=0, **kwargs):
    """
    Visits the url with a webdriver for getting the HTML response,
    retrying with a brand new driver when an attempt fails

    Every attempt starts its own browser and always quits it afterwards,
    so no browser / driver process is left behind between attempts.

    :param url: the url
    :param selector: optional CSS selector; when given, the page source
        is returned only after a matching element is present
    :param num_attempts: number of attempts already made
    :param kwargs: accepted for compatibility, currently unused
    :return: the page source of the loaded page
    :raises ConnectionAttemptsError: after 5 failed attempts
    """
    while num_attempts < 5:
        driver = None
        try:
            # Created inside the try so a failed browser start counts
            # as a failed attempt as well.
            driver = AlfredProxy.get_webdriver()
            driver.get(url)
            if selector:
                wait = WebDriverWait(driver, MY_TIMEOUT)
                condition = EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                wait.until(condition)
            return driver.page_source
        except TimeoutException:
            logger.error('Timeout error')
        except Exception as e:
            logger.error(f"Proxy error: {type(e)}")
        finally:
            if driver is not None:
                try:
                    driver.quit()
                except Exception as e:
                    # A failing quit must not hide the result of the attempt.
                    logger.error(f"Proxy error: {type(e)}")
        num_attempts += 1
    raise ConnectionAttemptsError(msg=['Demasiados intentos a través de proxy'])
# Some test code
if __name__ == '__main__':
    # NOTE: the target URL is an empty placeholder; set a real URL before running.
    get_url_through_proxy('', '#centerCol')