Question

Geckodriver : 0,24,0

Firefox : Я установил Buitron buildpack. Я не знаю, как получить версию. Я попытался выполнить Герою запустить Bash, а затем Firefox с --version

Python : 3.6.8

Селен : 3.141.0

Недавно я боролся с этим кодом очистки. Я пытаюсь сделать эффективный код очистки и развернуть его в Heroku, но в локальном режиме это всегда занимает слишком много времени, а в Heroku выдает одну ошибку за другой.

Прямо сейчас Ошибка, которую я хочу решить :

selenium.common.exceptions.WebDriverException: Message: invalid argument: can't kill an exited process

У меня также были другие ошибки, такие как:

Geckodriver executable needs to be in PATH

Или из предыдущего моего запроса (как вы можете видеть, я пытался и с Chrome).

Прочитав книгу Райана Митчелла о работе с Python и поиске, я все еще не нашел решения для этих ошибок.

Может кто-нибудь помочь мне решить эту ошибку "не может завершить процесс", пожалуйста? Или хотя бы предоставить какой-то ресурс.

Код ниже:

import os
import requests
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
    

TYPE_COLUMN_IDX = 4
IS_HTTPS_COLUMN_IDX = 6
MY_TIMEOUT = 4.5


user_agent_list = [
# Chrome
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
# Goes on...
]


# Replace here with your browser paths and binaries !!!

def get_chromedriver_exec_path():
    return 'Your_chromedriver_path_here'

def get_chrome_path():
    return 'Your_chrome_path_here'

def get_firefox_exec_path():
    return 'Your_firefox_geckodriver_executable_path_here'

def get_firefox_path():
    return 'Your_firefox_browser_path'

def get_firefox_binpath():
    return 'Your_firefox_binary_path'


def get_url(url, **kwargs):
    """
    Visits the url for getting the HTML response
    :param url: the url to get the HTML response from
    :param timeout: the max time to wait, (default = 7)
    :return: the raw HTML response
    """
    timeout = kwargs.pop('timeout', MY_TIMEOUT)
    return requests.get(url, timeout=timeout, **kwargs)


def user_agent_generator():
    """
    It returns a random user agent string
    :return: an user agent
    """
    browsers = []
    while True:
        try:
            if not len(browsers):
                browsers = user_agent_list[:]
                random.shuffle(browsers)
            yield browsers.pop()
        except Exception as e:
            logger.error('UA generator error', exc_info=e)


def proxy_generator():
    """
    It returns a generator that when called with next() or
    a for structure, will return a new proxy
    :return: a proxy generator
    """
    proxy_url = 'https://free-proxy-list.net/anonymous-proxy.html'
    types = ('elite', 'anonymous')
    proxies = set()

    while True:
        try:
            if not len(proxies):
                response = get_url(proxy_url)
                b = BeautifulSoup(response.content, 'html.parser')
                for bis in b.select('table tbody tr'):
                    elements = bis.find_all('td')
                    # [:2] gets the two first columns with IP and PORT
                    ip_port = [t.text for t in elements[:2] if any(t in elements[TYPE_COLUMN_IDX].text for t in types) and 'yes' in elements[IS_HTTPS_COLUMN_IDX].text]
                    proxy = ':'.join(ip_port)
                    if proxy != '':
                        proxies.add(proxy)
            yield proxies.pop()
        except Exception as e:
            logger.error('Proxy generator error', exc_info=e)


class AlfredProxy(object):
    proxy_gen = proxy_generator()
    user_agent_gen = user_agent_generator()

    @staticmethod
    def get_proxy():
        """
        Returns a new proxy
        :return: a new proxy
        """
        return next(AlfredProxy.proxy_gen)

    @staticmethod
    def get_user_agent():
        """
        Returns a new user agent
        :return: a new user agent
        """
        return next(AlfredProxy.user_agent_gen)

    @staticmethod
    def get_chrome_options():
        """
        Returns a ChromeOptions object with a new proxy
        :return:
        """
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location = os.environ.get('GOOGLE_CHROME_SHIM', get_chrome_path()) #  '/app/.apt/usr/bin/google-chrome'
        chrome_options.add_argument(f'--user-agent={AlfredProxy.get_user_agent()}')
        chrome_options.add_argument(f'--proxy-server={AlfredProxy.get_proxy()}')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('headless')

        return chrome_options

    @staticmethod
    def get_firefox_options():
        """
        Returns a FirefoxOptions object with a new proxy
        :return:
        """
        capabilities = webdriver.DesiredCapabilities().FIREFOX
        #capabilities['marionette'] = False

        options = Options()
        options.headless = True
        #options.binary_location = get_firefox_path()
        os.environ['MOZ_HEADLESS'] = '1'
        binary = FirefoxBinary(get_firefox_binpath())
        return {
            'capabilities': capabilities,
            'options': options,
            'firefox_binary': binary,
            'executable_path': get_firefox_exec_path(),
        }

    @staticmethod
    def get_webdriver(firefox=True):
        if firefox:
            options = AlfredProxy.get_firefox_options()
            user_agent = webdriver.Firefox(**options)
        else:
            options = AlfredProxy.get_chrome_options()
            user_agent = webdriver.Chrome(executable_path=get_chromedriver_exec_path(), options=options)
        return user_agent

    @staticmethod
    def get(url, attempts=0):
        """
        Gets the contents of an URL changing the headers of the user-agent
        :param url: the url to visit
        :param attempts: number of attempts to try
        :return: the URL contents
        """
        if attempts > 10:
            raise Exception()
        proxies = {http_prefix: f'{http_prefix}://{proxy}'
                   for http_prefix, proxy in zip(['http', 'https'], [AlfredProxy.get_proxy()] * 2)}
        user_agent = next(AlfredProxy.user_agent_gen)
        headers = {'User-Agent': user_agent}
        try:
            logger.info(f'UA: {user_agent}')
            result = get_url(url, timeout=MY_TIMEOUT, headers=headers, proxies=proxies)
            if result.status_code != 200:
                raise ConnectionError()
            return result
        except ConnectionError as e:
            return AlfredProxy.get(url, attempts + 1)
        except Exception as e:
            return AlfredProxy.get(url, attempts + 1)


def get_url_through_proxy(url, selector=None, num_attempts=0,  **kwargs):
    """
    Visits the url for getting the HTML response,
    uses a proxy anc changes user-agent if necessary
    :param url: the url
    :return: the raw HTML response
    """
    if num_attempts == 5:
        raise ConnectionAttemptsError(msg=['Demasiados intentos a través de proxy'])

    user_agent_driver = AlfredProxy.get_webdriver()

    try:
        if selector:
            wait = WebDriverWait(user_agent_driver, MY_TIMEOUT)
            user_agent_driver.get(url)
            condition = EC.presence_of_element_located((By.CSS_SELECTOR, selector))
            element = wait.until(condition)
        else:
            user_agent_driver.implicitly_wait(MY_TIMEOUT)
            user_agent_driver.get(url)
        return user_agent_driver.page_source
    except TimeoutException:
        logger.error('Timeout error')
        return get_url_through_proxy(url, selector, num_attempts=num_attempts+1)
    except Exception as e:
        logger.error(f"Proxy error: {type(e)}")
        return get_url_through_proxy(url, selector, num_attempts=num_attempts + 1)


# Some test code    
if __name__ == '__main__':
    get_url_through_proxy('https://www.amazon.es/dp/B07HR7RRWQ/ref=gbps_tit_s-5_c44f_2b9ee16f', '#centerCol')
    print('-'*40)
    get_url_through_proxy('https://www.amazon.es/dp/B07HR7RRWQ/ref=gbps_tit_s-5_c44f_2b9ee16f')

Python / Selenium с герою: не может убить завершившийся процесс

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 0 ]

Python / Selenium с герою: не может убить завершившийся процесс

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 0 ]

Нет похожих вопросов