Question

Я не уверен, почему, но мой скрипт всегда перестаёт сканировать, как только достигает страницы 9. Нет ни ошибок, ни исключений, ни предупреждений, поэтому я в некоторой растерянности.

Может кто-нибудь мне помочь?

P.S. Вот полный скрипт на случай, если кто-нибудь захочет проверить это у себя!

def initiate_crawl():
    """Crawl the deals listing, starting from the fixed URL passed below.

    All work happens in the nested ``refresh_page`` helper, which calls
    itself recursively both to restart on the same page and to advance to
    the next page.
    """
    def refresh_page(url):
        """Open ``url`` in a new browser, sort the deals and scan them.

        Relies on names defined outside this snippet:
        ``create_webdriver_instance``, ``already_scraped_product_titles``,
        ``By``, ``EC`` and ``WebDriverWait``.
        """
        ff = create_webdriver_instance()
        ff.get(url)
        # Open the sort-order dropdown, then pick "Discount - High to Low".
        ff.find_element(By.XPATH, '//*[@id="FilterItemView_sortOrder_dropdown"]/div/span[2]/span/span/span/span').click()
        ff.find_element(By.XPATH, '//a[contains(text(), "Discount - High to Low")]').click()
        # Wait up to 15 seconds for the deal containers to become visible.
        items = WebDriverWait(ff, 15).until(
            EC.visibility_of_all_elements_located((By.XPATH, '//div[contains(@id, "100_dealView_")]'))
        )
        print(len(items))
        for count, item in enumerate(items):
            # A deal qualifies when it shows both a struck-through price and
            # an "Add to Cart" text somewhere inside its container.
            slashed_price = item.find_elements(By.XPATH, './/span[contains(@class, "a-text-strike")]')
            active_deals = item.find_elements(By.XPATH, './/*[contains(text(), "Add to Cart")]')
            if len(slashed_price) > 0 and len(active_deals) > 0:
                product_title = item.find_element(By.ID, 'dealTitle').text
                if product_title not in already_scraped_product_titles:
                    already_scraped_product_titles.append(product_title)
                    # First sighting of this title: close this browser and
                    # start over on the current URL in a new one. ``break``
                    # ends this frame's loop once the nested call returns.
                    url = ff.current_url
                    ff.quit()
                    refresh_page(url)
                    break
            # NOTE(review): ``is`` tests object identity, not numeric
            # equality. CPython only caches small ints (-5..256), so this is
            # true only while len(items) <= 256; above that the branch is
            # silently skipped and the crawl ends without any error. The
            # accompanying text reports 32 more items per page, i.e. 288 on
            # page 9. ``==`` is the value comparison.
            if count+1 is len(items):
                try:
                    # ``next_button`` is not read after this assignment.
                    next_button = WebDriverWait(ff, 15).until(
                        EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→')
                    )
                    ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click()
                    # Continue on the following page in a fresh browser.
                    url = ff.current_url
                    ff.quit()
                    refresh_page(url)
                except Exception as error:
                    # Also catches anything raised by the nested
                    # refresh_page call above, not only a missing link.
                    print(error)
                    ff.quit()

    refresh_page('https://www.amazon.ca/gp/goldbox/ref=gbps_ftr_s-3_4bc8_dct_10-?gb_f_c2xvdC0z=sortOrder:BY_SCORE,discountRanges:10-25%252C25-50%252C50-70%252C70-&pf_rd_p=f5836aee-0969-4c39-9720-4f0cacf64bc8&pf_rd_s=slot-3&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=A3DWYIK6Y9EEQB&pf_rd_r=CQ7KBNXT36G95190QJB1&ie=UTF8')

# Script entry point: start the crawl.
initiate_crawl()

Печать длины items также выявляет странное поведение. Вместо того чтобы всегда возвращать 32, что соответствует количеству элементов на каждой странице, он печатает 32 для первой страницы, 64 для второй, 96 для третьей и так далее. Я исправил это, используя //div[contains(@id, "100_dealView_")]/div[contains(@class, "dealContainer")] вместо //div[contains(@id, "100_dealView_")] в качестве XPath для переменной items. Я надеюсь, что именно это было причиной проблем на странице 9. Сейчас я запускаю тесты. Обновление: теперь скрипт обрабатывает страницу 10 и далее, так что проблема устранена.

Wieger · Answer 1 · 10 октября 2018

Я немного скорректировал код, и он, кажется, работает. Изменения: добавлен оператор

import random, поскольку модуль random используется и без него скрипт не запустится.

Внутри цикла product_title эти строки удалены:

ff.quit(), refresh_page(url) и break

Вызов ff.quit() приводит к фатальной ошибке соединения, из-за которой скрипт прерывается.

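Также is изменено на == в условии if count + 1 == len(items): оператор is сравнивает идентичность объектов, а не значения. CPython кэширует только целые числа от −5 до 256, поэтому, когда len(items) достигает 288 (страница 9), условие с is молча становится ложным и переход на следующую страницу не выполняется.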

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
import random



""" Set Global Variables
"""
# Pool of user-agent strings that random.choice() draws from when a browser
# is created; seeded with a single desktop Chrome user agent.
ua_strings = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
]
# Titles of the deals the crawler has already seen.
already_scraped_product_titles = []



""" Create Instances of WebDriver
"""
def create_webdriver_instance():
    """Create a headless Firefox WebDriver with a randomly chosen user agent.

    A user-agent string is drawn at random from the module-level
    ``ua_strings`` list and installed through the
    ``general.useragent.override`` profile preference.

    Returns:
        A new ``webdriver.Firefox`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    ua_string = random.choice(ua_strings)
    profile = webdriver.FirefoxProfile()
    profile.set_preference('general.useragent.override', ua_string)
    options = Options()
    options.add_argument('--headless')
    # The original built ``options`` but returned ``webdriver.Firefox(profile)``,
    # so the ``--headless`` flag was silently dropped. Attaching the profile
    # to the options object hands both to the driver together.
    options.profile = profile
    return webdriver.Firefox(options=options)

""" Construct List of UA Strings
"""
def fetch_ua_strings():
    """Extend the module-level ``ua_strings`` pool with scraped user agents.

    Loads the "most common user agents" page in a temporary browser, reads
    every ``<td class="useragent">`` cell and appends each non-empty string
    that mentions neither "mobile" (in any letter case) nor "Trident".
    The browser is always closed, even if loading or parsing fails.
    """
    ff = create_webdriver_instance()
    try:
        ff.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
        # Same locator style (By.XPATH) as the rest of the script.
        cells = ff.find_elements(By.XPATH, '//td[@class="useragent"]')
        for cell in cells:
            # Read the text once instead of once per condition.
            text = cell.text
            if not text:
                continue
            # User-agent strings normally spell the token "Mobile", which the
            # original case-sensitive 'mobile' test never matched.
            if 'mobile' not in text.lower() and 'Trident' not in text:
                ua_strings.append(text)
    finally:
        ff.quit()

""" Build Lists of Product Page URLs
"""
def initiate_crawl():
    """Walk the deals listing page by page and print newly seen deal titles.

    Starting from a fixed URL, each page is opened in a fresh browser,
    sorted by "Discount - High to Low" and scanned for deals that show both
    a struck-through price and an "Add to Cart" text. Titles not yet in
    ``already_scraped_product_titles`` are recorded there and their first
    ten characters are printed. The crawl ends when the "Next→" link cannot
    be followed. Errors raised while loading or sorting a page propagate to
    the caller.
    """
    def refresh_page(url):
        """Process one listing page.

        Args:
            url: Address of the listing page to open.

        Returns:
            The URL reached after clicking "Next→", or ``None`` when that
            link could not be followed.
        """
        ff = create_webdriver_instance()
        # try/finally closes the browser on every path. The original leaked
        # it whenever a lookup or wait before the pagination step raised.
        try:
            ff.get(url)
            ff.find_element(By.XPATH, '//*[@id="FilterItemView_sortOrder_dropdown"]/div/span[2]/span/span/span/span').click()
            ff.find_element(By.XPATH, '//a[contains(text(), "Discount - High to Low")]').click()
            items = WebDriverWait(ff, 15).until(
                EC.visibility_of_all_elements_located((By.XPATH, '//div[contains(@id, "100_dealView_")]'))
            )
            print(items)
            for item in items:
                slashed_price = item.find_elements(By.XPATH, './/span[contains(@class, "a-text-strike")]')
                active_deals = item.find_elements(By.XPATH, './/*[contains(text(), "Add to Cart")]')
                # For groups of items on sale the locator would instead be:
                # //*[contains(text(), "Add to Cart") or contains(text(), "View Deal")]
                if slashed_price and active_deals:
                    product_title = item.find_element(By.ID, 'dealTitle').text
                    if product_title not in already_scraped_product_titles:
                        already_scraped_product_titles.append(product_title)
                        # TODO: scrape the details of each deal here.
                        print(product_title[:10])
                        # The browser is deliberately not quit and restarted
                        # per item: the remaining ``items`` belong to this
                        # session and are still being iterated.

            # Pagination runs once, after the last item. ``items`` is never
            # empty here because the wait above only returns a truthy value.
            try:
                print('')
                print('new page')
                WebDriverWait(ff, 15).until(
                    EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→')
                )
                ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click()
                # Give the listing time to navigate before reading the URL.
                time.sleep(3)
                next_url = ff.current_url
                print(next_url)
                print('')
                return next_url
            except Exception as error:
                # Best effort: a missing or unclickable "Next→" link ends
                # the crawl with a message instead of a traceback.
                print('cannot find ff.find_element(By.PARTIAL_LINK_TEXT, "Next→")')
                print('Because of... {}'.format(error))
                return None
        finally:
            ff.quit()

    # Iterate instead of recursing so a long listing does not add one stack
    # frame per page.
    url = 'https://www.amazon.ca/gp/goldbox/ref=gbps_ftr_s-3_4bc8_dct_10-?gb_f_c2xvdC0z=sortOrder:BY_SCORE,discountRanges:10-25%252C25-50%252C50-70%252C70-&pf_rd_p=f5836aee-0969-4c39-9720-4f0cacf64bc8&pf_rd_s=slot-3&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=A3DWYIK6Y9EEQB&pf_rd_r=CQ7KBNXT36G95190QJB1&ie=UTF8'
    while url is not None:
        url = refresh_page(url)

# Stub left by the author for a detail scraper that has no body yet:
#def extract_info(ff, url):
# Script entry point: first enlarge the user-agent pool, then start the crawl.
fetch_ua_strings()
initiate_crawl()

DebanjanB · Answer 2 · 10 октября 2018

Согласно 10-й ревизии этого вопроса, сообщение об ошибке ...

HTTPConnectionPool(host='127.0.0.1', port=58992): Max retries exceeded with url: /session/e8beed9b-4faa-4e91-a659-56761cb604d7/element (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000022D31378A58>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

... означает, что вызов метода get() завершился неудачей и привёл к ошибке HTTPConnectionPool с сообщением «Max retries exceeded» (превышено максимальное число попыток).

Пара вещей:

Согласно обсуждению «max-retries-exceeded exceptions are confusing», трассировка несколько вводит в заблуждение. Requests оборачивает исключение для удобства пользователей. Исходное исключение является частью отображаемого сообщения.
Requests никогда не повторяет попытки (он устанавливает retries=0 для HTTPConnectionPool из urllib3), поэтому ошибка выглядела бы гораздо более канонично без ключевых слов MaxRetryError и HTTPConnectionPool. Таким образом, идеальная трассировка была бы такой:
```
NewConnectionError(<class 'socket.error'>: [Errno 10061] No connection could be made because the target machine actively refused it)
```
Подробное объяснение вы найдете в MaxRetryError: HTTPConnectionPool: максимальное количество повторных попыток превышено (вызвано ProtocolError(«Соединение прервано», ошибка (111, «Соединение отклонено»)))

Решение

Согласно примечаниям к выпуску Selenium 3.14.1:

* Fix ability to set timeout for urllib3 (#6286)

Соответствующее слияние: «repair urllib3 can't set timeout!» (исправление: для urllib3 нельзя было задать тайм-аут).

Заключение

Как только вы обновитесь до Selenium 3.14.1, вы сможете задать тайм-аут, увидеть канонические трассировки и предпринять необходимые действия.

Ссылки

Несколько соответствующих ссылок:

Этот пример использования

Я взял ваш полный скрипт с codepen.io — «A Pen by Anthony». Мне пришлось внести в существующий код следующие изменения:

Поскольку вы использовали:
```
ua_string = random.choice(ua_strings)
```
необходимо импортировать модуль random:
```
import random
```

Вы создали переменную next_button, но не использовали её. Я объединил следующие четыре строки:

next_button = WebDriverWait(ff, 15).until(
                EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→')
            )
ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click()

в такие:

WebDriverWait(ff, 15).until(EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→'))
ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click()

Ваш модифицированный кодовый блок будет:

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
import random


""" Set Global Variables
"""
# Pool of user-agent strings that random.choice() draws from when a browser
# is created; seeded with a single desktop Chrome user agent.
ua_strings = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
]
# Titles of the deals the crawler has already seen.
already_scraped_product_titles = []



""" Create Instances of WebDriver
"""
def create_webdriver_instance():
    """Create a headless Firefox WebDriver with a randomly chosen user agent.

    A user-agent string is drawn at random from the module-level
    ``ua_strings`` list and installed through the
    ``general.useragent.override`` profile preference.

    Returns:
        A new ``webdriver.Firefox`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    ua_string = random.choice(ua_strings)
    profile = webdriver.FirefoxProfile()
    profile.set_preference('general.useragent.override', ua_string)
    options = Options()
    options.add_argument('--headless')
    # The original built ``options`` but returned ``webdriver.Firefox(profile)``,
    # so the ``--headless`` flag was silently dropped. Attaching the profile
    # to the options object hands both to the driver together.
    options.profile = profile
    return webdriver.Firefox(options=options)



""" Construct List of UA Strings
"""
def fetch_ua_strings():
    """Extend the module-level ``ua_strings`` pool with scraped user agents.

    Loads the "most common user agents" page in a temporary browser, reads
    every ``<td class="useragent">`` cell and appends each non-empty string
    that mentions neither "mobile" (in any letter case) nor "Trident".
    The browser is always closed, even if loading or parsing fails.
    """
    ff = create_webdriver_instance()
    try:
        ff.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
        # Same locator style (By.XPATH) as the rest of the script.
        cells = ff.find_elements(By.XPATH, '//td[@class="useragent"]')
        for cell in cells:
            # Read the text once instead of once per condition.
            text = cell.text
            if not text:
                continue
            # User-agent strings normally spell the token "Mobile", which the
            # original case-sensitive 'mobile' test never matched.
            if 'mobile' not in text.lower() and 'Trident' not in text:
                ua_strings.append(text)
    finally:
        ff.quit()



""" Log in to Amazon to Use SiteStripe in order to Generate Affiliate Links
"""
def log_in(ff):
    """Sign in to Amazon through the account link of the loaded page.

    The credentials are read from the ``AMAZON_EMAIL`` and
    ``AMAZON_PASSWORD`` environment variables instead of being embedded in
    the source, where they would be exposed to every reader of the file.

    Args:
        ff: WebDriver instance with an Amazon page already loaded.

    Raises:
        RuntimeError: If either environment variable is unset or empty.
            This is raised before the browser is touched.
    """
    import os  # function-scope import keeps the file's import block unchanged

    email = os.environ.get('AMAZON_EMAIL')
    password = os.environ.get('AMAZON_PASSWORD')
    if not email or not password:
        raise RuntimeError(
            'Set the AMAZON_EMAIL and AMAZON_PASSWORD environment variables '
            'before calling log_in().'
        )

    # The account link carries one of two ids, so the XPath accepts either.
    ff.find_element(By.XPATH, '//a[@id="nav-link-yourAccount"] | //a[@id="nav-link-accountList"]').click()
    ff.find_element(By.ID, 'ap_email').send_keys(email)
    ff.find_element(By.ID, 'continue').click()
    ff.find_element(By.ID, 'ap_password').send_keys(password)
    ff.find_element(By.NAME, 'rememberMe').click()
    ff.find_element(By.ID, 'signInSubmit').click()



""" Build Lists of Product Page URLs
"""
def initiate_crawl():
    """Crawl the deals listing, starting from the fixed URL passed below.

    All work happens in the nested ``refresh_page`` helper, which calls
    itself recursively: once for every newly seen deal title (restarting on
    the same page in a fresh browser) and once to advance to the next page.
    """
    def refresh_page(url):
        """Open ``url`` in a new browser, sort the deals and scan them.

        Args:
            url: Address of the listing page to open.
        """
        # The body of this helper was not indented under its ``def`` in the
        # original, which is an IndentationError; it is nested properly here.
        ff = create_webdriver_instance()
        ff.get(url)
        ff.find_element(By.XPATH, '//*[@id="FilterItemView_sortOrder_dropdown"]/div/span[2]/span/span/span/span').click()
        ff.find_element(By.XPATH, '//a[contains(text(), "Discount - High to Low")]').click()
        items = WebDriverWait(ff, 15).until(
            EC.visibility_of_all_elements_located((By.XPATH, '//div[contains(@id, "100_dealView_")]'))
        )
        for count, item in enumerate(items):
            slashed_price = item.find_elements(By.XPATH, './/span[contains(@class, "a-text-strike")]')
            active_deals = item.find_elements(By.XPATH, './/*[contains(text(), "Add to Cart")]')
            # For groups of items on sale the locator would instead be:
            # //*[contains(text(), "Add to Cart") or contains(text(), "View Deal")]
            if len(slashed_price) > 0 and len(active_deals) > 0:
                product_title = item.find_element(By.ID, 'dealTitle').text
                if product_title not in already_scraped_product_titles:
                    already_scraped_product_titles.append(product_title)
                    url = ff.current_url
                    # TODO: scrape the details of each deal here.
                    print(product_title[:10])
                    # First sighting of this title: close this browser and
                    # start over on the current URL in a new one. ``break``
                    # ends this frame's loop once the nested call returns.
                    ff.quit()
                    refresh_page(url)
                    break
            # ``==`` compares values. The original ``is`` compared object
            # identity, which in CPython holds only for cached small ints
            # (-5..256) and silently fails for longer item lists.
            if count + 1 == len(items):
                try:
                    print('')
                    print('new page')
                    WebDriverWait(ff, 15).until(EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→'))
                    ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click()
                    # Give the listing time to navigate before reading the URL.
                    time.sleep(10)
                    url = ff.current_url
                    print(url)
                    print('')
                    ff.quit()
                    refresh_page(url)
                except Exception as error:
                    # Also catches anything raised by the nested
                    # refresh_page call above, not only a missing link.
                    print('cannot find ff.find_element(By.PARTIAL_LINK_TEXT, "Next→")')
                    print('Because of... {}'.format(error))
                    ff.quit()

    refresh_page('https://www.amazon.ca/gp/goldbox/ref=gbps_ftr_s-3_4bc8_dct_10-?gb_f_c2xvdC0z=sortOrder:BY_SCORE,discountRanges:10-25%252C25-50%252C50-70%252C70-&pf_rd_p=f5836aee-0969-4c39-9720-4f0cacf64bc8&pf_rd_s=slot-3&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=A3DWYIK6Y9EEQB&pf_rd_r=CQ7KBNXT36G95190QJB1&ie=UTF8')

# Stub left by the author for a detail scraper that has no body yet:
#def extract_info(ff, url):
# Script entry point: first enlarge the user-agent pool, then start the crawl.
fetch_ua_strings()
initiate_crawl()

Вывод на консоль: С Selenium v3.14.0 и Firefox Quantum v62.0.3 , я могу извлечь следующий вывод на консоли:

J.Rosée Si
B.Catcher 
Bluetooth4
FRAM G4164
Major Crim
20% off Oh
True Blood
Prime-Line
Marathon 3
True Blood
B.Catcher 
4 Film Fav
True Blood
Texture Pa
Westinghou
True Blood
ThermoPro 
...
...
...

Примечание: я мог бы оптимизировать ваш код и выполнить те же операции веб-скрейпинга, инициализировав клиент браузера Firefox только один раз и обходя различные продукты и их детали. Но чтобы сохранить вашу логику и ваш замысел, я предложил лишь минимально необходимые изменения.

Скрипт неожиданно прекращает сканирование без ошибок или исключений

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Решение

Заключение

Ссылки

Этот пример использования

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Скрипт неожиданно прекращает сканирование без ошибок или исключений

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Решение

Заключение

Ссылки

Этот пример использования

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Похожие темы