Я немного скорректировал код, и он, кажется, работает. Изменения: добавлен оператор
import random
, поскольку модуль random используется в коде, и без этого импорта скрипт не будет работать.
Внутри цикла product_title
эти строки удалены:
ff.quit()
, refresh_page(url)
и break
Оператор ff.quit()
вызывает фатальную ошибку соединения, из-за которой выполнение скрипта прерывается.
Также is
изменено на ==
в условии if count + 1 == len(items):
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
import random
""" Set Global Variables
"""
# Pool of user-agent strings. create_webdriver_instance() picks one at random
# for every browser it starts; fetch_ua_strings() appends more entries to it.
# It is seeded with one desktop Chrome UA so the pool is never empty.
ua_strings = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36']
# Titles of deals that have already been reported, used by the crawl to skip
# duplicates across pages.
already_scraped_product_titles = []
""" Create Instances of WebDriver
"""
def create_webdriver_instance():
    """Start a headless Firefox WebDriver with a randomly chosen user agent.

    The user agent is drawn from the module-level ``ua_strings`` pool and
    applied through the ``general.useragent.override`` profile preference.

    Returns:
        A new ``webdriver.Firefox`` instance. The caller owns it and must
        call ``quit()`` when done.
    """
    ua_string = random.choice(ua_strings)

    profile = webdriver.FirefoxProfile()
    profile.set_preference('general.useragent.override', ua_string)

    options = Options()
    options.add_argument('--headless')
    # Previously `options` was built but never handed to the driver, so the
    # --headless flag was silently ignored. Attach the profile to the options
    # and pass the options so that both the UA override and headless apply.
    options.profile = profile
    return webdriver.Firefox(options=options)
""" Construct List of UA Strings
"""
def fetch_ua_strings():
    """Extend the module-level ``ua_strings`` pool from an online UA table.

    Loads the "most common user agents" page, reads every
    ``<td class="useragent">`` cell and appends its text to ``ua_strings``,
    skipping mobile and Trident (Internet Explorer) user agents.

    Returns:
        None. ``ua_strings`` is modified in place.
    """
    ff = create_webdriver_instance()
    try:
        ff.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
        # Use the By-based lookup, consistent with the rest of this file.
        for cell in ff.find_elements(By.XPATH, '//td[@class="useragent"]'):
            # `.text` is a round trip to the browser; read it once per cell.
            text = cell.text
            if not text:
                # An empty entry would later be picked as a blank UA override.
                continue
            # Real user agents spell it "Mobile", so compare case-insensitively;
            # the previous lowercase-only test never filtered anything.
            if 'mobile' not in text.lower() and 'Trident' not in text:
                ua_strings.append(text)
    finally:
        # Always shut the browser down, even if the page fails to load.
        ff.quit()
""" Build Lists of Product Page URLs
"""
def initiate_crawl(start_url=None):
    """Crawl the deals listing page by page and print newly seen deal titles.

    Each page is loaded in a fresh browser (and therefore with a freshly
    chosen user agent), sorted by "Discount - High to Low", and scanned for
    deals that show both a struck-through price and an "Add to Cart" control.
    Titles not yet in ``already_scraped_product_titles`` are recorded there
    and their first 10 characters are printed.

    Pagination is driven by a loop instead of recursion, so crawl depth is
    not bounded by the interpreter's recursion limit and a failure on a later
    page is reported once rather than once per enclosing page.

    Args:
        start_url: URL of the first listing page. Defaults to the Amazon.ca
            "goldbox" deals URL this script was written for.

    Returns:
        None.

    Raises:
        Exception: whatever Selenium raises while loading, sorting or
            reading a page. The browser for that page is closed first.
            Failure to find or click the "Next" link is not raised; it is
            printed and ends the crawl.
    """
    if start_url is None:
        start_url = 'https://www.amazon.ca/gp/goldbox/ref=gbps_ftr_s-3_4bc8_dct_10-?gb_f_c2xvdC0z=sortOrder:BY_SCORE,discountRanges:10-25%252C25-50%252C50-70%252C70-&pf_rd_p=f5836aee-0969-4c39-9720-4f0cacf64bc8&pf_rd_s=slot-3&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=A3DWYIK6Y9EEQB&pf_rd_r=CQ7KBNXT36G95190QJB1&ie=UTF8'

    def refresh_page(url):
        """Scrape one listing page; return the next page's URL, or None."""
        ff = create_webdriver_instance()
        try:
            ff.get(url)
            # Open the sort drop-down and choose the highest discounts first.
            ff.find_element(By.XPATH, '//*[@id="FilterItemView_sortOrder_dropdown"]/div/span[2]/span/span/span/span').click()
            ff.find_element(By.XPATH, '//a[contains(text(), "Discount - High to Low")]').click()
            items = WebDriverWait(ff, 15).until(
                EC.visibility_of_all_elements_located((By.XPATH, '//div[contains(@id, "100_dealView_")]'))
            )
            print(items)

            for item in items:
                slashed_price = item.find_elements(By.XPATH, './/span[contains(@class, "a-text-strike")]')
                # NOTE: to also match grouped deals, the XPath could be
                # './/*[contains(text(), "Add to Cart") or contains(text(), "View Deal")]'.
                active_deals = item.find_elements(By.XPATH, './/*[contains(text(), "Add to Cart")]')
                if not slashed_price or not active_deals:
                    continue
                product_title = item.find_element(By.ID, 'dealTitle').text
                if product_title not in already_scraped_product_titles:
                    already_scraped_product_titles.append(product_title)
                    print(product_title[:10])

            # Only the "Next" lookup is guarded: a missing link marks the end
            # of the listing, while other errors should surface to the caller.
            try:
                print('')
                print('new page')
                WebDriverWait(ff, 15).until(
                    EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→')
                )
                ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click()
                time.sleep(3)  # give the next page time to load before reading its URL
                next_url = ff.current_url
            except Exception as error:
                print('cannot find ff.find_element(By.PARTIAL_LINK_TEXT, "Next→")')
                print('Because of... {}'.format(error))
                return None

            print(next_url)
            print('')
            # Reloading the identical URL would re-scrape the same page forever.
            if next_url == url:
                return None
            return next_url
        finally:
            # Close this page's browser exactly once, on success or failure.
            ff.quit()

    url = start_url
    while url is not None:
        url = refresh_page(url)
#def extract_info(ff, url):
# Script entry point: first extend the user-agent pool, then start the crawl,
# so that every browser opened during the crawl picks its UA from that pool.
fetch_ua_strings()
initiate_crawl()