Я пытаюсь получить URL-адрес от PLP и посетить каждый из элементов, чтобы получить определенные ключевые слова из PDP и выгрузить его в файл json. Однако список возвращает только 1 данные. Я подозреваю, что сайт пытается заблокировать действие. Я использую эту программу раз в месяц, чтобы увидеть, какие новые функции добавляются в новые элементы.
Код между "***" - это та часть, с которой у меня проблемы. Он возвращает правильное значение, но возвращает только 1 данные. Как я могу получить больше данных? В приведенном ниже примере я получаю только названия продуктов, чтобы упростить его.
образец URL: "https://store.nike.com/us/en_us/pw/mens-running-shoes/7puZ8yzZoi3"
Фактический элемент
<div class="exp-product-wall clearfix">
::before
<div class="grid-item fullSize" data-pdpurl="https://www.nike.com/t/epic-react-flyknit-2-mens-running-shoe-459stf" data-column-index="0" data-item-index="1">
<div class="grid-item-box">
<div class="grid-item-content">
<div class="grid-item-image">
<div class="grid-item-image-wrapper sprite-sheet sprite-index-1">
<a href="https://www.nike.com/t/epic-react-flyknit-2-mens-running-shoe-459stf">
<img src="https://images.nike.com/is/image/DotCom/pwp_sheet2?$NIKE_PWPx3$&$img0=BQ8928_001&$img1=BQ8928_003&$img2=BQ8928_005">
Ниже рабочий код
import selenium
import json
import time
import re
import string
import requests
import bs4
from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
domain = 'website url goes here'
def prepare_driver(url):
'''Returns a Firefox Webdriver.'''
options = Options()
# options.add_argument('-headless')
driver = webdriver.Chrome(executable_path='location to chromedriver')
driver.get(url)
wait = WebDriverWait(driver, 10).until(EC.presence_of_element_located(
(By.CLASS_NAME, 'product-name ')))
time.sleep(2)
return driver
def fill_form(driver, search_argument):
'''Finds all the input tags in form and makes a POST requests.'''
#search_field = driver.find_element_by_id('q')
#search_field.send_keys(search_argument)
# We look for the search button and click it
#driver.find_element_by_class_name('search__submit')\
#.click()
wait = WebDriverWait(driver, timeout=10).until(
EC.presence_of_all_elements_located(
(By.CLASS_NAME, 'product-name ')))
def scrape_results(driver, n_results):
'''Returns the data from n_results amount of results.'''
products_urls = list()
products_data = list()
***for product_title in driver.find_elements_by_xpath('//div[@class="exp-gridwall-content clearfix"]'):
products_urls.append(product_title.find_element_by_xpath(
'//div[@class="grid-item fullSize"]').get_attribute('data-pdpurl'))***
for url in range(0, n_results):
if url == n_results:
break
url_data = scrape_product_data(driver, products_urls[url])
products_data.append(url_data)
return products_data
def scrape_product_data(driver, product_url):
'''Visits an product page and extracts the data.'''
if driver == None:
driver = prepare_driver(product_url)
driver.get(product_url)
time.sleep(12)
product_fields = dict()
# Get the product name
product_fields['product_name'] = driver.find_element_by_xpath(
'//h1[@id="pdp_product_title"]').get_attribute('textContent')
# .text.strip('name')
return product_fields
if __name__ == '__main__':
try:
driver = prepare_driver(domain)
#fill_form(driver, 'juniole tf')
products_data = scrape_results(driver, 2)
products_data = json.dumps(products_data, indent=4,ensure_ascii=False) #ensure_acii => changes japanese to correct character
with open('data.json', 'w') as f:
f.write(products_data)
finally:
driver.quit()
Желаемый вывод в json:
[
{
"product_name": "Nike Epic React Flyknit 2",
"descr": "The Nike Epic React Flyknit 2 takes a step up from its predecessor with smooth, lightweight performance and a bold look. An updated Flyknit upper conforms to your foot with a minimal, supportive design. Underfoot, durable Nike React technology defies the odds by being both soft and responsive, for comfort that lasts as long as you can run."
},
{
"product_name": "Nike Zoom Fly SP Fast Nathan Bell",
"descr": "The Nike Zoom Fly SP Fast Nathan Bell is part of a collaboration with artist Nathan Bell, featuring hand-drawn graphics that celebrate running as a competition with yourself. It's designed to meet the demands of your toughest tempo runs, long runs and race day with a responsive construction that turns the pressure of each stride into energy return for the next."
}
]