Проблема с возвратом результатов данных с сайта, необходимого для входа, и обхода нескольких страниц с данными о продукте. - PullRequest
0 голосов
/ 13 марта 2020
Imports required to run

'''
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from shutil import which
import logging
import time
from ..items import B2BItem
'''

начало паука

'' '

class ProductsSpider(scrapy.Spider):
    name = 'products'
    page_number = 2
    allowed_domains = ['b2b.mile-stone.jp/']
    start_urls = ['https://b2b.mile-stone.jp/ja/search/0/status=preOrder,tentativePreOrder/']

    # userName = '******'
    # password = '******'

'''

начало селена для входа в систему, затем переход на страницы к начальной странице с ключевым словом поиска опции' '' def init (self): chrome_options = Options () chrome_options.add_argument ("- headless")

        chrome_path = which("chromedriver")

        driver = webdriver.Chrome(executable_path=chrome_path, options=chrome_options)
        # to set screen-size resolution
        driver.set_window_size(1920, 1080)
        driver.get("https://b2b.mile-stone.jp/ja/v1/login/")

        login_input = driver.find_element_by_xpath("//input[@name='account']")
        login_input.send_keys('******')

        password_input = driver.find_element_by_xpath("//input[@name='passphrase']")
        password_input.send_keys('******')

        password_input.send_keys(Keys.ENTER)




        time.sleep(5)

        # search_input = driver.find_element_by_xpath("//input[@id='param-free-form']")
        # search_input.send_keys('KADOKAWA')
        # search_input.send_keys(Keys.ENTER)



        site_map = driver.find_element_by_xpath("//div[@class='footer footer- 
        links']/div/div/div/div[3]/ul/li/a")
        site_map.click()

        time.sleep(10)

        product_path = driver.find_element_by_xpath("//div[@class='sitemap']/ul/li[4]/ul/li[1]/a")
        product_path.click()


        time.sleep(5)
        # Pre-order
        pre_order = driver.find_element_by_xpath("//ul[@id='param-status-form']/li[1]/label")
        pre_order.click()
        time.sleep(5)

        # In-stock
        # in_stock = driver.find_element_by_xpath("//ul[@id='param-status-form']/li[2]/input")
        # in_stock.click()
        # time.sleep(5)

        # Back-order
        # back_order = driver.find_element_by_xpath("//ul[@id='param-status-form']/li[3]/input")
        # back_order.click()
        # time.sleep(5)

        # Waiting for Stock
        # waiting_for_stock = driver.find_element_by_xpath("//ul[@id='param-status-form']/li[4]/input")
        # waiting_for_stock.click()
        # time.sleep(5)

        # Tentative Pre-order
        tentative_pre_order = driver.find_element_by_xpath("//ul[@id='param-status-form']/li[5]/label")
        tentative_pre_order.click()
        time.sleep(5)



        # print(driver.page_source)

        self.html = driver.page_source

        driver.close()
    '''

конец прогона без заголовка для запуска селена "crawler" '' 'def parse (self, response ): resp = Selector (text = self. html)

    #items inside product page
    items = B2BItem()

    product_url = resp.xpath("//h3[@class='h5 text-primary h-fix-32']/a/@href").extract()
    product_name = resp.xpath("//h3[@class='h5 text-primary h-fix-32']/a/font/font/text()").extract()
    stockStatus_name = resp.xpath("//h4[@class='small border-gray- 
    light']/p/span/font/font/text()").extract()
    company_name = resp.xpath("//h4[@class='small border-gray- 
    light']/p/span[2]/font/font/text()").extract()
    retail_price = resp.xpath("//div[@class='row small']/dl/dd/span/font/font/text()").extract() 
    wholesale_price = resp.xpath("//div[@class='row 
    small']/dl/dd[2]/span/span[2]/font/font/text()").extract()
    stock_date = resp.xpath("//div[@class='caption']/div/dl/dd/font/font/text()").extract()
    scheduledRelease_date = resp.xpath("//div[@class='caption']/div/dl/dd[2]/font/font/text()").extract()
    reservationDeadline_date = 
    resp.xpath("//div[@class='caption']/div/dl/dd[3]/font/font/text()").extract()
    image1_url = resp.xpath("//div[@class='sash-inline']/a/img/@src").extract()
    sales_restriction = resp.xpath("//form/p[@class='moq m-8 bg-danger text- 
      danger']/font/font/text()").extract()

      items['product_url'] = product_url
      items['product_name'] = product_name
      items['stockStatus_name'] = stockStatus_name
      items['company_name'] = company_name
      items['retail_price'] = retail_price
      items['wholesale_price'] = wholesale_price
      items['stock_date'] = stock_date
      items['scheduledRelease_date'] = scheduledRelease_date
      items['reservationDeadline_date'] = reservationDeadline_date
      items['image1_url'] = image1_url
      items['sales_restriction'] = sales_restriction

      yield items

  '''

по-прежнему возникает проблема при попытке получить информацию отсюда, кажется, что некоторые не отображаются из-за изменения входа в систему / изменения запроса на сайте? или, скорее всего, просто не знаю, как его перехватить

начало запроса на следующую страницу, здесь не удается перейти на страницу 2 и т. д. '' '

    next_page = 'https://b2b.mile-stone.jp/ja/search/0/status=preOrder,tentativePreOrder/?page=' + 
    str(ProductsSpider.page_number)
      if ProductsSpider.page_number <= 318:
        ProductsSpider.page_number += 1
        yield response.follow(next_page, callback = self.parse)
'''
...