Scrapy does not work for pages with pagination > 1, even though the generated links are correct
0 votes
25 June 2019

Scrapy does not work for pages with page > 1, even though the generated links are correct. My code:

Environment: Linux (Debian 9), Python 3.5, MongoDB, Scrapy, Scrapy-Splash.
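Judging by how splash_url is concatenated with the target URLs below, the job document stored in MongoDB presumably looks roughly like this (the field values are illustrative assumptions, not taken from the question):

# Hypothetical job document; splash_url is assumed to be a prefix for
# Splash's render.html endpoint
{
    '_id': ObjectId('...'),
    'basecat': 'women',
    'gender': 'D',
    'start_url': 'https://www.yoox.com/us/women/shoponline',
    'splash_url': 'http://localhost:8050/render.html?url='
}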

import scrapy
import copy
import datetime
import json
import pymongo

from webscrapy.items import WebscrapyItem
from scrapy.utils.project import get_project_settings
from bson.objectid import ObjectId

# scrapy.conf is deprecated; read the project settings explicitly
settings = get_project_settings()


class YooxSpiderSpider(scrapy.Spider):
    name = 'yoox-spider'
    allowed_domains = ['yoox.com']
    base_url = 'https://www.yoox.com'
    job = {}
    start_url = ''
    splash_url = ''
    connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                     settings['MONGODB_PORT'])
    db = connection[settings['MONGODB_DB']]



    def __init__(self, job_id, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Load the job document that configures this crawl
        self.job = self.db.jobs.find_one({'_id': ObjectId(job_id)})
        self.start_url = self.job['start_url']
        self.splash_url = self.job['splash_url']


    def start_requests(self):
        # job['start_url'] - This is the starting link for the desired category, for example Yoox/Woman or Yoox/Men
        print("------------- start ---------")
        yield scrapy.Request(url=''.join((self.splash_url, self.start_url)), callback=self.parse)


    def parse(self, response):

        # NOTE: the trailing [0] limits the loop to the first category <li> only
        for cat in [response.xpath(
                "//div[@id='teleyooxCategories']/div[@class='teleyoox-section-content']/div[@class='teleyoox-section-padding']/ul[@class='text-size-default']/li")[0]]:

            #url_category = response.urljoin('/render.html?url=https://www.yoox.com' + cat.xpath('./a/@href').extract_first())
            sub_url_category = cat.xpath('./a/@href').extract_first()
            if sub_url_category:
                url_category = ''.join((self.base_url, sub_url_category))
                Item = WebscrapyItem()
                Item['job_id'] = self.job['_id']
                Item['basecat'] = self.job['basecat']
                Item['gender'] = self.job['gender']
                Item['category'] = cat.xpath('./a/text()').extract_first().strip()
                Item['url_category'] = url_category

                yield scrapy.Request(url=''.join((self.splash_url, url_category)), meta={'Item': Item}, callback=self.parse_data, dont_filter=True)


    def parse_data(self, response):
        print('')
        print('')
        print(' ++++++++ current page ++++++++ ', response.url)
        print('')
        print('')
        # Getting product references and product_id
        for product in response.xpath("//div[@id='itemsGrid']/div[@id='srpage1']/div[@class='col-8-24']"):

            sub_url_product = product.xpath('./div/div/a/@href').extract_first()
            #url_product = response.urljoin('/render.html?url=https://www.yoox.com' + product.xpath('./div/div/a/@href').extract_first())

            if sub_url_product:

                Item = copy.deepcopy(response.meta['Item'])
                product_id = product.xpath('./div/@id').extract_first()
                # The last text node holds the price; [2:] strips the leading
                # currency symbol and space
                price = product.xpath("./div/div[@class='itemData text-center']/a[@class='itemlink']/div[@class='price']/span/text()").extract()[-1][2:]
                sizes = product.xpath("./div/div/a[@class='itemlink']/div[@class='colorSize']/div[@class='size text-light']/span/text()").extract()
                available_products = {
                    'basecat': Item['basecat'],
                    'category': Item['category'],
                    'job_id': Item['job_id'],
                    'product_id': product_id,
                    }
                #if not self.db.data.find(available_products).count():
                #print('NEW product: ', product_id)
                cutout_images = [
                    product.xpath("./div/div/a/img/@data-original").extract_first(),
                    product.xpath("./div/div/a/img/@rel").extract_first(),
                ]

                Item['dt'] = datetime.datetime.utcnow()
                Item['product_id'] = product_id
                Item['url_product'] = ''.join((self.base_url, sub_url_product))
                Item['data'] = {
                    'sku':'',
                    'date':'',
                    'cutout_image': cutout_images,
                    'data-category': product.xpath("./div/@data-category").extract_first(),
                    'microcategory': product.xpath("./div/div/a[@class='itemlink']/div[@class='microcategory font-sans']/text()").extract_first().strip(),
                    'description':'',
                    'price': price,
                    # currency: taken from the first character of the price string
                    'currency': product.xpath("./div/div[@class='itemData text-center']/a[@class='itemlink']/div[@class='price']/span/text()").extract()[-1][0],
                    'brand': product.xpath("./div/div/a[@class='itemlink']/div[@class='brand font-bold text-uppercase']/text()").extract_first(),
                    'merchant':'',
                    'sizes':sizes,
                    #response.xpath().extract_first()
                }

                #yield scrapy.Request(url=''.join((self.splash_url, Item['url_product'])), meta={'Item': Item}, callback=self.parse_details, dont_filter=True)

                yield Item

        #next_page_url = response.xpath("//div[@id='navigation-bar-top']/div[@class='col-6-24']/div[@id='pagination-lite']/a[@class='pure-menu-item nextPage js-track-me']/@href").extract_first()
        next_page_url = response.xpath(
            "//div[@id='navigation-bar-bottom']/div[@class='col-16-24']/ul[@class='pagination list-inline pull-right text-center js-pagination']/li[@class='next-page']/a/@href").extract_first()

        if next_page_url:
            print('')
            print('')
            print(' ++++++++ next page ++++++++ ', next_page_url)
            print('')
            print('')
            yield scrapy.Request(url=''.join((self.splash_url, next_page_url)), callback=self.parse_data, dont_filter=True)
        else:
            print(' ++++++++ NEXT CATEGORY ++++++++ ')


    def parse_details(self, response):
        # Deep copy so that data from concurrent responses does not get mixed together
        Item = copy.deepcopy(response.meta['Item'])

        #other_data = json.loads(response.xpath('//section[@id="product"]/script[@type="application/ld+json"]//text()').extract_first())
        Item['details'] = {
            'header': response.xpath("//div[@id='itemData']/div[@id='js-item-details']/div[@id='itemTitle']/h1/a/text()").extract_first().strip(),
            'price': response.xpath("//div[@id='itemData']/div[@id='js-item-details']/div[@id='item-price']/span[@class='font-bold']/span[1]/text()").extract_first(),
            'priceCurrency': response.xpath("//div[@id='itemData']/div[@id='js-item-details']/div[@id='item-price']/span[@class='font-bold']/span[2]/@content").extract_first(),
            #'colorName': response.xpath("//div[@id='js-item-color-size']/div[@id='itemColors']/div[@class='dataTitleBox font-bold text-uppercase text-size-xs margin-bottom']/span[@class='select-color-size-label']/text()").extract_first(),

            #'reference': response.xpath("//div[@class='info-section']/div[@class='product-info-wrapper _product-info'][1]/p[@class='product-color']/span[2]/text()").extract_first(),

            'description': response.xpath("//div[@id='itemContent']/div[@class='row text-size-default info-2cols']/div[@class='info-col-1 item-info-column col-1-2']/ul/li[@id='itemDescription']/div[@class='info-body font-sans padding-half-top']/text()").extract_first(),
            #'sizeList': response.xpath("//div[@id='js-item-details']/div[@id='js-item-color-size']/div[@id='itemSizes']/ul").extract_first(),

            #'other_data': other_data,
        }


        print('')
        print('')
        print(Item)
        print('')
        print('')

        yield Item

Parsing only works for the first page of every category, even though there is code that follows the pagination via callback=self.parse_data:

        if next_page_url:
            yield scrapy.Request(url=''.join((self.splash_url, next_page_url)), callback=self.parse_data)
        else:
            pass
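
For comparison, when the scrapy-splash middleware is enabled, the next-page request is usually issued through SplashRequest, which encodes and wraps the target URL itself instead of relying on string concatenation. A minimal sketch, assuming SPLASH_URL and the scrapy-splash downloader middlewares are configured in settings.py:

from scrapy_splash import SplashRequest

# ...inside parse_data:
if next_page_url:
    # urljoin resolves a relative href against the page that was actually
    # rendered; SplashRequest embeds the result safely into the Splash call
    yield SplashRequest(
        url=response.urljoin(next_page_url),
        callback=self.parse_data,
        args={'wait': 1.0},
        dont_filter=True,
    )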

The console shows the message with the new page = 2 URL, but every "current page" that is actually fetched turns out to be a new category (((. Please help me.

++++++++ next page ++++++++ https://***/us/women/shoponline/underwear_mc/2#/dept=women&gender=D&page=2&attributes=%7b%27ctgr%27%3a%5b%27ntm%27%5d%7d&season=X

...
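
One detail about the logged URL: after plain string concatenation with a render.html?url= prefix, the whole #/dept=women&gender=D&page=2... tail becomes a fragment of the outer Splash URL, and URL fragments are never transmitted over HTTP. Percent-encoding the target URL keeps it intact inside the url= parameter. A quick sketch (the Splash prefix value is an assumption):

from urllib.parse import quote

splash_prefix = 'http://localhost:8050/render.html?url='  # assumed value
next_page = 'https://www.yoox.com/us/women/shoponline/underwear_mc/2#/dept=women&gender=D&page=2'

# Plain concatenation: the '#...' tail is treated as a fragment of the
# outer render.html URL and is dropped before the request is sent
print(splash_prefix + next_page)

# Percent-encoding keeps the entire target URL inside the url= parameter
print(splash_prefix + quote(next_page, safe=''))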