Scrapy does not fetch pages with page > 1, even though the extracted links are correct. My code:
Environment: Linux (Debian 9), Python 3.5, MongoDB, Scrapy, Scrapy-Splash.
import scrapy
import copy
import datetime
import json
import pymongo
from webscrapy.items import WebscrapyItem
from scrapy.conf import settings
from bson.objectid import ObjectId
class YooxSpiderSpider(scrapy.Spider):
    name = 'yoox-spider'
    allowed_domains = ['yoox.com']
    base_url = 'https://www.yoox.com'
    job = {}
    start_url = ''
    splash_url = ''
    connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                     settings['MONGODB_PORT'])
    db = connection[settings['MONGODB_DB']]

    def __init__(self, job_id, *args, **kwargs):
        # let Scrapy finish its own initialisation before loading the job
        super().__init__(*args, **kwargs)
        self.job = self.db.jobs.find_one({'_id': ObjectId(job_id)})
        self.start_url = self.job['start_url']
        self.splash_url = self.job['splash_url']
    def start_requests(self):
        # job['start_url'] is the starting link for the desired category,
        # for example Yoox/Woman or Yoox/Men
        print("------------- start ---------")
        yield scrapy.Request(url=''.join((self.splash_url, self.start_url)), callback=self.parse)
    def parse(self, response):
        # NOTE: the trailing [0] limits the loop to the first category only
        for cat in [response.xpath(
                "//div[@id='teleyooxCategories']/div[@class='teleyoox-section-content']/div[@class='teleyoox-section-padding']/ul[@class='text-size-default']/li")[0]]:
            #url_category = response.urljoin('/render.html?url=https://www.yoox.com' + cat.xpath('./a/@href').extract_first())
            sub_url_category = cat.xpath('./a/@href').extract_first()
            if sub_url_category:
                url_category = ''.join((self.base_url, sub_url_category))
                Item = WebscrapyItem()
                Item['job_id'] = self.job['_id']
                Item['basecat'] = self.job['basecat']
                Item['gender'] = self.job['gender']
                Item['category'] = cat.xpath('./a/text()').extract_first().strip()
                Item['url_category'] = url_category
                yield scrapy.Request(url=''.join((self.splash_url, url_category)), meta={'Item': Item}, callback=self.parse_data, dont_filter=True)
    def parse_data(self, response):
        print('\n\n ++++++++ current page ++++++++ ', response.url, '\n\n')
        # Getting product links and product_id
        for product in response.xpath("//div[@id='itemsGrid']/div[@id='srpage1']/div[@class='col-8-24']"):
            sub_url_product = product.xpath('./div/div/a/@href').extract_first()
            #url_product = response.urljoin('/render.html?url=https://www.yoox.com' + product.xpath('./div/div/a/@href').extract_first())
            if sub_url_product:
                # deep copy so that parallel requests do not share the same Item
                Item = copy.deepcopy(response.meta['Item'])
                product_id = product.xpath('./div/@id').extract_first()
                price = product.xpath("./div/div[@class='itemData text-center']/a[@class='itemlink']/div[@class='price']/span/text()").extract()[-1][2:]
                sizes = product.xpath("./div/div/a[@class='itemlink']/div[@class='colorSize']/div[@class='size text-light']/span/text()").extract()
                available_products = {
                    'basecat': Item['basecat'],
                    'category': Item['category'],
                    'job_id': Item['job_id'],
                    'product_id': product_id,
                }
                #if not self.db.data.find(available_products).count():
                #    print('NEW product: ', product_id)
                cutout_images = [
                    product.xpath("./div/div/a/img/@data-original").extract_first(),
                    product.xpath("./div/div/a/img/@rel").extract_first(),
                ]
                Item['dt'] = datetime.datetime.utcnow()
                Item['product_id'] = product_id
                Item['url_product'] = ''.join((self.base_url, sub_url_product))
                Item['data'] = {
                    'sku': '',
                    'date': '',
                    'cutout_image': cutout_images,
                    'data-category': product.xpath("./div/@data-category").extract_first(),
                    'microcategory': product.xpath("./div/div/a[@class='itemlink']/div[@class='microcategory font-sans']/text()").extract_first().strip(),
                    'description': '',
                    'price': price,
                    # currency is taken from the first character of the product price
                    'currency': product.xpath("./div/div[@class='itemData text-center']/a[@class='itemlink']/div[@class='price']/span/text()").extract()[-1][0],
                    'brand': product.xpath("./div/div/a[@class='itemlink']/div[@class='brand font-bold text-uppercase']/text()").extract_first(),
                    'merchant': '',
                    'sizes': sizes,
                }
                #yield scrapy.Request(url=''.join((self.splash_url, Item['url_product'])), meta={'Item': Item}, callback=self.parse_details, dont_filter=True)
                yield Item

        #next_page_url = response.xpath("//div[@id='navigation-bar-top']/div[@class='col-6-24']/div[@id='pagination-lite']/a[@class='pure-menu-item nextPage js-track-me']/@href").extract_first()
        next_page_url = response.xpath(
            "//div[@id='navigation-bar-bottom']/div[@class='col-16-24']/ul[@class='pagination list-inline pull-right text-center js-pagination']/li[@class='next-page']/a/@href").extract_first()
        if next_page_url:
            print('\n\n ++++++++ next page ++++++++ ', next_page_url, '\n\n')
            yield scrapy.Request(url=''.join((self.splash_url, next_page_url)), callback=self.parse_data, dont_filter=True)
        else:
            print(' ++++++++ NEXT CATEGORY ++++++++ ')
    def parse_details(self, response):
        # deep copy so that concurrent requests do not share the same Item
        Item = copy.deepcopy(response.meta['Item'])
        #other_data = json.loads(response.xpath('//section[@id="product"]/script[@type="application/ld+json"]//text()').extract_first())
        Item['details'] = {
            'header': response.xpath("//div[@id='itemData']/div[@id='js-item-details']/div[@id='itemTitle']/h1/a/text()").extract_first().strip(),
            'price': response.xpath("//div[@id='itemData']/div[@id='js-item-details']/div[@id='item-price']/span[@class='font-bold']/span[1]/text()").extract_first(),
            'priceCurrency': response.xpath("//div[@id='itemData']/div[@id='js-item-details']/div[@id='item-price']/span[@class='font-bold']/span[2]/@content").extract_first(),
            #'colorName': response.xpath("//div[@id='js-item-color-size']/div[@id='itemColors']/div[@class='dataTitleBox font-bold text-uppercase text-size-xs margin-bottom']/span[@class='select-color-size-label']/text()").extract_first(),
            #'reference': response.xpath("//div[@class='info-section']/div[@class='product-info-wrapper _product-info'][1]/p[@class='product-color']/span[2]/text()").extract_first(),
            'description': response.xpath("//div[@id='itemContent']/div[@class='row text-size-default info-2cols']/div[@class='info-col-1 item-info-column col-1-2']/ul/li[@id='itemDescription']/div[@class='info-body font-sans padding-half-top']/text()").extract_first(),
            #'sizeList': response.xpath("//div[@id='js-item-details']/div[@id='js-item-color-size']/div[@id='itemSizes']/ul").extract_first(),
            #'other_data': other_data,
        }
        print('\n\n', Item, '\n\n')
        yield Item
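For context, the MongoDB connection comes from the project settings and the job id is passed on the command line. A minimal sketch of the assumed setup (the values are illustrative, not my real ones):

# settings.py (assumed values, for illustration)
MONGODB_SERVER = 'localhost'
MONGODB_PORT = 27017
MONGODB_DB = 'webscrapy'

and the spider is started with:

scrapy crawl yoox-spider -a job_id=<object_id>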
Parsing works only for the first page of each category, even though there is pagination code in the parse_data callback:
        if next_page_url:
            yield scrapy.Request(url=''.join((self.splash_url, next_page_url)), callback=self.parse_data)
        else:
            pass
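Since splash_url and next_page_url are joined by plain string concatenation, the URL that reaches Splash may not be what it looks like. A minimal sketch with hypothetical values (assuming splash_url is a render.html endpoint) showing the difference percent-encoding makes:

from urllib.parse import quote

# hypothetical values, for illustration only
splash_url = 'http://localhost:8050/render.html?url='
next_page_url = 'https://www.yoox.com/us/women/shoponline/underwear_mc/2#/dept=women&page=2'

# plain concatenation: '&page=2' is parsed as an extra argument to Splash itself,
# and everything after '#' is a fragment that is never sent to the server
print(''.join((splash_url, next_page_url)))

# percent-encoding keeps the whole target URL inside the single 'url' parameter
print(''.join((splash_url, quote(next_page_url, safe=''))))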
The console does print the "next page" message with page=2, but every "current page" URL that is actually fetched turns out to be the first page of a new category :( Please help.
++++++++ next page ++++++++  https://***/us/women/shoponline/underwear_mc/2#/dept=women&gender=D&page=2&attributes=%7b%27ctgr%27%3a%5b%27ntm%27%5d%7d&season=X
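For completeness, a sketch of the same request written with scrapy-splash's SplashRequest, which percent-encodes the target URL by itself (this assumes the scrapy-splash middlewares and SPLASH_URL are configured in settings.py; the wait value is a guess):

from scrapy_splash import SplashRequest

# inside parse_data, replacing the manual ''.join((self.splash_url, ...)) request
if next_page_url:
    yield SplashRequest(
        url=next_page_url,           # encoded by SplashRequest itself
        callback=self.parse_data,
        args={'wait': 1.0},          # the '#/...page=2' part is applied by the
                                     # page's JavaScript, so give it time to render
        dont_filter=True,
    )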