Я пытаюсь получить сведения о продукте из https://www.laptopsdirect.co.uk. Я могу проанализировать необходимые поля, такие как название продукта и ссылки, с помощью функции сканирования сканирования. Но когда я бегу за пауком, результаты просто случайны. Вот мой код.
import scrapy
from ..items import LaptopdirectItem
from bs4 import BeautifulSoup
import re
from datetime import datetime
class LaptopdirectSpiderSpider(scrapy.Spider):
name = 'laptopdirectspider'
page_number = 1
shelf_number = 1
main_item_url = ''
start_urls = ['https://www.laptopsdirect.co.uk/ct/laptops-and-netbooks/laptops']
def __init__(self):
self.declare_xpath()
pass
#All the XPaths the spider needs to know go here
def declare_xpath(self):
#self.getAllItemsXpath = '//*[@id="products"]/div[1]/div[5]/div[1]/h3/a/@href'
self.PriceXpath = '//*[@id="topcontent"]/div[3]/span[1]/img/@alt'
#self.PriceIncOfferXpath = '//*[@id="topcontent"]/div[3]/div[2]/div/div/div/p/label/text()'
self.ProductNameXpath = '//*[@id="ProductTitle"]/h1/span[1]/text()'
self.ProductLocationXpath = '//*[@id="productpagetop"]/div[1]'
self.RemoteKeyXpath = '//*[@id="prodPage_QuickFind"]/text()'
self.DescriptionXpath = '//*[@id="topSummary"]/ul'
#self.InStockYesOrNoXpath = '//*[@id="availability_1425911"]/div[1]/span[1]/text()'
#self.TechSpecsXpath = '//*[@id="specData"]'
#self.FeaturesXpath = "//*[@id='tab-content-product-description']/div/div[3]/ul/li/text()"
#self.BackupFeaturesXpath = "//*[@id='tab-content-product-description']/div/div[contains (@class, 'long-description')]//p/text()"
#self.SpecsNameXpath = "//*[@id='product-attribute-specs-table']/tbody/tr/th/text()"
#self.SpecsXpath = "//*[@id='product-attribute-specs-table']/tbody/tr/td/text()"
#From the main page, call parse_category on each category link
#From each subcategory page, call parse_main_item on each product page link
def parse(self,response):
for href in response.css('.offerboxtitle').xpath('@href').extract():
url = response.urljoin(href)
LaptopdirectSpiderSpider.main_item_url = url
yield scrapy.Request(url,callback=self.parse_main_item)
def parse_main_item(self,response):
item = LaptopdirectItem()
#grab each element using XPath, then clean and format text.
collection_date_of_attributes = str(datetime.now().replace(microsecond=0))
collection_date_of_price = str(datetime.now().replace(microsecond=0))
price = response.xpath(self.PriceXpath).extract_first()
price = price.replace('£','')
#price_inc_offer = scrapy.Field()
product_location = response.xpath(self.ProductLocationXpath).extract()
product_location = ','.join(map(str, product_location))
product_location = self.cleanText(self.parseText(product_location))
product_name = response.xpath(self.ProductNameXpath).extract_first()
remotekey = response.xpath(self.RemoteKeyXpath).extract_first()
remotekey = ''.join([t for t in remotekey if t.isdigit()])
#retailer = scrapy.Field()
shelf_index = LaptopdirectSpiderSpider.shelf_number
LaptopdirectSpiderSpider.shelf_number += 1
availability_str = f'availability_{remotekey}'
print('***************************************************',availability_str)
instore_xpath = f'//*[@id="{availability_str}"]/div[1]/span[1]/text()'
print('***************************************************',instore_xpath)
instock_yes_or_no = response.xpath(instore_xpath).extract_first()
product_description = response.xpath(self.DescriptionXpath).extract()
product_description = ','.join(map(str, product_description))
product_description = self.cleanText(self.parseText(product_description))
#tech_spec = response.xpath(self.TechSpecsXpath).extract()
#tech_spec = ','.join(map(str, tech_spec))
#tech_spec = self.cleanText(self.parseText(tech_spec))
productpageurl = LaptopdirectSpiderSpider.main_item_url
#Put each element into its item attribute.
item['collection_date_of_attributes'] = collection_date_of_attributes
item['collection_date_of_price'] = collection_date_of_price
item['price'] = price
item['product_location'] = product_location
item['product_name'] = product_name
item['remotekey'] = remotekey
item['shelf_index'] = shelf_index
item['instock_yes_or_no'] = instock_yes_or_no
item['product_description'] = product_description
#item['tech_spec'] = tech_spec
item['productpageurl'] = productpageurl
return item
То, что я имею в виду под случайным языком, похоже на предположим, что на первой странице 20 ноутбуков, и этот скрипт возвращает мне 20 ноутбуков, но несколько разных. Может ли кто-нибудь помочь мне с этим?