Я столкнулся с очень необычным поведением, которое никак не могу понять. По какой-то причине выходные данные различаются в зависимости от способа получения ссылки на следующую страницу. Почему так происходит?
Ниже показан паук, а под ним — два разных способа получения ссылки на следующую страницу и вывод для каждого из них.
Паук
import scrapy
class AmazonSpider(scrapy.Spider):
    """Scrape discounted items from an Amazon.ca listing page and its successors.

    Each yielded item is a dict with the keys ``idx``, ``img``, ``ttl``,
    ``pdr``, ``rtg`` and ``vts``; ``prc`` (current price), ``rtl`` (struck
    retail price) and ``dsc`` (discount percentage) are added only when the
    corresponding values could be extracted.
    """

    name = 'amazon'
    allowed_domains = ['amazon.ca']
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
    }
    # 404 responses are listed here so that parse() receives them and can
    # skip them explicitly (see the status check at the top of parse()).
    handle_httpstatus_list = [404]

    def start_requests(self):
        """Yield the initial request for every configured listing page."""
        pages = ['https://www.amazon.ca/b/ref=sr_aj?node=2055586011']
        for page in pages:
            yield scrapy.Request(url=page, callback=self.parse)

    @staticmethod
    def _parse_price(text):
        """Return the second whitespace-separated token of *text* as a float.

        Thousands separators (``,``) are removed before conversion. Returns
        ``None`` when *text* is ``None``, has fewer than two tokens, or the
        token is not a number, so one malformed price does not abort the page.
        """
        if text is None:
            return None
        try:
            return float(text.split()[1].replace(',', ''))
        except (IndexError, ValueError):
            return None

    def parse(self, response):
        """Yield one dict per discounted item, then follow the next-page link.

        Args:
            response: the listing page response; 404 responses are ignored.

        Yields:
            Item dicts (see the class docstring) followed by at most one
            request for the next listing page.
        """
        if response.status == 404:
            return

        # Item containers that show a struck-through (retail) price.
        cont_path = '//div[@class="a-row a-spacing-none"]/span[@class="a-size-small a-color-secondary a-text-strike"]/ancestor::div[@class="s-item-container"]'

        # Select the containers once and query each one with relative XPaths
        # instead of re-running the full document query for every field.
        for idx, item in enumerate(response.xpath(cont_path), start=1):
            imag_resp = item.xpath('./div[@class="a-row a-spacing-base"]/div/div/a/img/@src').extract_first()
            titl_resp = item.xpath('./div[@class="a-row a-spacing-mini"][1]/div[1]/a/@title').extract_first()
            prod_resp = item.xpath('./div[@class="a-row a-spacing-mini"][1]/div[2]/span[2]/text()').extract_first()
            pric_resp = item.xpath('./div[@class="a-row a-spacing-mini"][2]/div[1]/a/span/text()').extract_first()
            retl_resp = item.xpath('./div[@class="a-row a-spacing-mini"][2]/div[1]/span[@class="a-size-small a-color-secondary a-text-strike"]/text()').extract_first()
            thrd_resp = item.xpath('./div[@class="a-row a-spacing-mini"][2]/div[4]/a/span[2]/text()').extract_first()
            rtng_resp = item.xpath('./div[@class="a-row a-spacing-none"]/span/span/a/i/span/text()').extract_first()
            vots_resp = item.xpath('./div[@class="a-row a-spacing-none"]/a/text()').extract_first()

            obj = {
                'idx': idx,
                'img': imag_resp,
                'ttl': titl_resp,
                'pdr': prod_resp,
                'rtg': rtng_resp,
                'vts': vots_resp,
            }

            price = self._parse_price(pric_resp)
            if price is not None:
                obj['prc'] = price

            retail = self._parse_price(retl_resp)
            if retail is not None:
                obj['rtl'] = retail

            # Fall back to the alternative price node only when the main
            # price is missing.
            if 'prc' not in obj:
                fallback = self._parse_price(thrd_resp)
                if fallback is not None:
                    obj['prc'] = fallback

            # A zero retail price would make the discount undefined.
            if 'prc' in obj and obj.get('rtl'):
                obj['dsc'] = str(int(round(100 - (obj['prc'] / obj['rtl'] * 100)))) + '%'

            yield obj

        # Follow pagination via the link's href attribute. The XPath is passed
        # as a variable (not a quoted name) and targets @href rather than the
        # <a> element, so a URL is followed; skip when there is no next page.
        next_path = '//a[@id="pagnNextLink"]/@href'
        next_href = response.xpath(next_path).extract_first()
        if next_href is not None:
            yield response.follow(next_href, callback=self.parse)
Метод № 1
# NOTE(review): the XPath selects the <a> element itself, not its @href, so
# extract_first() presumably returns the serialized tag (or None when the link
# is absent) and that value is what response.follow() receives as the URL;
# confirm against the Scrapy selector and Response.follow documentation.
yield response.follow(response.xpath('//a[@id="pagnNextLink"]').extract_first(), callback=self.parse)
Первый вывод
[
{
"rtg": "5 out of 5 stars",
"idx": 1,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41p9bSkUHlL._AA160_.jpg",
"rtl": 779.99,
"pdr": "Pentax",
"vts": "2",
"prc": 639.99,
"ttl": "Pentax 21790 DA 55mm F1.4 SDM Lens with Case",
"dsc": "18%"
},
{
"rtg": "4.1 out of 5 stars",
"idx": 2,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/31gMEIPs+CL._AA160_.jpg",
"rtl": 33.85,
"pdr": "Sony",
"vts": "660",
"prc": 30.35,
"ttl": "Sony MDRZX110 Over-Ear Headphones (White)",
"dsc": "10%"
},
{
"rtg": "4.2 out of 5 stars",
"idx": 3,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41YquQ15BdL._AA160_.jpg",
"rtl": 179.95,
"pdr": "Monster",
"vts": "81",
"prc": 90.35,
"ttl": "Monster BackFloat High Definition Bluetooth Wireless Waterproof Floating Speaker, Black\/Blue",
"dsc": "50%"
},
{
"rtg": "3.9 out of 5 stars",
"idx": 4,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41znh7URsTL._AA160_.jpg",
"rtl": 179.95,
"pdr": "Monster",
"vts": "120",
"prc": 127.38,
"ttl": "Monster Clarity HD On-Ear Bluetooth Wireless Headphones with Digital USB Audio, Black",
"dsc": "29%"
},
{
"rtg": "4.3 out of 5 stars",
"idx": 5,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41ZbZ3iSUHL._AA160_.jpg",
"rtl": 499.99,
"pdr": "Olympus",
"vts": "22",
"ttl": "Olympus 45mm F1.8 Interchangeable Lens for Olympus\/Panasonic Micro Cameras (Black)"
},
{
"rtg": "4.1 out of 5 stars",
"idx": 6,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/51Ctux-mgzL._AA160_.jpg",
"rtl": 13.19,
"pdr": "Western Digital",
"vts": "301",
"prc": 9.99,
"ttl": "Western Digital My Passport Carrying Case - Black (WDBABK0000NBK-WRSN)",
"dsc": "24%"
},
{
"rtg": "4.4 out of 5 stars",
"idx": 7,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41LaVPK8wPL._AA160_.jpg",
"rtl": 19.99,
"pdr": "Canon",
"vts": "20",
"prc": 18.99,
"ttl": "Genuine Canon CLI-42 Ink Tank, Magenta - 6386B002",
"dsc": "5%"
}
]
[
{
"rtg": "5 out of 5 stars",
"idx": 1,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41p9bSkUHlL._AA160_.jpg",
"rtl": 779.99,
"pdr": "Pentax",
"vts": "2",
"prc": 639.99,
"ttl": "Pentax 21790 DA 55mm F1.4 SDM Lens with Case",
"dsc": "18%"
},
{
"rtg": "4.1 out of 5 stars",
"idx": 2,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/31gMEIPs+CL._AA160_.jpg",
"rtl": 33.85,
"pdr": "Sony",
"vts": "660",
"prc": 30.35,
"ttl": "Sony MDRZX110 Over-Ear Headphones (White)",
"dsc": "10%"
},
{
"rtg": "4.2 out of 5 stars",
"idx": 3,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41YquQ15BdL._AA160_.jpg",
"rtl": 179.95,
"pdr": "Monster",
"vts": "81",
"prc": 90.35,
"ttl": "Monster BackFloat High Definition Bluetooth Wireless Waterproof Floating Speaker, Black\/Blue",
"dsc": "50%"
},
{
"rtg": "3.9 out of 5 stars",
"idx": 4,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41znh7URsTL._AA160_.jpg",
"rtl": 179.95,
"pdr": "Monster",
"vts": "120",
"prc": 127.38,
"ttl": "Monster Clarity HD On-Ear Bluetooth Wireless Headphones with Digital USB Audio, Black",
"dsc": "29%"
},
{
"rtg": "4.3 out of 5 stars",
"idx": 5,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41ZbZ3iSUHL._AA160_.jpg",
"rtl": 499.99,
"pdr": "Olympus",
"vts": "22",
"ttl": "Olympus 45mm F1.8 Interchangeable Lens for Olympus\/Panasonic Micro Cameras (Black)"
},
{
"rtg": "4.1 out of 5 stars",
"idx": 6,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/51Ctux-mgzL._AA160_.jpg",
"rtl": 13.19,
"pdr": "Western Digital",
"vts": "301",
"prc": 9.99,
"ttl": "Western Digital My Passport Carrying Case - Black (WDBABK0000NBK-WRSN)",
"dsc": "24%"
},
{
"rtg": "4.4 out of 5 stars",
"idx": 7,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41LaVPK8wPL._AA160_.jpg",
"rtl": 19.99,
"pdr": "Canon",
"vts": "20",
"prc": 18.99,
"ttl": "Genuine Canon CLI-42 Ink Tank, Magenta - 6386B002",
"dsc": "5%"
}
]
Метод № 2
# XPath for the href attribute of the "next page" link.
next_path = '//a[@id="pagnNextLink"]/@href'
# NOTE(review): 'next_path' is quoted below, so the literal string is used as
# the XPath expression and the variable defined above is never read. That
# expression presumably matches nothing, leaving next_resp as None so that no
# follow-up request is issued; verify.
next_resp = response.xpath('next_path').extract_first()
if next_resp is not None:
next_resp = response.urljoin(next_resp)
yield scrapy.Request(next_resp, callback=self.parse)
Второй вывод
[
{
"rtg": "4.2 out of 5 stars",
"idx": 1,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41YquQ15BdL._AA160_.jpg",
"rtl": 179.95,
"pdr": "Monster",
"vts": "81",
"prc": 90.35,
"ttl": "Monster BackFloat High Definition Bluetooth Wireless Waterproof Floating Speaker, Black\/Blue",
"dsc": "50%"
},
{
"rtg": "3.9 out of 5 stars",
"idx": 2,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41znh7URsTL._AA160_.jpg",
"rtl": 179.95,
"pdr": "Monster",
"vts": "120",
"prc": 127.38,
"ttl": "Monster Clarity HD On-Ear Bluetooth Wireless Headphones with Digital USB Audio, Black",
"dsc": "29%"
},
{
"rtg": "4.3 out of 5 stars",
"idx": 3,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41ZbZ3iSUHL._AA160_.jpg",
"rtl": 499.99,
"pdr": "Olympus",
"vts": "22",
"ttl": "Olympus 45mm F1.8 Interchangeable Lens for Olympus\/Panasonic Micro Cameras (Black)"
},
{
"rtg": "4.2 out of 5 stars",
"idx": 4,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/51mInH2UC-L._AA160_.jpg",
"rtl": 8.93,
"pdr": "Swingline",
"vts": "24",
"prc": 7.48,
"ttl": "Swingline Optima Premium Staples, 0.25 Inch Leg Length, 45 Sheet Capacity, 3,750 Staples per Box, Silver (S7035556)",
"dsc": "16%"
}
]
Почему выходные данные зависят от способа получения ссылки на следующую страницу?