How to scrape an item's position number in Scrapy - PullRequest
0 votes
/ 15 November 2018

How do I get each item's position number from this site?

Website: http://books.toscrape.com/

Please see this screenshot:

https://prnt.sc/lim3zl

# -*- coding: utf-8 -*-

import scrapy


class ToscrapeSpider(scrapy.Spider):
    name = 'toscrape'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        lists = response.css('li.col-xs-6')
        for lis in lists:
            title = lis.xpath('.//h3//@title').extract_first()
            price = lis.xpath('.//p[@class="price_color"]//text()').extract_first()
            # I need to know how to scrape their position
            position = ''

            yield {
                'Title': title,
                'Price': price,
                'Position': position,
            }
        # next=response.xpath('//*[@class="next"]//@href').extract_first()
        # next=response.urljoin(next)
        # if next:
        #     yield scrapy.Request(next)

Answers [ 4 ]

0 votes
/ 16 November 2018

Yash Pokar

Can you check this code, please?

How can I apply your method to this Selenium + Scrapy code?

# -*- coding: utf-8 -*-

from time import sleep
from urllib.parse import urljoin

from scrapy import Spider
from scrapy.http import Request
from scrapy.selector import Selector
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


class ToscrapeSpider(Spider):
    name = 'toscrape'
    allowed_domains = ['books.toscrape.com']
    # start_urls = ['http://books.toscrape.com/']

    def start_requests(self):
        self.driver = webdriver.Chrome()
        self.driver.get('http://books.toscrape.com/')
        sel = Selector(text=self.driver.page_source)
        lists = sel.css('li.col-xs-6')
        for i, lis in enumerate(lists):
            position = i + 1  # note: restarts at 1 on every page
            link = lis.xpath('.//h3//a/@href').extract_first()
            # urljoin resolves the relative href against the current page URL;
            # hard-coding the /catalogue/ prefix breaks on the first page
            link = urljoin(self.driver.current_url, link)
            yield Request(link, meta={'position': position}, callback=self.parse_page)

        while True:
            try:
                next_page = self.driver.find_element_by_xpath('//*[@class="next"]//a')
                next_page.click()
                self.logger.info('Sleeping for 10 seconds.')
                sleep(10)  # give the next page time to render before reading page_source
                sel = Selector(text=self.driver.page_source)
                lists = sel.css('li.col-xs-6')
                for i, lis in enumerate(lists):
                    position = i + 1  # restarts at 1 on every page
                    link = lis.xpath('.//h3//a/@href').extract_first()
                    link = urljoin(self.driver.current_url, link)
                    yield Request(link, meta={'position': position}, callback=self.parse_page)

            except NoSuchElementException:
                self.logger.info('No more pages to load.')
                self.driver.quit()
                break

    def parse_page(self, response):
        title = response.xpath('//h1//text()').extract_first()
        position = response.meta['position']

        yield {
            'Title': title,
            'Position': position,
        }
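
A note on the code above: position = i + 1 restarts at 1 on every page. If you want positions that keep counting across pages, like the other answers produce, something along these lines should work (an untested sketch; the spider name and the offset variable are made up here for illustration):

# Sketch: same Selenium approach, but with a running offset so positions
# keep counting across pages instead of restarting on each one.
from time import sleep
from urllib.parse import urljoin

from scrapy import Spider
from scrapy.http import Request
from scrapy.selector import Selector
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


class ToscrapeGlobalPositionSpider(Spider):
    name = 'toscrape_global_position'
    allowed_domains = ['books.toscrape.com']

    def start_requests(self):
        self.driver = webdriver.Chrome()
        self.driver.get('http://books.toscrape.com/')
        offset = 0  # number of books yielded on previous pages

        while True:
            sel = Selector(text=self.driver.page_source)
            lists = sel.css('li.col-xs-6')
            for i, lis in enumerate(lists):
                link = lis.xpath('.//h3//a/@href').extract_first()
                link = urljoin(self.driver.current_url, link)
                yield Request(link, meta={'position': offset + i + 1},
                              callback=self.parse_page)
            offset += len(lists)

            try:
                next_page = self.driver.find_element_by_xpath('//li[@class="next"]/a')
            except NoSuchElementException:
                self.logger.info('No more pages to load.')
                self.driver.quit()
                break
            next_page.click()
            sleep(10)  # crude wait for the next page to render

    def parse_page(self, response):
        yield {
            'Title': response.xpath('//h1//text()').extract_first(),
            'Position': response.meta['position'],
        }

Folding both per-page loops into the single while loop also removes the duplicated extraction code.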
0 votes
/ 15 November 2018

You can simply use a class variable to keep track of the position (this works here because the pages are crawled one after another, so items are counted in order), for example:

import scrapy

class ToscrapeSpider(scrapy.Spider):

    name = 'toscrape'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    position = 0

    def parse(self, response):

        lists = response.css('li.col-xs-6')

        for lis in lists:

            title = lis.xpath('.//h3//@title').extract_first()
            price = lis.xpath('.//p[@class="price_color"]//text()').extract_first()

            self.position += 1

            yield {
                'Title': title,
                'Price': price,
                'Position': self.position,
            }

        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page))

Then:

scrapy runspider myspider.py -o out.json

The out.json file contains:

[
{"Title": "A Light in the Attic", "Price": "\u00a351.77", "Position": 1},
{"Title": "Tipping the Velvet", "Price": "\u00a353.74", "Position": 2},
{"Title": "Soumission", "Price": "\u00a350.10", "Position": 3},
{"Title": "Sharp Objects", "Price": "\u00a347.82", "Position": 4},
{"Title": "Sapiens: A Brief History of Humankind", "Price": "\u00a354.23", "Position": 5},
{"Title": "The Requiem Red", "Price": "\u00a322.65", "Position": 6},
{"Title": "The Dirty Little Secrets of Getting Your Dream Job", "Price": "\u00a333.34", "Position": 7},
{"Title": "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull", "Price": "\u00a317.93", "Position": 8},
{"Title": "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics", "Price": "\u00a322.60", "Position": 9},
{"Title": "The Black Maria", "Price": "\u00a352.15", "Position": 10},
{"Title": "Starving Hearts (Triangular Trade Trilogy, #1)", "Price": "\u00a313.99", "Position": 11},
{"Title": "Shakespeare's Sonnets", "Price": "\u00a320.66", "Position": 12},
{"Title": "Set Me Free", "Price": "\u00a317.46", "Position": 13},
{"Title": "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", "Price": "\u00a352.29", "Position": 14},
{"Title": "Rip it Up and Start Again", "Price": "\u00a335.02", "Position": 15},
{"Title": "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991", "Price": "\u00a357.25", "Position": 16},
{"Title": "Olio", "Price": "\u00a323.88", "Position": 17},
{"Title": "Mesaerion: The Best Science Fiction Stories 1800-1849", "Price": "\u00a337.59", "Position": 18},
{"Title": "Libertarianism for Beginners", "Price": "\u00a351.33", "Position": 19},
{"Title": "It's Only the Himalayas", "Price": "\u00a345.17", "Position": 20}
]
0 votes
/ 16 November 2018

Another option is to pass the running count along in the request meta, so the counter travels with the pagination chain instead of living on the spider:
import scrapy


class ToscrapeSpider(scrapy.Spider):
    name = 'toscrape'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        products_count = response.meta.get('products_count', 0)

        products = response.xpath('//article[@class="product_pod"]')

        for idx, product in enumerate(products):
            _image_container = product.xpath('.//div[@class="image_container"]')

            detail_page_url = _image_container.xpath('.//a/@href').extract_first()
            image = _image_container.xpath('.//img/@src').extract_first()

            name = product.xpath('.//h3/a/@title').extract_first()

            ratings = product.xpath('.//p[contains(@class, "star-rating")]/@class').extract_first()
            ratings = ratings.replace('star-rating', '').strip() if ratings else ratings

            price = product.xpath('.//p[@class="price_color"]/text()').extract_first()
            availability = product.xpath('.//p[@class="instock availability"]//text()').extract()
            availability = list(map(lambda x: x.replace('\n', '').replace('\t', '').strip(), availability))
            availability = list(filter(lambda x: x, availability))

            availability = availability[0] if availability else availability

            yield dict(
                position=products_count + idx + 1,
                name=name,
                availability=availability,
                price=price,
                ratings=ratings,
                image=image,
                pdp_url=detail_page_url,
            )

        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()

        if next_page:
            yield response.follow(next_page, meta=dict(products_count=products_count + len(products)))
0 votes
/ 15 November 2018

Try using enumerate in the loop; that should solve the problem. As far as I remember, it looks something like this:

for i, lis in enumerate(lists):
    position = i + 1
...
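
For completeness, a minimal runnable sketch of that idea applied to the question's spider (note that enumerate restarts on each page, so this gives a per-page position; combine it with the running-count approaches above if you need a global one):

import scrapy


class ToscrapeSpider(scrapy.Spider):
    name = 'toscrape'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        # enumerate(..., start=1) numbers items 1, 2, 3, ... within each page
        for position, lis in enumerate(response.css('li.col-xs-6'), start=1):
            yield {
                'Title': lis.xpath('.//h3//@title').extract_first(),
                'Price': lis.xpath('.//p[@class="price_color"]/text()').extract_first(),
                'Position': position,
            }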