Question

У меня есть программа для скрапинга, и во время сбора данных мне нужно нажимать кнопку «Далее». Около недели назад я задал здесь вопрос о том, как это сделать, и получил несколько хороших ответов, но код из этих ответов работает лишь частично. Он обрабатывает страницу 1 и страницу 2, но затем вместо страницы 3 переходит на последнюю страницу — страницу 10, и я не могу понять почему.

import csv
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from ..items import GameItem
def process_csv(csv_file):
    """Parse the url / Splash host / user-agent rows of an open CSV file.

    The first row is treated as a header and skipped.  Rows whose url column
    (column 0) is empty are skipped entirely.  When the host column (column 1)
    or the user-agent column (column 2) is empty, the last non-empty value
    seen on an earlier kept row is reused; if there is none yet, ``None`` is
    stored instead of failing with ``UnboundLocalError``.

    Args:
        csv_file: An iterable of CSV lines, e.g. an open text file.

    Returns:
        A list of dicts with the keys ``"url"``, ``"ip"`` and ``"ua"``.  A
        non-empty host ``h`` is stored as ``"http://h:8050"``.
    """
    data = []
    reader = csv.reader(csv_file)
    # The default keeps an empty file from leaking StopIteration into the
    # generator that calls this function.
    next(reader, None)
    ip = None
    useragent = None
    for fields in reader:
        # Pad short or blank rows so that missing columns read as empty.
        url, host, agent = (fields + ["", "", ""])[:3]
        if not url:
            continue  # skip the whole row if the url column is empty
        if host:
            # adding http and port because this is the needed scheme
            ip = "http://" + host + ":8050"
        if agent:
            useragent = agent
        data.append({"url": url, "ip": ip, "ua": useragent})
    return data
class MySpider(Spider):
    """Spider that requests every url of the proxy CSV through Splash and yields card names."""

    name = 'splash_spider'  # Name of Spider

    # notice that we don't need to define start_urls
    # just make sure to get all the urls you want to scrape inside start_requests function

    def start_requests(self):
        """Yield one SplashRequest per url / Splash host / user-agent row of the CSV file."""
        # get the file path of the csv file that contains the pairs from the settings.py;
        # newline="" is how the csv module expects files given to csv.reader to be opened
        with open(self.settings["PROXY_CSV_FILE"], mode="r", newline="") as csv_file:
            # targets is a list of dictionaries like this -> {url: str, ua: str, ip: str}
            targets = process_csv(csv_file)

        for req in targets:
            # no need to create custom middlewares: the user agent is passed through
            # the headers param and the Splash instance through the splash_url param
            yield SplashRequest(
                url=req["url"],
                callback=self.parse,
                args={"wait": 3},
                headers={"User-Agent": req["ua"]},
                splash_url=req["ip"],
            )

    # Scraping
    def parse(self, response):
        """Yield the card name of every table row, then follow the pager link."""
        for game in response.css("tr"):
            # Card Name
            yield {
                'card_name': game.css("a.card_popup::text").get(),
            }

        # Pagination is resolved once per page, after all rows have been emitted.
        # NOTE(review): nth-child(8) picks the pager link by position, not by its
        # label, so which page it leads to depends on how many links precede it.
        next_page = response.css('table+ div a:nth-child(8)::attr("href")').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

ОБНОВЛЕНИЕ # 1

SplashSpider.py

import csv
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from ..items import GameItem

# process the csv file so the url + ip address + useragent pairs are the same as defined in the file
# returns a list of dictionaries, example:
# [ {'url': 'http://www.starcitygames.com/catalog/category/Rivals%20of%20Ixalan',
#    'ip': 'http://204.152.114.244:8050',
#    'ua': "Mozilla/5.0 (BlackBerry; U; BlackBerry 9320; en-GB) AppleWebKit/534.11"},
#    ...
# ]
def process_csv(csv_file):
    """Turn the rows of an open CSV file into url / Splash address / user-agent dicts.

    The header row is skipped, and so is every row whose url column (column 0)
    is empty.  An empty ip column (column 1) or user-agent column (column 2)
    falls back to the last non-empty value from an earlier kept row, or to
    ``None`` when no such value exists yet (previously an
    ``UnboundLocalError``).

    Args:
        csv_file: An iterable of CSV lines, e.g. an open text file.

    Returns:
        A list of ``{"url": ..., "ip": ..., "ua": ...}`` dicts, where a
        non-empty ip column ``x`` becomes ``"http://x:8050"``.
    """
    data = []
    reader = csv.reader(csv_file)
    # With a default, a file that has no header row yields an empty result
    # instead of raising StopIteration.
    next(reader, None)
    ip = None
    useragent = None
    for fields in reader:
        # Rows with fewer than three columns are padded with empty strings.
        url, address, agent = (fields + ["", "", ""])[:3]
        if not url:
            continue  # skip the whole row if the url column is empty
        if address:
            # adding http and port because this is the needed scheme
            ip = "http://" + address + ":8050"
        if agent:
            useragent = agent
        data.append({"url": url, "ip": ip, "ua": useragent})
    return data


class MySpider(Spider):
    """Spider that requests every url of the proxy CSV through Splash and yields GameItem rows."""

    name = 'splash_spider'  # Name of Spider

    # notice that we don't need to define start_urls
    # just make sure to get all the urls you want to scrape inside start_requests function

    # getting all the url + ip address + useragent pairs then request them
    def start_requests(self):
        """Yield one SplashRequest per url / Splash host / user-agent row of the CSV file."""
        # get the file path of the csv file that contains the pairs from the settings.py;
        # newline="" is how the csv module expects files given to csv.reader to be opened
        with open(self.settings["PROXY_CSV_FILE"], mode="r", newline="") as csv_file:
            # targets is a list of dictionaries like this -> {url: str, ua: str, ip: str}
            targets = process_csv(csv_file)

        for req in targets:
            # no need to create custom middlewares: the user agent is passed through
            # the headers param and the Splash instance through the splash_url param
            yield SplashRequest(
                url=req["url"],
                callback=self.parse,
                args={"wait": 3},
                headers={"User-Agent": req["ua"]},
                splash_url=req["ip"],
            )

    # Scraping
    def parse(self, response):
        """Yield one GameItem per result row, then follow the last link of the pager."""
        for game in response.css("tr[class^=deckdbbody]"):
            # A fresh item per row, so an item that was already yielded is never
            # overwritten by the fields of a later row.
            item = GameItem()
            # Card Name
            item["card_name"] = game.css("a.card_popup::text").extract_first()
            item["stock"] = game.css("td[class^=deckdbbody].search_results_8::text").extract_first()
            item["price"] = game.css("td[class^=deckdbbody].search_results_9::text").extract_first()

            yield item

        # ::attr(href) makes .get() return the link target itself; without it
        # .get() returns the whole <a> element serialized as text.
        next_page = response.css(
            '#content > div:last-of-type > a[href]:last-of-type::attr(href)'
        ).get()
        if next_page is not None:
            # NOTE(review): response.follow builds a plain Request, not a
            # SplashRequest - confirm follow-up pages do not need rendering.
            yield response.follow(next_page, self.parse)

ОБНОВЛЕНИЕ # 2 (не работает правильно)

# Attempt 1: .get() returns the matched <a> element serialized as a string
# (or None when nothing matches), so the .xpath() call on the following line
# is made on a str rather than on a selector. The None check also comes only
# after next_page has already been dereferenced.
next_page = response.css('#content > div:last-of-type > a[href]:last-of-type').get()
next_page_url = next_page.xpath("@href").get()
if next_page is not None:
    yield response.follow(next_page_url, self.parse)

# Attempt 2: next_page holds the markup of the whole <a> element, not its
# href, so response.follow() is handed that markup instead of a link target.
next_page = response.css('#content > div:last-of-type > a[href]:last-of-type').get()
if next_page is not None:
    yield response.follow(next_page, self.parse)

Matthew L Daniel · Answer 1 · 01 июня 2019

# nth-child(8) selects the link by its position among its siblings, not by its label.
next_page = response.css('table+ div a:nth-child(8)::attr("href")').get()

Вам почти наверняка не нужен nth-child(8): вам нужен последний div и его последний элемент a с атрибутом href, то есть:

# Last <a> carrying an href inside the last <div> directly under #content.
response.css('#content > div:last-of-type > a[href]:last-of-type')

Если вы хотите подстраховаться, проверьте текст найденного элемента <a> и убедитесь, что он содержит слово Next.

tnorth2620 · Answer 2 · 06 июня 2019

Вот правильный код: нужно было использовать xpath вместо css. Теперь всё работает нормально.

next_page = response.xpath('//a[contains(., "- Next>>")]/@href').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

Попытка нажать кнопку «Далее» во время скрапинга

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Попытка нажать кнопку «Далее» во время скрапинга

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Нет похожих вопросов