Question

Я довольно плохо знаком со Scrapy и ищу решение для своего учебного упражнения. Я пытаюсь обойти страницы фильмов из чарта IMDB, чтобы получить место в рейтинге (rank), название, год и сюжет. Мне удаётся переходить по ссылкам и парсить страницы фильмов, но я не могу найти способ получить место в рейтинге для каждого фильма.

В настоящее время мой код выглядит так:

import scrapy
from tutorial.items import IMDB_dict # We need this so that Python knows about the item object

class MppaddressesSpider(scrapy.Spider):
    """Crawl the IMDb Top chart and scrape the page of every listed movie."""

    name = "mppaddresses"  # The name of this spider

    # The allowed domain and the URLs where the spider should start crawling:
    allowed_domains = ["imdb.com"]
    start_urls = ['https://www.imdb.com/chart/top/']

    def parse(self, response):
        """Handle the chart page and request every movie page found on it.

        Args:
            response: The response for one of ``start_urls``.

        Yields:
            scrapy.Request: One request per movie link; its response is
            handled by ``get_details``.
        """
        for rank in response.xpath(".//tbody[@class='lister-list']/tr/td[@class='titleColumn']/text()").extract():
            # Collapse the whitespace runs surrounding the rank text.
            rank = " ".join(rank.split())
            item = IMDB_dict()
            item['rank'] = rank
            # NOTE: this item is neither yielded nor handed to get_details,
            # so the rank collected here never reaches the scraped output.

        for url in response.xpath(".//tbody[@class='lister-list']/tr/td[@class='titleColumn']/a/@href").extract():
            # Construct an absolute URL by combining the response's URL with
            # a possibly relative URL:
            full_url = response.urljoin(url)
            print("FOOOOOOOOOnd URL: "+full_url)

            # Tell Scrapy to fetch 'full_url' and to call get_details() with
            # the content of that URL:
            yield scrapy.Request(full_url, callback=self.get_details)

    def get_details(self, response):
        """Scrape one movie page requested by ``parse``.

        Args:
            response: The response for a single movie page.

        Yields:
            IMDB_dict: An item with ``name`` (title), ``phone`` (release
            year) and ``email`` (plot summary). ``rank`` is not set here.
        """
        # NOTE: extract_first() returns None when an XPath matches nothing,
        # in which case the .strip() calls below raise AttributeError.
        item = IMDB_dict()
        item['name'] = response.xpath(".//div[@class='title_bar_wrapper']/div[@class='titleBar']/div[@class='title_wrapper']/h1/text()").extract_first().strip("\t\r\n '\"")
        item['phone'] = response.xpath(".//div[@class='titleBar']/div[@class='title_wrapper']/h1/span[@id='titleYear']/a/text()").extract_first().strip("\t\r\n '\"")
        item['email'] = response.xpath(".//div[@class='plot_summary ']/div[@class='summary_text']/text()").extract_first().strip("\t\r\n '\"")
        # Return that item to the main spider method:
        yield item

Кроме того, мой items.py содержит:

import scrapy

class IMDB_dict(scrapy.Item):
    """Item holding the data scraped for one IMDb movie."""

    # NOTE(review): 'phone' and 'email' look like leftover names; the spider's
    # get_details callback stores the release year and the plot summary there.
    rank = scrapy.Field()  # rank text taken from the chart's 'titleColumn' cell
    name = scrapy.Field()  # movie title (text of the page's <h1>)
    phone = scrapy.Field()  # release year (text of the 'titleYear' link)
    email = scrapy.Field()  # plot summary (text of the 'summary_text' div)

Основной вопрос: как получить место в рейтинге (rank), соответствующее названию фильма?

Последний вопрос (если это возможно): Я могу получить доступ к URL-адресам, как я делал, когда они относительные (с помощью urljoin), но я не могу найти способ доступа к URL-адресам, когда они абсолютные ...

Большое спасибо за вашу помощь.

С наилучшими пожеланиями

gangabass · Answer 1 · 06 сентября 2018

Вам нужно передать rank в обратный вызов get_details с помощью meta:

def parse(self, response):
    """Request every movie page on the chart, forwarding its rank in ``meta``.

    Args:
        response: The response for the chart page.

    Yields:
        scrapy.Request: One request per chart row, handled by
        ``get_details``; ``meta["rank"]`` carries the row's rank.
    """
    title_cells = response.xpath(".//tbody[@class='lister-list']/tr/td[@class='titleColumn']")

    for cell in title_cells:
        # The rank is the first run of digits among the cell's own text nodes.
        rank = cell.xpath('./text()').re_first(r'(\d+)')

        # Rank and link come from the same cell, so they stay paired.
        href = cell.xpath('./a/@href').extract_first()
        absolute_url = response.urljoin(href)
        print("FOOOOOOOOOnd URL: " + href)

        request_meta = {"rank": rank}
        yield scrapy.Request(
            absolute_url,
            callback=self.get_details,
            meta=request_meta,
        )

def get_details(self, response):
    """Build one ``IMDB_dict`` item from a movie page.

    Args:
        response: The response for a single movie page; ``response.meta``
            must contain the ``"rank"`` passed along by ``parse``.

    Yields:
        IMDB_dict: An item with ``rank``, ``name`` (title), ``phone``
        (release year) and ``email`` (plot summary). A field whose XPath
        matches nothing is set to an empty string.
    """
    # extract_first() returns None when an XPath matches nothing, and
    # None.strip() raises AttributeError. An empty-string default keeps the
    # item (with a blank field) instead of losing the whole page.
    strip_chars = "\t\r\n '\""

    item = IMDB_dict()
    item['rank'] = response.meta["rank"]
    item['name'] = response.xpath(".//div[@class='title_bar_wrapper']/div[@class='titleBar']/div[@class='title_wrapper']/h1/text()").extract_first(default='').strip(strip_chars)
    item['phone'] = response.xpath(".//div[@class='titleBar']/div[@class='title_wrapper']/h1/span[@id='titleYear']/a/text()").extract_first(default='').strip(strip_chars)
    item['email'] = response.xpath(".//div[@class='plot_summary ']/div[@class='summary_text']/text()").extract_first(default='').strip(strip_chars)
    # Return that item to the main spider method:
    yield item

UPDATE: если вы проверите журналы, то найдёте такую ошибку:

AttributeError: 'NoneType' object has no attribute 'strip'

Иногда .extract_first() возвращает None, а вызвать strip() у None нельзя. Рекомендую использовать Item Loaders из Scrapy.

Scrapy: сбор нескольких элементов с двух уровней страниц

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

1 Ответ

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Scrapy: сбор нескольких элементов с двух уровней страниц

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

1 Ответ

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Нет похожих вопросов