Question

Я пытаюсь отказаться от IMDB и получить информацию о фильмах режиссера Стивена Спилберга или Мартина Скорсезе. Проблема в том, что строка «бюджет» в HTML коде - это div [7] для одних фильмов и div [8] для других. Я пытаюсь написать функцию, которая будет получать данные, если условие выполнено (в этом случае - извлечь, если div содержит слово «бюджет»). Ниже приведен мой код:

# Import the needed packages
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
import numpy as np

### Scrap movies of Steven Spielberg ###
## IMDB_Spider1
# Create the Spider class1
class IMDB_Spider1(scrapy.Spider):
  name = "IMDB_spider1"
  # start_requests method
  def start_requests( self ):
    url = 'https://www.imdb.com/name/nm0000229/?ref_=nv_sr_srsg_0'
    yield scrapy.Request( url = url,
                         callback = self.parse_front )
  # First parsing method
  def parse_front(self, response):
    links_to_follow = response.xpath('//*[contains(@class,"filmo-row even")]/b/a/@href').extract()
    for url in links_to_follow:
      yield response.follow(url = url,
                            callback = self.parse_pages)
 # Second parsing method
  def parse_pages(self, response):
    title = response.css('h1::text').extract_first().strip()
    year = response.css('#titleYear a::text').extract_first()
    rating = response.css('.subtext::text').extract_first().strip() or None
    users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
    votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
    metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()
    countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
    countries = [country.strip() for country in countries]
    languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
    languages = [language.strip() for language in languages]
    actors = response.xpath('//td[not(@class)]/a/text()').extract()
    actors = [actor.strip() for actor in actors]
    tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
    tagline = ''.join(tagline).strip() or None
    directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract() or None
    directors = [director.strip() for director in directors]
    runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
    genres =  response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()
    budget = response.xpath("//*[@id='titleDetails']/div[7]/text()").extract()
    opening_week_us =response.xpath("//*[@id='titleDetails']/div[8]/text()").extract() 
    gross_usa = response.xpath("//*[@id='titleDetails']/div[9]/text()").extract()
    world_wide_box_office = response.xpath("//*[@id='titleDetails']/div[10]/text()").extract()
    imdb_url = response.url.replace('?ref_=adv_li_tt', '')
    mv1_dict[title] = year, rating, users_rating, votes, metascore, countries, languages, actors, tagline, directors, runtime, genres, budget, opening_week_us, gross_usa, world_wide_box_office, imdb_url

## IMDB_Spider2
# Create the Spider class2
class IMDB_Spider2(scrapy.Spider):
  name = "IMDB_spider2"
  # start_requests method
  def start_requests( self ):
    url = 'https://www.imdb.com/name/nm0000229/?ref_=nv_sr_srsg_0'
    yield scrapy.Request( url = url,
                         callback = self.parse_front )
  # First parsing method
  def parse_front(self, response):
    links_to_follow = response.xpath('//*[contains(@class,"filmo-row odd")]/b/a/@href').extract()
    for url in links_to_follow:
      yield response.follow(url = url,
                            callback = self.parse_pages)
 # Second parsing method
  def parse_pages(self, response):
    title = response.css('h1::text').extract_first().strip()
    year = response.css('#titleYear a::text').extract_first()
    rating = response.css('.subtext::text').extract_first().strip() or None
    users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
    votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
    metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()
    countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
    countries = [country.strip() for country in countries]
    languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
    languages = [language.strip() for language in languages]
    actors = response.xpath('//td[not(@class)]/a/text()').extract()
    actors = [actor.strip() for actor in actors]
    tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
    tagline = ''.join(tagline).strip() or None
    directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract() or None
    directors = [director.strip() for director in directors]
    runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
    genres =  response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()
    budget = response.xpath("//*[@id='titleDetails']/div[7]/text()").extract()
    opening_week_us =response.xpath("//*[@id='titleDetails']/div[8]/text()").extract() 
    gross_usa = response.xpath("//*[@id='titleDetails']/div[9]/text()").extract()
    world_wide_box_office = response.xpath("//*[@id='titleDetails']/div[10]/text()").extract()
    imdb_url = response.url.replace('?ref_=adv_li_tt', '')
    mv2_dict[title] = year, rating, users_rating, votes, metascore, countries, languages, actors, tagline, directors, runtime, genres, budget, opening_week_us, gross_usa, world_wide_box_office, imdb_url

### Scrap movies of Martin Scorsese ###
## IMDB_Spider3
# Create the Spider class3
class IMDB_Spider3(scrapy.Spider):
  name = "IMDB_spider3"
  # start_requests method
  def start_requests( self ):
    url = 'https://www.imdb.com/name/nm0000217/?ref_=nv_sr_srsg_0'
    yield scrapy.Request( url = url,
                         callback = self.parse_front )
  # First parsing method
  def parse_front(self, response):
    links_to_follow = response.xpath('//*[contains(@class,"filmo-row even")]/b/a/@href').extract()
    for url in links_to_follow:
      yield response.follow(url = url,
                            callback = self.parse_pages)
 # Second parsing method
  def parse_pages(self, response):
    title = response.css('h1::text').extract_first().strip()
    year = response.css('#titleYear a::text').extract_first()
    rating = response.css('.subtext::text').extract_first().strip() or None
    users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
    votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
    metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()
    countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
    countries = [country.strip() for country in countries]
    languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
    languages = [language.strip() for language in languages]
    actors = response.xpath('//td[not(@class)]/a/text()').extract()
    actors = [actor.strip() for actor in actors]
    tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
    tagline = ''.join(tagline).strip() or None
    directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract() or None
    directors = [director.strip() for director in directors]
    runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
    genres =  response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()
    budget = response.xpath("//*[@id='titleDetails']/div[7]/text()").extract()
    opening_week_us =response.xpath("//*[@id='titleDetails']/div[8]/text()").extract() 
    gross_usa = response.xpath("//*[@id='titleDetails']/div[9]/text()").extract()
    world_wide_box_office = response.xpath("//*[@id='titleDetails']/div[10]/text()").extract()
    imdb_url = response.url.replace('?ref_=adv_li_tt', '')
    mv3_dict[title] = year, rating, users_rating, votes, metascore, countries, languages, actors, tagline, directors, runtime, genres, budget, opening_week_us, gross_usa, world_wide_box_office, imdb_url

## IMDB_Spider4
# Create the Spider class4
class IMDB_Spider4(scrapy.Spider):
  name = "IMDB_spider4"
  # start_requests method
  def start_requests( self ):
    url = 'https://www.imdb.com/name/nm0000217/?ref_=nv_sr_srsg_0'
    yield scrapy.Request( url = url,
                         callback = self.parse_front )
  # First parsing method
  def parse_front(self, response):
    links_to_follow = response.xpath('//*[contains(@class,"filmo-row odd")]/b/a/@href').extract()
    for url in links_to_follow:
      yield response.follow(url = url,
                            callback = self.parse_pages)
 # Second parsing method
  def parse_pages(self, response):
    title = response.css('h1::text').extract_first().strip()
    year = response.css('#titleYear a::text').extract_first()
    rating = response.css('.subtext::text').extract_first().strip() or None
    users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
    votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
    metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()
    countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
    countries = [country.strip() for country in countries]
    languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
    languages = [language.strip() for language in languages]
    actors = response.xpath('//td[not(@class)]/a/text()').extract()
    actors = [actor.strip() for actor in actors]
    tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
    tagline = ''.join(tagline).strip() or None
    directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract() or None
    directors = [director.strip() for director in directors]
    runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
    genres =  response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()
    budget = response.xpath("//*[@id='titleDetails']/div[7]/text()").extract()
    opening_week_us =response.xpath("//*[@id='titleDetails']/div[8]/text()").extract() 
    gross_usa = response.xpath("//*[@id='titleDetails']/div[9]/text()").extract()
    world_wide_box_office = response.xpath("//*[@id='titleDetails']/div[10]/text()").extract()
    imdb_url = response.url.replace('?ref_=adv_li_tt', '')
    mv4_dict[title] = year, rating, users_rating, votes, metascore, countries, languages, actors, tagline, directors, runtime, genres, budget, opening_week_us, gross_usa, world_wide_box_office, imdb_url

Janib Soomro · Answer 1 · 16 января 2020

Попробуйте что-то вроде:

//*[@id='titleDetails']/div[contains(.,'Color')]

Это позволит получить данные, когда требуемый атрибут найден.

Если условие для Scrapy Spider

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

1 Ответ

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Если условие для Scrapy Spider

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

1 Ответ

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Нет похожих вопросов