Я пытаюсь отказаться от IMDB и получить информацию о фильмах режиссера Стивена Спилберга или Мартина Скорсезе. Проблема в том, что строка «бюджет» в HTML коде - это div [7] для одних фильмов и div [8] для других. Я пытаюсь написать функцию, которая будет получать данные, если условие выполнено (в этом случае - извлечь, если div содержит слово «бюджет»). Ниже приведен мой код:
# Import the needed packages
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
import numpy as np
### Scrap movies of Steven Spielberg ###
## IMDB_Spider1
# Create the Spider class1
class IMDB_Spider1(scrapy.Spider):
name = "IMDB_spider1"
# start_requests method
def start_requests( self ):
url = 'https://www.imdb.com/name/nm0000229/?ref_=nv_sr_srsg_0'
yield scrapy.Request( url = url,
callback = self.parse_front )
# First parsing method
def parse_front(self, response):
links_to_follow = response.xpath('//*[contains(@class,"filmo-row even")]/b/a/@href').extract()
for url in links_to_follow:
yield response.follow(url = url,
callback = self.parse_pages)
# Second parsing method
def parse_pages(self, response):
title = response.css('h1::text').extract_first().strip()
year = response.css('#titleYear a::text').extract_first()
rating = response.css('.subtext::text').extract_first().strip() or None
users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()
countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
countries = [country.strip() for country in countries]
languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
languages = [language.strip() for language in languages]
actors = response.xpath('//td[not(@class)]/a/text()').extract()
actors = [actor.strip() for actor in actors]
tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
tagline = ''.join(tagline).strip() or None
directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract() or None
directors = [director.strip() for director in directors]
runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
genres = response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()
budget = response.xpath("//*[@id='titleDetails']/div[7]/text()").extract()
opening_week_us =response.xpath("//*[@id='titleDetails']/div[8]/text()").extract()
gross_usa = response.xpath("//*[@id='titleDetails']/div[9]/text()").extract()
world_wide_box_office = response.xpath("//*[@id='titleDetails']/div[10]/text()").extract()
imdb_url = response.url.replace('?ref_=adv_li_tt', '')
mv1_dict[title] = year, rating, users_rating, votes, metascore, countries, languages, actors, tagline, directors, runtime, genres, budget, opening_week_us, gross_usa, world_wide_box_office, imdb_url
## IMDB_Spider2
# Create the Spider class2
class IMDB_Spider2(scrapy.Spider):
name = "IMDB_spider2"
# start_requests method
def start_requests( self ):
url = 'https://www.imdb.com/name/nm0000229/?ref_=nv_sr_srsg_0'
yield scrapy.Request( url = url,
callback = self.parse_front )
# First parsing method
def parse_front(self, response):
links_to_follow = response.xpath('//*[contains(@class,"filmo-row odd")]/b/a/@href').extract()
for url in links_to_follow:
yield response.follow(url = url,
callback = self.parse_pages)
# Second parsing method
def parse_pages(self, response):
title = response.css('h1::text').extract_first().strip()
year = response.css('#titleYear a::text').extract_first()
rating = response.css('.subtext::text').extract_first().strip() or None
users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()
countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
countries = [country.strip() for country in countries]
languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
languages = [language.strip() for language in languages]
actors = response.xpath('//td[not(@class)]/a/text()').extract()
actors = [actor.strip() for actor in actors]
tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
tagline = ''.join(tagline).strip() or None
directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract() or None
directors = [director.strip() for director in directors]
runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
genres = response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()
budget = response.xpath("//*[@id='titleDetails']/div[7]/text()").extract()
opening_week_us =response.xpath("//*[@id='titleDetails']/div[8]/text()").extract()
gross_usa = response.xpath("//*[@id='titleDetails']/div[9]/text()").extract()
world_wide_box_office = response.xpath("//*[@id='titleDetails']/div[10]/text()").extract()
imdb_url = response.url.replace('?ref_=adv_li_tt', '')
mv2_dict[title] = year, rating, users_rating, votes, metascore, countries, languages, actors, tagline, directors, runtime, genres, budget, opening_week_us, gross_usa, world_wide_box_office, imdb_url
### Scrap movies of Martin Scorsese ###
## IMDB_Spider3
# Create the Spider class3
class IMDB_Spider3(scrapy.Spider):
name = "IMDB_spider3"
# start_requests method
def start_requests( self ):
url = 'https://www.imdb.com/name/nm0000217/?ref_=nv_sr_srsg_0'
yield scrapy.Request( url = url,
callback = self.parse_front )
# First parsing method
def parse_front(self, response):
links_to_follow = response.xpath('//*[contains(@class,"filmo-row even")]/b/a/@href').extract()
for url in links_to_follow:
yield response.follow(url = url,
callback = self.parse_pages)
# Second parsing method
def parse_pages(self, response):
title = response.css('h1::text').extract_first().strip()
year = response.css('#titleYear a::text').extract_first()
rating = response.css('.subtext::text').extract_first().strip() or None
users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()
countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
countries = [country.strip() for country in countries]
languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
languages = [language.strip() for language in languages]
actors = response.xpath('//td[not(@class)]/a/text()').extract()
actors = [actor.strip() for actor in actors]
tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
tagline = ''.join(tagline).strip() or None
directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract() or None
directors = [director.strip() for director in directors]
runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
genres = response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()
budget = response.xpath("//*[@id='titleDetails']/div[7]/text()").extract()
opening_week_us =response.xpath("//*[@id='titleDetails']/div[8]/text()").extract()
gross_usa = response.xpath("//*[@id='titleDetails']/div[9]/text()").extract()
world_wide_box_office = response.xpath("//*[@id='titleDetails']/div[10]/text()").extract()
imdb_url = response.url.replace('?ref_=adv_li_tt', '')
mv3_dict[title] = year, rating, users_rating, votes, metascore, countries, languages, actors, tagline, directors, runtime, genres, budget, opening_week_us, gross_usa, world_wide_box_office, imdb_url
## IMDB_Spider4
# Create the Spider class4
class IMDB_Spider4(scrapy.Spider):
name = "IMDB_spider4"
# start_requests method
def start_requests( self ):
url = 'https://www.imdb.com/name/nm0000217/?ref_=nv_sr_srsg_0'
yield scrapy.Request( url = url,
callback = self.parse_front )
# First parsing method
def parse_front(self, response):
links_to_follow = response.xpath('//*[contains(@class,"filmo-row odd")]/b/a/@href').extract()
for url in links_to_follow:
yield response.follow(url = url,
callback = self.parse_pages)
# Second parsing method
def parse_pages(self, response):
title = response.css('h1::text').extract_first().strip()
year = response.css('#titleYear a::text').extract_first()
rating = response.css('.subtext::text').extract_first().strip() or None
users_rating = response.xpath('//span[contains(@itemprop, "ratingValue")]/text()').extract_first()
votes = response.xpath('//span[contains(@itemprop, "ratingCount")]/text()').extract_first()
metascore = response.xpath('//div[contains(@class, "metacriticScore")]/span/text()').extract_first()
countries = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Country")]/a/text()').extract()
countries = [country.strip() for country in countries]
languages = response.xpath('//div[contains(@class, "txt-block") and contains(.//h4, "Language")]/a/text()').extract()
languages = [language.strip() for language in languages]
actors = response.xpath('//td[not(@class)]/a/text()').extract()
actors = [actor.strip() for actor in actors]
tagline = response.xpath('//div[contains(string(), "Tagline")]/text()').extract()
tagline = ''.join(tagline).strip() or None
directors = response.xpath("//div[contains(@class, 'credit_summary_item') and contains(.//h4, 'Director')]/a/text()").extract() or None
directors = [director.strip() for director in directors]
runtime = response.xpath("//div[contains(@class, 'txt-block') and contains(.//h4, 'Runtime')]/time/text()").extract_first() or None
genres = response.xpath("//*[@id='titleStoryLine']/div[4]/a/text()").extract()
budget = response.xpath("//*[@id='titleDetails']/div[7]/text()").extract()
opening_week_us =response.xpath("//*[@id='titleDetails']/div[8]/text()").extract()
gross_usa = response.xpath("//*[@id='titleDetails']/div[9]/text()").extract()
world_wide_box_office = response.xpath("//*[@id='titleDetails']/div[10]/text()").extract()
imdb_url = response.url.replace('?ref_=adv_li_tt', '')
mv4_dict[title] = year, rating, users_rating, votes, metascore, countries, languages, actors, tagline, directors, runtime, genres, budget, opening_week_us, gross_usa, world_wide_box_office, imdb_url