Это рабочий код, пожалуйста, попробуйте:
class MoviesSpider():
    """Collect movie titles reachable from a single IMDb list page.

    ``start_requests`` requests ``start_url``; ``parse`` follows every link
    matched on that page; ``parse_movies`` yields one ``{'title': ...}``
    item per followed page.

    NOTE(review): no base class is declared, yet the layout (``name``,
    ``allowed_domains``, ``start_requests``, ``parse``) looks like a Scrapy
    spider -- presumably this should derive from ``scrapy.Spider``; confirm
    against the file's imports, which are not visible here.
    """

    name = 'movies'  # name of the spider
    allowed_domains = ['imdb.com']
    start_url = 'http://imdb.com/list/ls055386972/'
    # Headers passed with the initial request. ``start_requests`` reads
    # ``self.headers``; without this default the lookup raised
    # AttributeError. Override on a subclass/instance to send custom headers.
    headers = None

    def __init__(self):
        super(MoviesSpider, self).__init__()

    def start_requests(self):
        """Yield the single request for ``start_url``, handled by ``parse``."""
        yield Request(self.start_url, callback=self.parse, headers=self.headers)

    def parse(self, response):
        """Yield one request per link matched on the list page.

        Every yielded request is routed to ``parse_movies``.
        """
        # Alternative selector kept for reference:
        # events = response.xpath('//*[@property="url"]/@href').extract()
        links = response.xpath('//h3[@class]/a/@href').extract()
        for link in links:
            # The host is prepended by hand, so the hrefs are presumably
            # site-relative paths -- TODO confirm against the live markup.
            absolute_url = response.urljoin('http://www.imdb.com' + link)
            yield Request(absolute_url, callback=self.parse_movies)

        # Pagination is currently disabled; the intended logic was:
        # next_page_url = response.xpath('//a[text() = "Next"]/@href').extract_first()
        # absolute_next_page_url = response.urljoin(next_page_url)
        # yield Request(absolute_next_page_url)

    def parse_movies(self, response):
        """Yield a one-key item holding the title text of a movie page.

        ``title`` is whatever ``extract_first()`` returns for the title
        XPath; it is passed through unmodified.
        """
        title = response.xpath(
            '//div[@class = "title_wrapper"]/h1[@class]/text()'
        ).extract_first()
        yield {
            'title': title,
        }