Я написал следующий код для очистки Booking.com с указанием названия города. В идеале программа должна выяснить все отели, которые есть в городе, и вычистить все отзывы по каждому отелю. К сожалению, это скребет только несколько отелей и только первые 75 отзывов об этих отелях. Подскажите, пожалуйста, что я здесь делаю не так?
import scrapy
from scrapy import Spider
from scrapy.loader import ItemLoader
from booking_spider.items import BookingSpiderItem
class PerhotelrevSpider(Spider):
name = 'perhotelrev'
allowed_domains = ['booking.com']
#start_urls = ['https://booking.com/reviews/us/hotel/maison-st-charles-quality-inn-suites.html?/']
start_urls = ['https://www.booking.com/searchresults.html?ss=New%20Orleans&']
#handle_httpstatus_list = [301, 302]
def parse(self, response):
all_hotels = response.xpath('.//*[@class="sr-hotel__title \n"]')
for ahotel in all_hotels:
hotel_name = ahotel.xpath('.//*[@class="sr-hotel__name\n"]/text()').extract_first().replace('\n','')
hotel_url = ahotel.xpath('.//*[@class="hotel_name_link url"]/@href').extract_first().replace('\n','')
full_hotel_url = 'https://www.booking.com'+str(hotel_url)
request = scrapy.Request(full_hotel_url, callback = self.parse_hotels)
request.meta['adict'] = {'HotelName':hotel_name}
yield request
next_page = response.xpath('.//*[@class="bui-pagination__item bui-pagination__next-arrow"]/a/@href').extract_first()
if next_page is not None:
next_page_url = response.urljoin(next_page)
yield scrapy.Request(next_page_url, callback=self.parse)
def parse_hotels(self, response):
adict = response.meta['adict']
hotel_name = adict['HotelName']
#hotel_name = response.xpath('.//*[@class="hp__hotel-name"]/text()')[1].extract().replace('\n','')
image_urls = response.xpath('.//*[@class="b_nha_hotel_small_images hp_thumbgallery_with_counter"]/a/@href').extract()
all_facilities = response.xpath('.//*[@class="facilitiesChecklistSection"]/ul/li/span/text()').extract()
all_facilities = [x.replace('\n','') for x in all_facilities]
important_facility = response.xpath('.//*[@class="important_facility "]/@data-name-en').extract()
#print(hotel_name)
all_review_url = response.xpath('.//*[@class="show_all_reviews_btn"]/@href').extract_first()
adict = { 'HotelName':hotel_name,
'ImageUrls':image_urls,
'Facilities':all_facilities,
'MainFacilities':important_facility
}
if all_review_url is not None:
review_url = "https://booking.com"+all_review_url
request = scrapy.Request(review_url, callback=self.parse_review)
request.meta['adict'] = adict
yield request
def parse_review(self, response):
allreviewsinpage = response.xpath('.//*[@itemprop="review"]')
adict = response.meta['adict']
hotel_name = adict['HotelName']
image_urls = adict['ImageUrls']
all_facilities = adict['Facilities']
important_facility = adict['MainFacilities']
for eachreview in allreviewsinpage:
username = eachreview.xpath('.//p[@class="reviewer_name"]/*[@itemprop="name"]/text()').extract_first()
usercountry = eachreview.xpath('.//*[@itemprop="nationality"]/*[@itemprop="name"]/text()').extract_first()
numreviewgiven = eachreview.xpath('.//*[@class="review_item_user_review_count"]/text()').extract_first()
useragegroup = eachreview.xpath('.//*[@class="user_age_group"]/text()').extract_first()
heading = eachreview.xpath('.//*[@class="review_item_header_content\n"]/*[@itemprop="name"]/text()').extract_first()
neg_rev = eachreview.xpath('.//p[@class="review_neg "]/*[@itemprop="reviewBody"]/text()').extract_first()
pos_rev = eachreview.xpath('.//p[@class="review_pos "]/*[@itemprop="reviewBody"]/text()').extract_first()
tagging = eachreview.xpath('.//ul[@class="review_item_info_tags"]/*[@class="review_info_tag "]/text()').extract()
stayedin = eachreview.xpath('.//p[@class="review_staydate "]/text()').extract_first()
givenscore = eachreview.xpath('.//span[@class="review-score-badge"]/text()').extract_first()
l = ItemLoader(item=BookingSpiderItem(), selector=response)
l.add_value('HotelName',hotel_name)
#l.add_value('ImageUrls',image_urls)
l.add_value('Facilities',all_facilities)
l.add_value('MainFacilities',important_facility)
l.add_value('UserName',username)
l.add_value('UserCountry',usercountry)
l.add_value('NumReviewGiven',numreviewgiven)
l.add_value('UserAgeGroup',useragegroup)
l.add_value('Heading',heading)
l.add_value('NegativeReview',neg_rev)
l.add_value('PositiveReview',pos_rev)
l.add_value('SelfTag',tagging)
l.add_value('StayDate',stayedin)
l.add_value('GivenScore',givenscore)
yield l.load_item()
next_page = response.xpath('.//*[@class="page_link review_next_page"]/a/@href').extract_first()
if next_page is not None:
next_page_url = response.urljoin(next_page)
yield scrapy.Request(next_page_url, callback=self.parse_review)