Scrapy прекращается после запуска, почему? - PullRequest
0 голосов
/ 05 октября 2019

Я получил этот вывод после запуска Python Scrapy. Судя по логу, всё настроено правильно и сканирование началось, но оно останавливается сразу после загрузки первой страницы.

Я пытался сделать это несколько раз с различными настройками, но получил тот же результат, что и ниже.

    2019-10-05 04:48:32 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: booking1)
2019-10-05 04:48:32 [scrapy.utils.log] INFO: Versions: lxml 4.1.1.0, libxml2 2.9.7, cssselect 1.0.3, parsel 1.3.1, w3lib 1.18.0, Twisted 17.9.0, Python 2.7.14 (v2.7.14:84471935ed, Sep 16 2017, 20:19:30) [MSC v.1500 32 bit (Intel)], pyOpenSSL 17.5.0 (OpenSSL 1.1.0g  2 Nov 2017), cryptography 2.1.4, Platform Windows-10-10.0.14393
2019-10-05 04:48:32 [scrapy.crawler] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'booking1.spiders', 'LOG_LEVEL': 'INFO', 'SPIDER_MODULES': ['booking1.spiders'], 'BOT_NAME': 'booking1', 'COOKIES_ENABLED': False, 'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36'}
2019-10-05 04:48:32 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.corestats.CoreStats']
2019-10-05 04:48:32 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2019-10-05 04:48:32 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2019-10-05 04:48:33 [scrapy.middleware] INFO: Enabled item pipelines:
['booking1.pipelines.MoveImagesPipeline', 'booking1.pipelines.MysqlPipeline']
2019-10-05 04:48:33 [scrapy.core.engine] INFO: Spider opened
LIST
2019-10-05 04:48:33 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
0
2019-10-05 04:48:34 [scrapy.core.engine] INFO: Closing spider (finished)
0
2019-10-05 04:48:34 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 305,
 'downloader/request_count': 1,
 'downloader/request_method_count/GET': 1,
 'downloader/response_bytes': 93563,
 'downloader/response_count': 1,
 'downloader/response_status_count/200': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2019, 10, 5, 4, 48, 34, 80000),
 'log_count/INFO': 7,
 'response_received_count': 1,
 'scheduler/dequeued': 1,
 'scheduler/dequeued/memory': 1,
 'scheduler/enqueued': 1,
 'scheduler/enqueued/memory': 1,
 'start_time': datetime.datetime(2019, 10, 5, 4, 48, 33, 133000)}
2019-10-05 04:48:34 [scrapy.core.engine] INFO: Spider closed (finished)

Код Scrapy:

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
from booking1.items import Booking1Item, BookingImageItem
from scrapy.loader.processors import TakeFirst, MapCompose, Join
from scrapy import signals
import re
import hashlib
import json
import scrapy.dupefilters


class BookingComSpider(scrapy.Spider):
    """Spider that scrapes hotel lists and hotel detail pages from booking.com.

    The crawl entry point (``start_requests``) dispatches on pre-seeded state:
      * ``self.urls`` non-empty       -> fetch each hotel page via ``get_hotel``;
      * ``self.image_urls`` non-empty -> emit ``BookingImageItem``s (``for_images``);
      * otherwise crawl the destination sitemap starting at the level chosen
        by ``self.run`` ('full' -> countries, 'country' -> towns,
        'town' -> hotel list of one city page).

    Written for Python 2 (``print`` statements, ``unicode`` builtin used by
    the companion ``BookingItemLoader`` defined later in this module).
    """
    # NOTE: the spider name contains a dot; Scrapy accepts it but it is unusual.
    name = "booking.com"
    allowed_domains = ["booking.com"]
    # Search-results start URL(s), consumed by the default ``parse`` callback.
    start_urls = (
         'https://www.booking.com/searchresults.en-gb.html?city=-3006514',
        # 'http://www.booking.com/searchresults.en-gb.html?aid=376363&region=1005',

    )
    # NOTE(review): the lists below are class-level attributes, so they are
    # shared across all instances of this spider — confirm that is intended.
    langs = []            # extra language codes for get_lang (empty -> single-language crawl)
    counts = 0
    hotel_limits = 10000000000   # effectively "unlimited" cap on hotels taken per city page
    all_hotel_counts = 0         # running hotel total, printed on spider_closed
    urls = []             # pre-seeded hotel URLs; if non-empty only these are crawled
    image_urls = []       # (image_url, file_path) pairs for the IMAGES mode
    checkin = '2019-12-10'
    checkout = '2019-12-11'

    def __init__(self):
        # NOTE(review): super(BookingComSpider, self).__init__() is never
        # called, so scrapy.Spider's own constructor logic is skipped.
        self.mode = 'LIST'  # LIST for hotel list, DATA for data of hotels, IMAGES for load images
        self.start_url = 'https://www.booking.com/destination.en-gb.html'
        #self.start_url = 'http://www.booking.com/destination/city/ae/dubai.html'
        #self.start_url = 'https://www.booking.com/destination/city/nl/rotterdam.html'
        self.run =  'full'  # 'town', 'country' or 'full'; used only in LIST mode together with self.start_url

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Standard Scrapy hook: build the spider and subscribe
        ``spider_closed`` to the ``signals.spider_closed`` signal."""
        spider = super(BookingComSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self):
        # Python 2 print statement: report how many hotels were seen this run.
        print self.all_hotel_counts

    def start_requests(self):
        """Dispatch the crawl depending on pre-seeded state (see class docstring)."""
        print len(self.urls)
        if self.urls:
            # DATA mode: fetch each pre-seeded hotel URL with the configured
            # check-in/check-out dates; the bare URL is kept in meta['url'].
            for url in self.urls:
                yield scrapy.Request(
                    url + '?checkin={};checkout={};selected_currency=USD'.format(self.checkin, self.checkout),
                    self.get_hotel, meta={'url': url})
            return
        if self.image_urls:
            # IMAGES mode: a single dummy request whose callback yields the
            # already-known image items.
            yield scrapy.Request('http://booking.com', self.for_images)
            return
        # LIST mode: pick the sitemap level to start from.
        # NOTE(review): if self.run is none of 'town'/'country'/'full',
        # ``callback`` is never bound and the next line raises NameError.
        if self.run == 'town':
            callback = self.dest_hotels
        if self.run == 'country':
            callback = self.dest_town
        if self.run == 'full':
            callback = self.dest_country
        rq = scrapy.Request(self.start_url, callback)
        yield rq
        return

    def for_images(self, response):
        """Emit one BookingImageItem per pre-seeded (image_url, file_path) pair.

        The ``response`` argument is unused; the request exists only to drive
        the Scrapy engine."""
        for image in self.image_urls:
            item = BookingImageItem()
            item['image_url'] = image[0]
            item['file_path'] = image[1]
            yield item
        return

    def dest_country(self, response):
        """From the destination sitemap root, follow every country link
        (query string stripped) into ``dest_town``."""
        for link in response.xpath('//a[@class="dest-sitemap__country-anchor"]/@href').extract():
            yield scrapy.Request(response.urljoin(link.split('?')[0]), self.dest_town)
            # break
        return

    def dest_town(self, response):
        """From a country page, follow every city link into ``dest_hotels``."""
        for link in response.xpath('//table[@class="general"]/tr/td/a[contains(@href,"/city/")]/@href').extract():
            yield scrapy.Request(response.urljoin(link.split('?')[0]), self.dest_hotels)
            # break
        return

    def dest_hotels(self, response):
        """From a city page, yield one Booking1Item (URL only) per hotel link,
        up to ``self.hotel_limits`` hotels."""
        #districts = response.xpath('//h3[contains(text()," Districts")]/following-sibling::table[1]//a/@href').extract()
        #for district in districts:
            #yield scrapy.Request(response.urljoin(district), self.check_districts)
        #    pass
        #if not districts or 1:
        #    city_id = re.findall(r"b_ufi : '(-?\d+)',", response.body)[0]
            # city_id = '-2960561'
            #url = 'http://www.booking.com/searchresults.en-gb.html?city={}'.format(city_id)
            # yield scrapy.Request(url)
        hotels = response.xpath('//h3[contains(text(),"Hotels")]/following-sibling::table[1]//a/@href').extract()
        # print len(hotels)
        i = 1
        for hotel in hotels:
            # hotel = 'https://www.booking.com/hotel/it/eracle-volla.en-gb.html'
            url = response.urljoin(hotel.split('?')[0])
            item = Booking1Item()
            item['url'] = url
            # item['status'] = '0'
            yield item

            #yield scrapy.Request(response.urljoin(url)+'?checkin=2017-05-10;checkout=2017-05-11;selected_currency=USD',
            #                     self.get_hotel)
            i += 1
            if i > self.hotel_limits:
                break
            self.all_hotel_counts += 1
        # http://www.booking.com/searchresults.en-gb.html?nflt=ht_id%3D201&city=-2960561
        return

    def check_districts(self, response):
        """Turn a district page into a search-results request by district id
        (handled by the default ``parse`` callback)."""
        district_id = response.xpath('//input[@name="district"]/@value').extract_first()
        yield scrapy.Request('http://www.booking.com/searchresults.en-gb.html?district={}'.format(district_id))
        return

    def get_list_country(self, response):
        """Alternative country listing via the destinationfinder pages;
        passes the two-letter country code along in meta['code']."""
        countries = response.xpath('//h2/a/@href').extract()
        for link in countries:
            #link = '/country/nl.en-gb.html'
            print link
            country = link.split('?')[0].split('/')[-1]
            print country
            country_code = country.split('.')[0]
            #country_code = 'nl'
            url_template = 'http://www.booking.com/destinationfinder/countries{}'
            yield scrapy.Request(url_template.format(link.split('?')[0]), self.get_country, meta={'code': country_code})
            #break
        return

    def get_country(self, response):
        """Collect town ids (ufi) from a destinationfinder country page and
        schedule a search-results request per town; if exactly 25 towns were
        found, assume pagination and request page 2 via ``get_towns``."""
        towns = []
        ids = response.xpath('//a[@class="dcard__button"]/@href').extract()
        # NOTE(review): the comprehension variable shadows the builtin ``id``.
        towns += [id.split('city=')[-1].split(';')[0] for id in ids]
        ids = response.xpath('//div[@class="drow ddeck"]/div/div[@class="dcard_fake"]/div/@data-ufi').extract()
        towns += ids
        for town in towns:
            url = 'http://www.booking.com/searchresults.en-gb.html?city={}'.format(town)
            yield scrapy.Request(url)
            # break
        if len(towns) == 25:
            code = response.meta['code']
            url = 'http://www.booking.com/destinationfinder/ufis.en-gb.html?countries={};page=2'.format(code)
            rq = scrapy.Request(url, self.get_towns, meta={'page': 2, 'code': code})
            yield rq
        return

    def get_towns(self, response):
        """Paginated continuation of ``get_country``: one search request per
        town id, recursing to the next page while a full page (25) is returned."""
        towns = response.xpath('//div[@class="dcard_fake"]/div/@data-ufi').extract()
        for town in towns:
            url = 'http://www.booking.com/searchresults.en-gb.html?city={}'.format(town)
            yield scrapy.Request(url)
        if len(towns) == 25:
            code = response.meta['code']
            page = response.meta['page'] + 1
            url = 'http://www.booking.com/destinationfinder/ufis.en-gb.html?countries={};page={}'.format(code, page)
            rq = scrapy.Request(url, self.get_towns, meta={'page': page, 'code': code})
            yield rq

    def parse(self, response):
        """Default callback for search-result pages: follow each hotel link
        into ``get_hotel`` and schedule filtered pagination links.

        NOTE(review): requests scheduled here put only 'rait_text' and
        'hotel_type' into meta, while ``get_hotel`` reads meta['url'] —
        that lookup would raise KeyError on this path; confirm.
        """
        #url = 'http://www.booking.com/hotel/nl/reinade-de-luxe.html'
        for row in response.xpath('//div[@class="hotellist sr_double_search"]/div/div'):
            link = row.xpath('.//a[@class="hotel_name_link url"]/@href').extract_first()
            rait_text = row.xpath('.//div[@class="reviewFloater"]/div/a[1]/span/text()[normalize-space(.)] | '
                            './/div[@class="reviewFloater"]/div/a[1]//p[contains(@class,"review-score-word")]'
                            '/text()[normalize-space(.)]').extract_first()
            hotel_type = row.xpath('.//span[@class="sr-hotel__type"]/text()').extract_first()
            if not hotel_type:
                hotel_type = u'Hotel'
            if link:
                yield Request(response.urljoin(link.split('?')[0] + '?checkin=2017-03-01;checkout=2017-03-02'),
                              self.get_hotel, meta={'rait_text': rait_text, 'hotel_type': hotel_type})
            # break
        # Stop paginating once the "other choices" separator appears (results
        # past it are outside the searched area).
        sep = response.xpath('//span[@class="sr-separator-count"][contains(text(),"other choices")]')
        if sep:
            return
        for next_page in response.xpath('//a[@class="sr_pagination_link"]/@href').extract():
            #pass
            # Rebuild the pagination URL keeping only rows/city/offset args.
            args = next_page.split(';')
            myargs = '?'
            for arg in args:
                if 'redirected' not in arg and ('rows' in arg or 'city' in arg or 'offset' in arg):
                    myargs += arg + ';'
            link = 'http://www.booking.com/searchresults.en-gb.html' + myargs
            yield Request(link)
        return

    def get_hotel(self, response):
        """Scrape one hotel detail page into a Booking1Item via
        BookingItemLoader (defined later in this module).

        If no image URLs can be found (neither in the inline JS nor in the
        photo gallery), an item with status '-1' is emitted for the URL and
        the page is skipped. Otherwise the loader fills every field from
        XPath expressions and the item is emitted with status '1'.
        """
        # if response.status == 404:

            # open('out.html', 'w').write(response.body)
        #stars = response.xpath('//span[@class="hp__hotel_ratings__stars"]/i/@title').extract_first()
        #if not stars:
        #    print response.xpath('//span[@class="hp__hotel_ratings__stars"]').extract()
        #st = stars.split('-')[0]
        #if not st:
        # Image URLs are embedded in an inline JS gallery definition.
        images = re.findall(r"large_url: '(.*?)'", response.body)
        if not images:
            images = response.xpath('//div[@id="photos_distinct"]/a[contains(@href,"http")]/@href').extract()
        if not images:
            # Mark the hotel as failed (status '-1') and bail out.
            item = Booking1Item()
            item['url'] = response.meta['url']
            item['status'] = '-1'
            yield item
            return
        # print response.xpath('//div[@class="room-info"]')
        #coods = response.xpath('//img[@class="static_map_one"]/@src | //a[@data-source="map_thumbnail"]/@style')
        # print coods
        #if not coods or 1:
        #    open('hotel.html', 'w').write(response.body)
        l = BookingItemLoader(item=Booking1Item(), response=response)
        l.add_xpath('name', '//h2[@id="hp_hotel_name"]/text()')
        l.add_xpath('location', '//div[@id="breadcrumb"]/div[position() > 1]/a/text() | '
                                '//div[@id="breadcrumb"]/div[last()]/text()[normalize-space(.)]')
        l.add_xpath('hotel_type', '//div[@id="viewmorealt"]/a/text()')
        l.add_xpath('rait_overall_text', '//p[contains(@class, "review-score-word")]/text() | '
                                         '//div[@id="reviewFloater"]'
                                         '//span[contains(@class, "js--hp-scorecard-scoreword")]/text() |'
                                         '//span[@class="review-score-widget__text"]/text()')
        l.add_xpath('address', '//span[@itemprop="address"]/text() | '
                               '//span[contains(@class, "hp_address_subtitle")]/text()')
        l.add_xpath('street_address', '//span[@itemprop="address"]/text() | '
                                      '//span[contains(@class, "hp_address_subtitle")]/text()')
        l.add_xpath('country', '//div[@property="itemListElement"][2]/a/text()')
        l.add_xpath('city', '//div[@id="breadcrumb"]/div[contains(@data-google-track, "city")]/a/text()')
        l.add_xpath('zip_code', '//span[@itemprop="address"]/text() | '
                                '//span[contains(@class, "hp_address_subtitle")]/text()')
        l.add_xpath('stars', '//span[contains(@class, "hp__hotel_ratings__stars")]/i/@title')
        l.add_xpath('description', '//div[@id="summary"]/p/text()')
        l.add_xpath('rating', '//div[@class="review_list_score"]/text() | '
                              '//div[@id="js--hp-gallery-scorecard"]/@data-review-score')
        l.add_xpath('no_rating', '//p[@class="review_list_score_count"]/strong/text() |'
                                 '//span[@class="review-score-widget__subtext"]/text()')
        l.add_xpath('room_types', #'//div[@class="room-info"]/a//text() | '
                                  #'//td[contains(@class,"rt__room-detail ")]/span/a/text() |'
                                  #'//table[@id="maxotel_rooms"]//tr/td[@class="ftd"]/text() | '
                                  '//a[@data-room-name-en]/@data-room-name-en')  # | '
        l.add_xpath('room_types', '//table[@id="maxotel_rooms"][contains(@class,"rt_no_dates")]//tr/td[1]/text()')
        l.add_xpath('room_totals',
                    '//table[@id="maxotel_rooms"]//tr/td[2]/span[contains(@title, "Standard occupancy")]/@title')
        l.add_xpath('room_totals_child', '//tr[contains(@class,"room_loop_counter")][contains(@class,"maintr")]'
                                         '/following-sibling::tr[1]/td[1]//span[@class="invisible_spoken"]/text()')
        l.add_xpath('room_price', '//tr[contains(@class,"room_loop_counter")][contains(@class,"maintr")]'
                                  '/following-sibling::tr[1]/td[2]/div/strong/text()')
        l.add_xpath('room_totals', '//tr[contains(@class,"room_loop_counter")][contains(@class,"maintr")]'
                                   '/following-sibling::tr[1]/@data-occupancy')
        # Coordinates are parsed out of the static-map image URL / style attr
        # by the item/loader processors (not visible in this file).
        l.add_xpath('longitude', '//img[@class="static_map_one"]/@src | //a[@data-source="map_thumbnail"]/@style')
        l.add_xpath('latitude', '//img[@class="static_map_one"]/@src | //a[@data-source="map_thumbnail"]/@style')
        l.add_value('images', images)
        # Facility checklists: each @data-section-id selects one facility
        # category block on the hotel page.
        l.add_xpath('food_drink', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="7"]'
                                  '/ul/li//text()[normalize-space(.)]')
        l.add_xpath('services', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="3"]'
                                '/ul/li//text()[normalize-space(.)]')
        l.add_xpath('outdoor', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="13"]'
                               '/ul/li//text()[normalize-space(.)]')
        l.add_xpath('parking', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="16"]/ul/li/'
                               'p//text()[normalize-space(.)]')
        #l.add_xpath('general_service', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="1"]/ul/li/text()')
        l.add_xpath('activities', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="2"]/'
                                  'ul/li//text()[normalize-space(.)]')
        # l.add_xpath('service', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="7"]/ul/li/text()')
        l.add_xpath('internet', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="11"]/ul/'
                                'li//text()[normalize-space(.)]')
        l.add_xpath('general', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="1"]/'
                               'ul/li//text()[normalize-space(.)]')
        l.add_xpath('miscellaneous', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="29"]/'
                                     'ul/li//text()[normalize-space(.)]')
        l.add_xpath('reception', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="23"]/'
                                 'ul/li//text()[normalize-space(.)]')
        l.add_xpath('common', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="24"]/'
                              'ul/li//text()[normalize-space(.)]')
        l.add_xpath('pool', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="21"]/'
                            'ul/li//text()[normalize-space(.)]')

        l.add_xpath('room_amenties', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="4"]/'
                                     'ul/li//text()[normalize-space(.)]')
        l.add_xpath('living_area', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="15"]/'
                                   'ul/li//text()[normalize-space(.)]')
        l.add_xpath('kitchen', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="12"]/'
                                     'ul/li//text()[normalize-space(.)]')
        l.add_xpath('bedroom', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="17"]/'
                                     'ul/li//text()[normalize-space(.)]')
        l.add_xpath('bathroom', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="5"]/'
                                     'ul/li//text()[normalize-space(.)]')

        l.add_xpath('room_facilities', '//tr[contains(@class,"room_loop_counter1")]'
                                       '//div[contains(@class, "iconfont_wrapper")]/span/text()[normalize-space(.)] | '
                                       '//tr[contains(@class,"room_loop_counter1")]'
                                       '//div[contains(@class, "rt-all-facilities-hidden")]/span/text()')
        l.add_xpath('room_size', '//div[@class="info"][./strong[contains(text()," size:")]]/text()[normalize-space(.)]')

        l.add_xpath('included', '//div/div[contains(@class,"incExcInPriceNew")][1]/text()[normalize-space(.)]')
        l.add_xpath('not_included', '//div[contains(@class,"incExcInPriceNew")][2]/text()[normalize-space(.)]')

        #l.add_xpath('bathroom', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="7"]/ul/li/text()')
        #l.add_xpath('bedroom', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="7"]/ul/li/text()')
        #l.add_xpath('kitchen', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="7"]/ul/li/text()')
        l.add_xpath('cleaning', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="26"]/'
                                'ul/li//text()[normalize-space(.)]')
        l.add_xpath('business', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="27"]/'
                                'ul/li//text()[normalize-space(.)]')
        l.add_xpath('media', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="6"]/'
                             'ul/li//text()[normalize-space(.)]')
        #l.add_xpath('living_area', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="7"]/ul/li/text()')
        l.add_xpath('fdesk_services', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="23"]/'
                                      'ul/li//text()[normalize-space(.)]')
        l.add_xpath('pets', '//div[@id="hotelPoliciesInc"]/div[@class="description"][./p/span="Pets"]/p/text()')
        l.add_xpath('languages', '//div[contains(@class,"facilitiesChecklistSection")][last()]/div/ul/li/text()')
        l.add_xpath('checkin', '//div[@id="hotelPoliciesInc"]/div[@id="checkin_policy"]/p[2]/span/text()')
        l.add_xpath('checkout', '//div[@id="hotelPoliciesInc"]/div[@id="checkout_policy"]/p[2]/span/text()')
        l.add_xpath('children', '//div[@id="hotelPoliciesInc"]/div[@id="children_policy"]/p[not(@class)]//text()')
        l.add_xpath('cards', '//div[@class="description hp_bp_payment_method"]/p[2]/'
                             'button/@aria-label | //div[@class="description hp_bp_payment_method"]/p[2]/'
                             'span[@class="creditcard_noimg no-image-payment"]/text()')
        l.add_xpath('cancelation', '//div[@id="cancellation_policy"]/p[not(@class)]//text()')
        l.add_xpath('most_pop_fac', '//div[contains(@class, "hp_desc_important_facilities")]/div[@class]/'
                                    'text()[normalize-space(.)]')
        l.add_xpath('closest_landmarks', '//div[@class="hp-poi-content-section closest-landmarks"]/ul/li/'
                                         'span[1]//text()[normalize-space(.)]')
        l.add_xpath('popular_landmarks', '//div[@class="hp-poi-content-section popular-landmarks"]/ul/li/'
                                         'span[1]//text()[normalize-space(.)]')
        l.add_xpath('nearby_attractions', '//div[@class="hp_region_attractions_item "]/h4/text()')
        # print response.xpath('//p[@class="summary  hotel_meta_style"]/text()[normalize-space(.)][2]').extract()
        l.add_xpath('total_rooms', '//p[@class="summary  hotel_meta_style"]/text()[normalize-space(.)][2] |'
                                   '//p[@class="summary  hotel_meta_style"]/strong/text() |'
                                   '//p[@class="summary  hotel_meta_style"]/a/text()')
        # NOTE(review): meta['url'] is only set on requests from start_requests;
        # requests arriving here from parse() do not carry it — confirm.
        l.add_value('url', response.meta['url'])
        #l.add_value('image_urls', images)
        #l.add_value('image_names', images)
        l.add_value('status', '1')
        yield l.load_item()
        #print response.xpath('//div[@class="checkin_policy"]/p/text()').extract()
        #if 'rait_overall_text' not in item:
        #    open('notst.html', 'w').write(response.body)
        #yield item
        # print response.xpath('//div[@id="reviewFloater"]//span[contains(@class, "js--hp-scorecard-scoreword")]/text()').extract()
        #url = 'http://www.booking.com/hotel_history_ufi?ufi=-2960561&currency=RUB&lang=en&aid=304142&sid=4c031dbed9125512ac8b2caba783d17c&stype=1&stid=304142&label=gen173nr-1DCAsowgFCEWFyYXJhdC1wYXJrLWh5YXR0SAliBW5vcmVmaMIBiAEBmAEuuAEPyAEP2AED6AEB-AEDqAID&checkin=2017-03-01&checkout=2017-03-02&_=1485632824219'
        #headers = {'X-Requested-With': 'XMLHttpRequest'}
        #yield scrapy.Request(url, self.get_score, meta={'item': item}, headers=headers)
        # langs = self.langs[:]
        # lang = langs.pop()
        # url = response.url.replace('en-gb', '{}'.format(lang))
        # rq = scrapy.Request(url, self.get_lang)
        # rq.meta['lang'] = lang
        # rq.meta['langs'] = langs
        # rq.meta['item'] = item
        # rq.meta['url'] = response.url
        # yield rq
        # return

    def get_score(self, response):
        """Merge the review-score word from a JSON endpoint into the item
        passed via meta (currently only reachable from commented-out code)."""
        jdata = json.loads(response.body)
        item = response.meta['item']
        item['rait_overall_text'] = jdata[0]['rait_overall_text']
        return item

    def get_lang(self, response):
        """Collect localized hotel name/address for one language, then recurse
        into the next language from meta['langs'] until the list is empty,
        finally yielding the accumulated item."""
        lang = response.meta['lang']
        langs = response.meta['langs']
        item = response.meta['item']
        url = response.meta['url']
        item['name_{}'.format(lang.replace('-', '_'))] = response.xpath('//h1//text()').extract_first().strip()
        item['address_{}'.format(lang.replace('-', '_'))] = \
            response.xpath('//span[@itemprop="address"]/text()').extract_first().strip()
        if not langs:
            yield item
            return
        lang = langs.pop()
        # Swap the language segment of the original en-gb URL.
        url = url.replace('en-gb', '{}'.format(lang))
        rq = scrapy.Request(url, self.get_lang)
        rq.meta['url'] = response.meta['url']
        rq.meta['lang'] = lang
        rq.meta['langs'] = langs
        rq.meta['item'] = item
        yield rq
        return


class BookingItemLoader(ItemLoader):
    """ItemLoader wiring field-specific input/output processors for Booking1Item.

    Python 2 semantics matter here: ``get_zip``/``get_child`` are referenced as
    plain functions during class-body evaluation (inside MapCompose), whereas
    ``fix_rooms`` is assigned as ``room_types_out`` and is therefore looked up
    as a bound method at load time (hence its ``self`` parameter).
    """
    def get_zip(value):
        """Pull a postcode-like token (4 digits plus up to 3 trailing chars,
        followed by a space) out of *value*; '' when none matches.

        NOTE(review): not referenced by any processor in this class —
        possibly dead code; verify before removing.
        """
        val = re.findall(r"(\d\d\d\d.?.?.?) ", value.strip())
        # print val
        # m = re.search('\d\d\d\d.?.?.? ', value.strip())
        #print m.group()
        # val = m.group(0)
        if val:
            val = val[-1].strip()
        else:
            val = ''
        return val

    def get_child(value):
        """Map an occupancy description to '1' if it mentions children, else '0'."""
        if 'children' in value:
            return '1'
        else:
            return '0'

    def fix_rooms(self, value):
        """Output processor for room_types: detect a list made entirely of
        consecutive duplicate pairs and join the values with '; '.

        NOTE(review): the final branch looks inverted — when every adjacent
        pair matched (f stays truthy) the *original* list with duplicates is
        joined, while the partially deduplicated copy is returned on the
        first mismatch. Confirm the intended behavior before changing it.
        """
        f = 1
        rooms = value[:]
        if len(rooms) %2 == 0:
            i = 0
            while i < len(rooms):
                if rooms[i] == rooms[i + 1]:
                    del rooms[i]
                else:
                    f = 0
                    break
                i += 1
        if f:
            return '; '.join(value)
        else:
            return '; '.join(rooms)

    # Every field keeps the first non-empty value unless overridden below;
    # every input value is stripped (``unicode`` — Python 2 only).
    default_output_processor = TakeFirst()
    default_input_processor = MapCompose(unicode.strip)

    # hotel_type_in = MapCompose()
    hotel_type_in = MapCompose(lambda value: re.findall(r'(.*?)s in |(.*)', value)[0])  # search 'atnm' in source also
    location_out = Join('; ')
    images_in = MapCompose()   # identity: keeps the full list of image URLs
    description_out = Join()
    images_out = MapCompose()  # identity: passes the list through unchanged
    room_types_out = fix_rooms
    room_totals_in = MapCompose(lambda value: value.split()[-1])  # keep the trailing number
    room_totals_out = Join('; ')
    room_totals_child_in = MapCompose(get_child)
    room_totals_child_out = Join('; ')
    room_price_out = Join('; ')

Это большая часть кода.

1 Ответ

1 голос
/ 06 октября 2019

Запустив ваш код, я увидел, что выполнение доходит до обратного вызова dest_country, но там не находится ни одной ссылки, поэтому паук просто завершается.

Может быть, проблема с выражением xpath где-то?

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...