Я получил приведённый ниже вывод после запуска паука Python Scrapy. Судя по логу, паук корректно запустился и начал обход, но останавливается, так и не обработав первую страницу.
Я запускал его несколько раз с различными настройками, но каждый раз получал один и тот же результат, показанный ниже.
2019-10-05 04:48:32 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: booking1)
2019-10-05 04:48:32 [scrapy.utils.log] INFO: Versions: lxml 4.1.1.0, libxml2 2.9.7, cssselect 1.0.3, parsel 1.3.1, w3lib 1.18.0, Twisted 17.9.0, Python 2.7.14 (v2.7.14:84471935ed, Sep 16 2017, 20:19:30) [MSC v.1500 32 bit (Intel)], pyOpenSSL 17.5.0 (OpenSSL 1.1.0g 2 Nov 2017), cryptography 2.1.4, Platform Windows-10-10.0.14393
2019-10-05 04:48:32 [scrapy.crawler] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'booking1.spiders', 'LOG_LEVEL': 'INFO', 'SPIDER_MODULES': ['booking1.spiders'], 'BOT_NAME': 'booking1', 'COOKIES_ENABLED': False, 'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36'}
2019-10-05 04:48:32 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats']
2019-10-05 04:48:32 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2019-10-05 04:48:32 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2019-10-05 04:48:33 [scrapy.middleware] INFO: Enabled item pipelines:
['booking1.pipelines.MoveImagesPipeline', 'booking1.pipelines.MysqlPipeline']
2019-10-05 04:48:33 [scrapy.core.engine] INFO: Spider opened
LIST
2019-10-05 04:48:33 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
0
2019-10-05 04:48:34 [scrapy.core.engine] INFO: Closing spider (finished)
0
2019-10-05 04:48:34 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 305,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 93563,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2019, 10, 5, 4, 48, 34, 80000),
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2019, 10, 5, 4, 48, 33, 133000)}
2019-10-05 04:48:34 [scrapy.core.engine] INFO: Spider closed (finished)
Код Scrapy:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
from booking1.items import Booking1Item, BookingImageItem
from scrapy.loader.processors import TakeFirst, MapCompose, Join
from scrapy import signals
import re
import hashlib
import json
import scrapy.dupefilters
class BookingComSpider(scrapy.Spider):
    """Spider that crawls booking.com in several modes.

    Depending on which inputs are populated, ``start_requests`` either
    scrapes pre-collected hotel URLs (``urls``), emits image items
    (``image_urls``), or walks the destination sitemap (``self.run`` is
    'town' / 'country' / 'full') collecting hotel links.

    Written for Python 2 / Scrapy 1.x (``print`` statements,
    ``response.body`` treated as a byte string).
    """

    name = "booking.com"
    allowed_domains = ["booking.com"]
    start_urls = (
        'https://www.booking.com/searchresults.en-gb.html?city=-3006514',
        # 'http://www.booking.com/searchresults.en-gb.html?aid=376363&region=1005',
    )
    # NOTE(review): the attributes below are CLASS attributes shared by every
    # instance of this spider; mutating them assumes a single spider process.
    langs = []                   # extra language codes for get_lang crawling
    counts = 0
    hotel_limits = 10000000000   # effectively "no limit" on hotels per city
    all_hotel_counts = 0         # number of hotel items emitted; printed on close
    urls = []                    # pre-collected hotel detail URLs (DATA mode)
    image_urls = []              # (image_url, file_path) pairs (IMAGES mode)
    checkin = '2019-12-10'       # stay dates appended to hotel detail URLs
    checkout = '2019-12-11'

    def __init__(self):
        # NOTE(review): the scrapy.Spider base __init__ is not called here.
        self.mode = 'LIST'  # LIST for hotel list, DATA for data of hotels, IMAGES for load images
        self.start_url = 'https://www.booking.com/destination.en-gb.html'
        #self.start_url = 'http://www.booking.com/destination/city/ae/dubai.html'
        #self.start_url = 'https://www.booking.com/destination/city/nl/rotterdam.html'
        self.run = 'full'  # should be 'town' or 'country' or 'full' working only fo LIST and use self.start_url

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Standard Scrapy factory; also wires spider_closed to the closed signal."""
        spider = super(BookingComSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self):
        # Report how many hotel links were collected during the crawl.
        print self.all_hotel_counts

    def start_requests(self):
        """Entry point: choose the crawl mode from the populated inputs.

        Priority: ``urls`` (scrape hotel data) > ``image_urls`` (emit image
        items) > sitemap walk selected by ``self.run``.
        """
        print len(self.urls)
        if self.urls:
            for url in self.urls:
                yield scrapy.Request(
                    url + '?checkin={};checkout={};selected_currency=USD'.format(self.checkin, self.checkout),
                    self.get_hotel, meta={'url': url})
            return
        if self.image_urls:
            # Single dummy request: keeps the spider open so for_images()
            # can yield the pre-collected image items.
            yield scrapy.Request('http://booking.com', self.for_images)
            return
        # NOTE(review): if self.run is none of the three expected values,
        # `callback` is never assigned and the Request line raises NameError.
        if self.run == 'town':
            callback = self.dest_hotels
        if self.run == 'country':
            callback = self.dest_town
        if self.run == 'full':
            callback = self.dest_country
        rq = scrapy.Request(self.start_url, callback)
        yield rq
        return

    def for_images(self, response):
        """Emit one BookingImageItem per (url, file_path) pair in image_urls."""
        for image in self.image_urls:
            item = BookingImageItem()
            item['image_url'] = image[0]
            item['file_path'] = image[1]
            yield item
        return

    def dest_country(self, response):
        """From the destination sitemap, follow every country link."""
        for link in response.xpath('//a[@class="dest-sitemap__country-anchor"]/@href').extract():
            yield scrapy.Request(response.urljoin(link.split('?')[0]), self.dest_town)
            # break
        return

    def dest_town(self, response):
        """From a country page, follow every city ("/city/") link."""
        for link in response.xpath('//table[@class="general"]/tr/td/a[contains(@href,"/city/")]/@href').extract():
            yield scrapy.Request(response.urljoin(link.split('?')[0]), self.dest_hotels)
            # break
        return

    def dest_hotels(self, response):
        """From a city page, emit one Booking1Item (URL only) per hotel.

        Stops after ``hotel_limits`` hotels; every emitted hotel increments
        ``all_hotel_counts``.
        """
        #districts = response.xpath('//h3[contains(text()," Districts")]/following-sibling::table[1]//a/@href').extract()
        #for district in districts:
        #    yield scrapy.Request(response.urljoin(district), self.check_districts)
        #    pass
        #if not districts or 1:
        #    city_id = re.findall(r"b_ufi : '(-?\d+)',", response.body)[0]
        #    city_id = '-2960561'
        #    url = 'http://www.booking.com/searchresults.en-gb.html?city={}'.format(city_id)
        #    yield scrapy.Request(url)
        hotels = response.xpath('//h3[contains(text(),"Hotels")]/following-sibling::table[1]//a/@href').extract()
        # print len(hotels)
        i = 1
        for hotel in hotels:
            # hotel = 'https://www.booking.com/hotel/it/eracle-volla.en-gb.html'
            url = response.urljoin(hotel.split('?')[0])
            item = Booking1Item()
            item['url'] = url
            # item['status'] = '0'
            yield item
            #yield scrapy.Request(response.urljoin(url)+'?checkin=2017-05-10;checkout=2017-05-11;selected_currency=USD',
            #                     self.get_hotel)
            i += 1
            if i > self.hotel_limits:
                break
            self.all_hotel_counts += 1
        # http://www.booking.com/searchresults.en-gb.html?nflt=ht_id%3D201&city=-2960561
        return

    def check_districts(self, response):
        """Follow the search-results page for the district found on this page."""
        district_id = response.xpath('//input[@name="district"]/@value').extract_first()
        yield scrapy.Request('http://www.booking.com/searchresults.en-gb.html?district={}'.format(district_id))
        return

    def get_list_country(self, response):
        """Alternative entry: walk the destinationfinder country list."""
        countries = response.xpath('//h2/a/@href').extract()
        for link in countries:
            #link = '/country/nl.en-gb.html'
            print link
            country = link.split('?')[0].split('/')[-1]
            print country
            country_code = country.split('.')[0]
            #country_code = 'nl'
            url_template = 'http://www.booking.com/destinationfinder/countries{}'
            yield scrapy.Request(url_template.format(link.split('?')[0]), self.get_country, meta={'code': country_code})
            #break
        return

    def get_country(self, response):
        """Collect town (ufi) ids from a country page and request their search results.

        A page of exactly 25 towns is assumed to mean "more pages exist",
        so page 2 is requested via get_towns.
        """
        towns = []
        ids = response.xpath('//a[@class="dcard__button"]/@href').extract()
        towns += [id.split('city=')[-1].split(';')[0] for id in ids]
        ids = response.xpath('//div[@class="drow ddeck"]/div/div[@class="dcard_fake"]/div/@data-ufi').extract()
        towns += ids
        for town in towns:
            url = 'http://www.booking.com/searchresults.en-gb.html?city={}'.format(town)
            yield scrapy.Request(url)
            # break
        if len(towns) == 25:
            code = response.meta['code']
            url = 'http://www.booking.com/destinationfinder/ufis.en-gb.html?countries={};page=2'.format(code)
            rq = scrapy.Request(url, self.get_towns, meta={'page': 2, 'code': code})
            yield rq
        return

    def get_towns(self, response):
        """Paginate through destinationfinder town pages (25 towns per page)."""
        towns = response.xpath('//div[@class="dcard_fake"]/div/@data-ufi').extract()
        for town in towns:
            url = 'http://www.booking.com/searchresults.en-gb.html?city={}'.format(town)
            yield scrapy.Request(url)
        if len(towns) == 25:
            code = response.meta['code']
            page = response.meta['page'] + 1
            url = 'http://www.booking.com/destinationfinder/ufis.en-gb.html?countries={};page={}'.format(code, page)
            rq = scrapy.Request(url, self.get_towns, meta={'page': page, 'code': code})
            yield rq

    def parse(self, response):
        """Parse a search-results page: follow each hotel and paginate."""
        #url = 'http://www.booking.com/hotel/nl/reinade-de-luxe.html'
        for row in response.xpath('//div[@class="hotellist sr_double_search"]/div/div'):
            link = row.xpath('.//a[@class="hotel_name_link url"]/@href').extract_first()
            rait_text = row.xpath('.//div[@class="reviewFloater"]/div/a[1]/span/text()[normalize-space(.)] | '
                                  './/div[@class="reviewFloater"]/div/a[1]//p[contains(@class,"review-score-word")]'
                                  '/text()[normalize-space(.)]').extract_first()
            hotel_type = row.xpath('.//span[@class="sr-hotel__type"]/text()').extract_first()
            if not hotel_type:
                hotel_type = u'Hotel'
            if link:
                # NOTE(review): this request does not put 'url' into meta,
                # but get_hotel reads response.meta['url'] unconditionally --
                # that would raise KeyError; only start_requests sets it.
                yield Request(response.urljoin(link.split('?')[0] + '?checkin=2017-03-01;checkout=2017-03-02'),
                              self.get_hotel, meta={'rait_text': rait_text, 'hotel_type': hotel_type})
            # break
        sep = response.xpath('//span[@class="sr-separator-count"][contains(text(),"other choices")]')
        if sep:
            # The "other choices" separator marks weak matches; stop paginating.
            return
        for next_page in response.xpath('//a[@class="sr_pagination_link"]/@href').extract():
            #pass
            # Rebuild the pagination URL keeping only rows/city/offset args.
            args = next_page.split(';')
            myargs = '?'
            for arg in args:
                if 'redirected' not in arg and ('rows' in arg or 'city' in arg or 'offset' in arg):
                    myargs += arg + ';'
            link = 'http://www.booking.com/searchresults.en-gb.html' + myargs
            yield Request(link)
        return

    def get_hotel(self, response):
        """Scrape one hotel detail page into a Booking1Item via BookingItemLoader.

        If no image URLs can be found on the page at all, the hotel is
        emitted with status '-1' so downstream pipelines can mark it failed.
        """
        # if response.status == 404:
        #     open('out.html', 'w').write(response.body)
        #stars = response.xpath('//span[@class="hp__hotel_ratings__stars"]/i/@title').extract_first()
        #if not stars:
        #    print response.xpath('//span[@class="hp__hotel_ratings__stars"]').extract()
        #st = stars.split('-')[0]
        #if not st:
        images = re.findall(r"large_url: '(.*?)'", response.body)
        if not images:
            images = response.xpath('//div[@id="photos_distinct"]/a[contains(@href,"http")]/@href').extract()
        if not images:
            # No photos at all: emit a failure-marker item and stop.
            item = Booking1Item()
            item['url'] = response.meta['url']
            item['status'] = '-1'
            yield item
            return
        # print response.xpath('//div[@class="room-info"]')
        #coods = response.xpath('//img[@class="static_map_one"]/@src | //a[@data-source="map_thumbnail"]/@style')
        # print coods
        #if not coods or 1:
        #    open('hotel.html', 'w').write(response.body)
        l = BookingItemLoader(item=Booking1Item(), response=response)
        l.add_xpath('name', '//h2[@id="hp_hotel_name"]/text()')
        l.add_xpath('location', '//div[@id="breadcrumb"]/div[position() > 1]/a/text() | '
                                '//div[@id="breadcrumb"]/div[last()]/text()[normalize-space(.)]')
        l.add_xpath('hotel_type', '//div[@id="viewmorealt"]/a/text()')
        l.add_xpath('rait_overall_text', '//p[contains(@class, "review-score-word")]/text() | '
                                         '//div[@id="reviewFloater"]'
                                         '//span[contains(@class, "js--hp-scorecard-scoreword")]/text() |'
                                         '//span[@class="review-score-widget__text"]/text()')
        l.add_xpath('address', '//span[@itemprop="address"]/text() | '
                               '//span[contains(@class, "hp_address_subtitle")]/text()')
        l.add_xpath('street_address', '//span[@itemprop="address"]/text() | '
                                      '//span[contains(@class, "hp_address_subtitle")]/text()')
        l.add_xpath('country', '//div[@property="itemListElement"][2]/a/text()')
        l.add_xpath('city', '//div[@id="breadcrumb"]/div[contains(@data-google-track, "city")]/a/text()')
        l.add_xpath('zip_code', '//span[@itemprop="address"]/text() | '
                                '//span[contains(@class, "hp_address_subtitle")]/text()')
        l.add_xpath('stars', '//span[contains(@class, "hp__hotel_ratings__stars")]/i/@title')
        l.add_xpath('description', '//div[@id="summary"]/p/text()')
        l.add_xpath('rating', '//div[@class="review_list_score"]/text() | '
                              '//div[@id="js--hp-gallery-scorecard"]/@data-review-score')
        l.add_xpath('no_rating', '//p[@class="review_list_score_count"]/strong/text() |'
                                 '//span[@class="review-score-widget__subtext"]/text()')
        l.add_xpath('room_types', #'//div[@class="room-info"]/a//text() | '
                                  #'//td[contains(@class,"rt__room-detail ")]/span/a/text() |'
                                  #'//table[@id="maxotel_rooms"]//tr/td[@class="ftd"]/text() | '
                                  '//a[@data-room-name-en]/@data-room-name-en') # | '
        l.add_xpath('room_types', '//table[@id="maxotel_rooms"][contains(@class,"rt_no_dates")]//tr/td[1]/text()')
        l.add_xpath('room_totals',
                    '//table[@id="maxotel_rooms"]//tr/td[2]/span[contains(@title, "Standard occupancy")]/@title')
        l.add_xpath('room_totals_child', '//tr[contains(@class,"room_loop_counter")][contains(@class,"maintr")]'
                                         '/following-sibling::tr[1]/td[1]//span[@class="invisible_spoken"]/text()')
        l.add_xpath('room_price', '//tr[contains(@class,"room_loop_counter")][contains(@class,"maintr")]'
                                  '/following-sibling::tr[1]/td[2]/div/strong/text()')
        l.add_xpath('room_totals', '//tr[contains(@class,"room_loop_counter")][contains(@class,"maintr")]'
                                   '/following-sibling::tr[1]/@data-occupancy')
        # Coordinates are parsed out of the static-map image URL / style attr.
        l.add_xpath('longitude', '//img[@class="static_map_one"]/@src | //a[@data-source="map_thumbnail"]/@style')
        l.add_xpath('latitude', '//img[@class="static_map_one"]/@src | //a[@data-source="map_thumbnail"]/@style')
        l.add_value('images', images)
        # Facility checklist sections are keyed by booking.com's data-section-id.
        l.add_xpath('food_drink', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="7"]'
                                  '/ul/li//text()[normalize-space(.)]')
        l.add_xpath('services', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="3"]'
                                '/ul/li//text()[normalize-space(.)]')
        l.add_xpath('outdoor', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="13"]'
                               '/ul/li//text()[normalize-space(.)]')
        l.add_xpath('parking', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="16"]/ul/li/'
                               'p//text()[normalize-space(.)]')
        #l.add_xpath('general_service', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="1"]/ul/li/text()')
        l.add_xpath('activities', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="2"]/'
                                  'ul/li//text()[normalize-space(.)]')
        # l.add_xpath('service', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="7"]/ul/li/text()')
        l.add_xpath('internet', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="11"]/ul/'
                                'li//text()[normalize-space(.)]')
        l.add_xpath('general', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="1"]/'
                               'ul/li//text()[normalize-space(.)]')
        l.add_xpath('miscellaneous', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="29"]/'
                                     'ul/li//text()[normalize-space(.)]')
        l.add_xpath('reception', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="23"]/'
                                 'ul/li//text()[normalize-space(.)]')
        l.add_xpath('common', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="24"]/'
                              'ul/li//text()[normalize-space(.)]')
        l.add_xpath('pool', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="21"]/'
                            'ul/li//text()[normalize-space(.)]')
        l.add_xpath('room_amenties', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="4"]/'
                                     'ul/li//text()[normalize-space(.)]')
        l.add_xpath('living_area', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="15"]/'
                                   'ul/li//text()[normalize-space(.)]')
        l.add_xpath('kitchen', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="12"]/'
                               'ul/li//text()[normalize-space(.)]')
        l.add_xpath('bedroom', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="17"]/'
                               'ul/li//text()[normalize-space(.)]')
        l.add_xpath('bathroom', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="5"]/'
                                'ul/li//text()[normalize-space(.)]')
        l.add_xpath('room_facilities', '//tr[contains(@class,"room_loop_counter1")]'
                                       '//div[contains(@class, "iconfont_wrapper")]/span/text()[normalize-space(.)] | '
                                       '//tr[contains(@class,"room_loop_counter1")]'
                                       '//div[contains(@class, "rt-all-facilities-hidden")]/span/text()')
        l.add_xpath('room_size', '//div[@class="info"][./strong[contains(text()," size:")]]/text()[normalize-space(.)]')
        l.add_xpath('included', '//div/div[contains(@class,"incExcInPriceNew")][1]/text()[normalize-space(.)]')
        l.add_xpath('not_included', '//div[contains(@class,"incExcInPriceNew")][2]/text()[normalize-space(.)]')
        #l.add_xpath('bathroom', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="7"]/ul/li/text()')
        #l.add_xpath('bedroom', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="7"]/ul/li/text()')
        #l.add_xpath('kitchen', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="7"]/ul/li/text()')
        l.add_xpath('cleaning', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="26"]/'
                                'ul/li//text()[normalize-space(.)]')
        l.add_xpath('business', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="27"]/'
                                'ul/li//text()[normalize-space(.)]')
        l.add_xpath('media', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="6"]/'
                             'ul/li//text()[normalize-space(.)]')
        #l.add_xpath('living_area', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="7"]/ul/li/text()')
        l.add_xpath('fdesk_services', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="23"]/'
                                      'ul/li//text()[normalize-space(.)]')
        l.add_xpath('pets', '//div[@id="hotelPoliciesInc"]/div[@class="description"][./p/span="Pets"]/p/text()')
        l.add_xpath('languages', '//div[contains(@class,"facilitiesChecklistSection")][last()]/div/ul/li/text()')
        l.add_xpath('checkin', '//div[@id="hotelPoliciesInc"]/div[@id="checkin_policy"]/p[2]/span/text()')
        l.add_xpath('checkout', '//div[@id="hotelPoliciesInc"]/div[@id="checkout_policy"]/p[2]/span/text()')
        l.add_xpath('children', '//div[@id="hotelPoliciesInc"]/div[@id="children_policy"]/p[not(@class)]//text()')
        l.add_xpath('cards', '//div[@class="description hp_bp_payment_method"]/p[2]/'
                             'button/@aria-label | //div[@class="description hp_bp_payment_method"]/p[2]/'
                             'span[@class="creditcard_noimg no-image-payment"]/text()')
        l.add_xpath('cancelation', '//div[@id="cancellation_policy"]/p[not(@class)]//text()')
        l.add_xpath('most_pop_fac', '//div[contains(@class, "hp_desc_important_facilities")]/div[@class]/'
                                    'text()[normalize-space(.)]')
        l.add_xpath('closest_landmarks', '//div[@class="hp-poi-content-section closest-landmarks"]/ul/li/'
                                         'span[1]//text()[normalize-space(.)]')
        l.add_xpath('popular_landmarks', '//div[@class="hp-poi-content-section popular-landmarks"]/ul/li/'
                                         'span[1]//text()[normalize-space(.)]')
        l.add_xpath('nearby_attractions', '//div[@class="hp_region_attractions_item "]/h4/text()')
        # print response.xpath('//p[@class="summary hotel_meta_style"]/text()[normalize-space(.)][2]').extract()
        l.add_xpath('total_rooms', '//p[@class="summary hotel_meta_style"]/text()[normalize-space(.)][2] |'
                                   '//p[@class="summary hotel_meta_style"]/strong/text() |'
                                   '//p[@class="summary hotel_meta_style"]/a/text()')
        # NOTE(review): reads meta['url'], which only the start_requests path
        # sets -- requests coming from parse() would KeyError here.
        l.add_value('url', response.meta['url'])
        #l.add_value('image_urls', images)
        #l.add_value('image_names', images)
        l.add_value('status', '1')
        yield l.load_item()
        #print response.xpath('//div[@class="checkin_policy"]/p/text()').extract()
        #if 'rait_overall_text' not in item:
        #    open('notst.html', 'w').write(response.body)
        #yield item
        # print response.xpath('//div[@id="reviewFloater"]//span[contains(@class, "js--hp-scorecard-scoreword")]/text()').extract()
        #url = 'http://www.booking.com/hotel_history_ufi?ufi=-2960561&currency=RUB&lang=en&aid=304142&sid=4c031dbed9125512ac8b2caba783d17c&stype=1&stid=304142&label=gen173nr-1DCAsowgFCEWFyYXJhdC1wYXJrLWh5YXR0SAliBW5vcmVmaMIBiAEBmAEuuAEPyAEP2AED6AEB-AEDqAID&checkin=2017-03-01&checkout=2017-03-02&_=1485632824219'
        #headers = {'X-Requested-With': 'XMLHttpRequest'}
        #yield scrapy.Request(url, self.get_score, meta={'item': item}, headers=headers)
        # langs = self.langs[:]
        # lang = langs.pop()
        # url = response.url.replace('en-gb', '{}'.format(lang))
        # rq = scrapy.Request(url, self.get_lang)
        # rq.meta['lang'] = lang
        # rq.meta['langs'] = langs
        # rq.meta['item'] = item
        # rq.meta['url'] = response.url
        # yield rq
        # return

    def get_score(self, response):
        """Fill rait_overall_text from the hotel_history JSON endpoint."""
        jdata = json.loads(response.body)
        item = response.meta['item']
        item['rait_overall_text'] = jdata[0]['rait_overall_text']
        return item

    def get_lang(self, response):
        """Chain through self.langs, adding per-language name/address fields.

        Pops one language per request; yields the finished item when the
        language list is exhausted.
        """
        lang = response.meta['lang']
        langs = response.meta['langs']
        item = response.meta['item']
        url = response.meta['url']
        # NOTE(review): extract_first() may return None if the XPath misses,
        # making .strip() raise AttributeError -- not handled here.
        item['name_{}'.format(lang.replace('-', '_'))] = response.xpath('//h1//text()').extract_first().strip()
        item['address_{}'.format(lang.replace('-', '_'))] = \
            response.xpath('//span[@itemprop="address"]/text()').extract_first().strip()
        if not langs:
            yield item
            return
        lang = langs.pop()
        url = url.replace('en-gb', '{}'.format(lang))
        rq = scrapy.Request(url, self.get_lang)
        rq.meta['url'] = response.meta['url']
        rq.meta['lang'] = lang
        rq.meta['langs'] = langs
        rq.meta['item'] = item
        yield rq
        return
class BookingItemLoader(ItemLoader):
    """ItemLoader for Booking1Item with field-specific processors.

    Input/output processors are plain functions referenced from the class
    namespace (Scrapy ItemLoader convention).  Default behaviour: strip
    every value on input, take the first value on output.
    """

    def get_zip(value):
        """Return the last 4-digit(+suffix) token from *value*, or '' if none.

        Used as an input processor for zip_code; matches e.g. '1234 AB '
        inside an address string.
        """
        val = re.findall(r"(\d\d\d\d.?.?.?) ", value.strip())
        if val:
            val = val[-1].strip()
        else:
            val = ''
        return val

    def get_child(value):
        """Map an occupancy description to '1' if it mentions children, else '0'."""
        if 'children' in value:
            return '1'
        else:
            return '0'

    def fix_rooms(self, value):
        """Output processor for room_types: collapse pairwise duplicates.

        Booking pages often list each room name twice in a row
        (e.g. [A, A, B, B]); when the whole even-length list follows that
        pattern, join the deduplicated names.  If the pattern breaks at any
        point, fall back to joining the original values untouched.

        BUG FIX: the original returned the duplicated ``value`` on a
        successful dedup and the partially-deleted ``rooms`` on failure --
        the two return branches were swapped.
        """
        deduped = 1
        rooms = value[:]
        if len(rooms) % 2 == 0:
            i = 0
            while i < len(rooms):
                if rooms[i] == rooms[i + 1]:
                    del rooms[i]
                else:
                    deduped = 0
                    break
                i += 1
        if deduped:
            # Pattern held (or list was odd and left untouched): use rooms.
            return '; '.join(rooms)
        else:
            # Pattern broke mid-way; rooms is partially deleted -- return
            # the original values instead of losing entries.
            return '; '.join(value)

    default_output_processor = TakeFirst()
    default_input_processor = MapCompose(unicode.strip)
    # hotel_type_in = MapCompose()
    # Strip a plural suffix like 'Hotels in ...' down to the type name.
    hotel_type_in = MapCompose(lambda value: re.findall(r'(.*?)s in |(.*)', value)[0])  # search 'atnm' in source also
    location_out = Join('; ')
    images_in = MapCompose()        # identity: keep the full image list
    description_out = Join()
    images_out = MapCompose()       # identity: keep the full image list
    room_types_out = fix_rooms
    room_totals_in = MapCompose(lambda value: value.split()[-1])
    room_totals_out = Join('; ')
    room_totals_child_in = MapCompose(get_child)
    room_totals_child_out = Join('; ')
    room_price_out = Join('; ')
Это большая часть кода.