Scrapy и Selenium — работает при использовании одного Selenium, но не вместе со Scrapy - PullRequest
0 голосов
/ 12 июля 2020

Я практикую Scrapy и Selenium на Booking.com.

По какой-то причине использование следующего кода Selenium работает нормально.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which

# chrome_options = Options()
# chrome_options.add_argument("--headless")

# Resolve the chromedriver binary from PATH. `which` returns None when the
# binary is missing, so fail early with a clear message instead of handing
# None to the webdriver constructor.
chrome_path = which("chromedriver")
if chrome_path is None:
    raise FileNotFoundError("chromedriver executable not found on PATH")

driver = webdriver.Chrome(executable_path=chrome_path)
try:
    driver.get("https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ&lang=en-gb&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ%3Btmpl%3Dsearchresults%3Bac_click_type%3Db%3Bac_position%3D0%3Bcheckin_month%3D7%3Bcheckin_monthday%3D13%3Bcheckin_year%3D2020%3Bcheckout_month%3D7%3Bcheckout_monthday%3D14%3Bcheckout_year%3D2020%3Bcity%3D900039038%3Bclass_interval%3D1%3Bdest_id%3D1855490%3Bdest_type%3Dhotel%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D1%3Bgroup_children%3D0%3Bhighlighted_hotels%3D1855490%3Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcard%3D0%3Braw_dest_type%3Dhotel%3Broom1%3DA%3Bsb_price_type%3Dtotal%3Bsearch_selected%3D1%3Bshw_aparth%3D1%3Bslp_r_match%3D0%3Bsrc%3Dsearchresults%3Bsrc_elem%3Dsb%3Bsrpvid%3Da53e49abc8b20230%3Bss%3DKiwi%2520As%2520Guest%2520House%252C%2520Rotorua%252C%2520Bay%2520of%2520Plenty%252C%2520New%2520Zealand%3Bss_all%3D0%3Bss_raw%3Dkiwias%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DRotorua%3Bssne_untouched%3DRotorua%3Btop_ufis%3D1%26%3B&ss=Rainbow+Holiday+House%2C+Rotorua%2C+Bay+of+Plenty%2C+New+Zealand&is_ski_area=&ssne=Rotorua&ssne_untouched=Rotorua&city=900039038&checkin_year=2020&checkin_month=7&checkin_monthday=13&checkout_year=2020&checkout_month=7&checkout_monthday=14&group_adults=1&group_children=0&no_rooms=1&from_sf=1&ss_raw=rainbow+holiday+house&ac_position=0&ac_langcode=en&ac_click_type=b&dest_id=5288382&dest_type=hotel&place_id_lat=-38.105882&place_id_lon=176.239504&search_pageview_id=a53e49abc8b20230&search_selected=true&search_pageview_id=a53e49abc8b20230&ac_suggestion_list_length=3&ac_suggestion_theme_list_length=0")

    # Select result cards by the single class token "sr_item" rather than by
    # comparing the whole class attribute, whose exact whitespace runs and
    # class order are not stable.
    cards = driver.find_elements_by_xpath(
        '//div[contains(concat(" ", normalize-space(@class), " "), " sr_item ")]'
    )

    for c in cards:
        print('name', c.find_element_by_xpath('./div/div/div/div/h3/a/span[1]').text)
finally:
    # Always shut the browser down, even if navigation or a lookup raised;
    # otherwise the Chrome and chromedriver processes are left running.
    driver.quit()

Приведённый выше код выводит названия объектов размещения (properties).

Но когда я пытаюсь использовать Selenium вместе со Scrapy, это терпит неудачу.

import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from scrapy_selenium import SeleniumRequest


class BookSpider(scrapy.Spider):
    """Yield property names from a Booking.com search-results page.

    The page is fetched through scrapy-selenium's ``SeleniumRequest`` and each
    result card produces one ``{'name': ...}`` item.
    """

    name = 'book'
    allowed_domains = ['booking.com']

    # Match result cards by the single class token "sr_item". Comparing the
    # whole class attribute for equality breaks as soon as the whitespace or
    # the set of classes differs by one character.
    card_xpath = '//div[contains(concat(" ", normalize-space(@class), " "), " sr_item ")]'

    def start_requests(self):
        """Issue the single Selenium-backed request for the search page."""
        # Function-scope imports so this class does not depend on extra
        # module-level import lines.
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC

        yield SeleniumRequest(
            url='https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ&lang=en-gb&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ%3Btmpl%3Dsearchresults%3Bac_click_type%3Db%3Bac_position%3D0%3Bcheckin_month%3D7%3Bcheckin_monthday%3D13%3Bcheckin_year%3D2020%3Bcheckout_month%3D7%3Bcheckout_monthday%3D14%3Bcheckout_year%3D2020%3Bcity%3D900039038%3Bclass_interval%3D1%3Bdest_id%3D1855490%3Bdest_type%3Dhotel%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D1%3Bgroup_children%3D0%3Bhighlighted_hotels%3D1855490%3Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcard%3D0%3Braw_dest_type%3Dhotel%3Broom1%3DA%3Bsb_price_type%3Dtotal%3Bsearch_selected%3D1%3Bshw_aparth%3D1%3Bslp_r_match%3D0%3Bsrc%3Dsearchresults%3Bsrc_elem%3Dsb%3Bsrpvid%3Da53e49abc8b20230%3Bss%3DKiwi%2520As%2520Guest%2520House%252C%2520Rotorua%252C%2520Bay%2520of%2520Plenty%252C%2520New%2520Zealand%3Bss_all%3D0%3Bss_raw%3Dkiwias%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DRotorua%3Bssne_untouched%3DRotorua%3Btop_ufis%3D1%26%3B&ss=Rainbow+Holiday+House%2C+Rotorua%2C+Bay+of+Plenty%2C+New+Zealand&is_ski_area=&ssne=Rotorua&ssne_untouched=Rotorua&city=900039038&checkin_year=2020&checkin_month=7&checkin_monthday=13&checkout_year=2020&checkout_month=7&checkout_monthday=14&group_adults=1&group_children=0&no_rooms=1&from_sf=1&ss_raw=rainbow+holiday+house&ac_position=0&ac_langcode=en&ac_click_type=b&dest_id=5288382&dest_type=hotel&place_id_lat=-38.105882&place_id_lon=176.239504&search_pageview_id=a53e49abc8b20230&search_selected=true&search_pageview_id=a53e49abc8b20230&ac_suggestion_list_length=3&ac_suggestion_theme_list_length=0',
            wait_time=5,
            # NOTE(review): scrapy-selenium appears to perform its explicit
            # wait only when wait_until is supplied; wait_time on its own
            # does not delay anything. Confirm against the installed version.
            wait_until=EC.presence_of_element_located((By.XPATH, self.card_xpath)),
            callback=self.parse,
        )

    def parse(self, response):
        """Yield one ``{'name': ...}`` item per result card in ``response``.

        The name is stripped of surrounding whitespace; it is ``None`` when
        the card has no matching title node.
        """
        for c in response.xpath(self.card_xpath):
            name = c.xpath('./div/div/div/div/h3/a/span[1]/text()').get()
            # .get() returns None when nothing matched, so guard before strip.
            yield {'name': name.strip() if name is not None else None}

Я действительно не могу этого понять, потому что единственное отличие — это использование SeleniumRequest. Если у кого-то есть идеи, было бы здорово ...

Спасибо

1 Ответ

1 голос
/ 12 июля 2020

Это сработало для меня. Предполагаю, что проблема может быть связана с вашим XPath в переменной cards: в нём слишком много пробелов и т. д. Для длинных значений атрибутов, которые хочу выбрать, я обычно использую contains(@class,'ATTRIBUTE').

Код

class TestSpider(scrapy.Spider):
    """Yield property names from a Booking.com search-results page.

    The first entry of ``start_urls`` is fetched through scrapy-selenium's
    ``SeleniumRequest`` and each result card produces one ``{'name': ...}``
    item.
    """

    name = 'test'
    allowed_domains = ['booking.com']
    start_urls = ['https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ&lang=en-gb&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKPtaX4BcACAdICJDE5OWNlMDNkLWJjZTgtNGQyNC04ZjYwLTZiOTZmOTllYTZlN9gCBOACAQ%3Btmpl%3Dsearchresults%3Bac_click_type%3Db%3Bac_position%3D0%3Bcheckin_month%3D7%3Bcheckin_monthday%3D13%3Bcheckin_year%3D2020%3Bcheckout_month%3D7%3Bcheckout_monthday%3D14%3Bcheckout_year%3D2020%3Bcity%3D900039038%3Bclass_interval%3D1%3Bdest_id%3D1855490%3Bdest_type%3Dhotel%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D1%3Bgroup_children%3D0%3Bhighlighted_hotels%3D1855490%3Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcard%3D0%3Braw_dest_type%3Dhotel%3Broom1%3DA%3Bsb_price_type%3Dtotal%3Bsearch_selected%3D1%3Bshw_aparth%3D1%3Bslp_r_match%3D0%3Bsrc%3Dsearchresults%3Bsrc_elem%3Dsb%3Bsrpvid%3Da53e49abc8b20230%3Bss%3DKiwi%2520As%2520Guest%2520House%252C%2520Rotorua%252C%2520Bay%2520of%2520Plenty%252C%2520New%2520Zealand%3Bss_all%3D0%3Bss_raw%3Dkiwias%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DRotorua%3Bssne_untouched%3DRotorua%3Btop_ufis%3D1%26%3B&ss=Rainbow+Holiday+House%2C+Rotorua%2C+Bay+of+Plenty%2C+New+Zealand&is_ski_area=&ssne=Rotorua&ssne_untouched=Rotorua&city=900039038&checkin_year=2020&checkin_month=7&checkin_monthday=13&checkout_year=2020&checkout_month=7&checkout_monthday=14&group_adults=1&group_children=0&no_rooms=1&from_sf=1&ss_raw=rainbow+holiday+house&ac_position=0&ac_langcode=en&ac_click_type=b&dest_id=5288382&dest_type=hotel&place_id_lat=-38.105882&place_id_lon=176.239504&search_pageview_id=a53e49abc8b20230&search_selected=true&search_pageview_id=a53e49abc8b20230&ac_suggestion_list_length=3&ac_suggestion_theme_list_length=0']

    # Match result cards by the single class token "sr_item" instead of a
    # substring that depends on a double space inside the class attribute.
    card_xpath = '//div[contains(concat(" ", normalize-space(@class), " "), " sr_item ")]'

    def start_requests(self):
        """Issue the Selenium-backed request for the first start URL."""
        # Function-scope imports so this class does not depend on extra
        # module-level import lines.
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC

        yield SeleniumRequest(
            url=self.start_urls[0],
            wait_time=5,
            # NOTE(review): scrapy-selenium appears to perform its explicit
            # wait only when wait_until is supplied; wait_time on its own
            # does not delay anything. Confirm against the installed version.
            wait_until=EC.presence_of_element_located((By.XPATH, self.card_xpath)),
            callback=self.parse,
        )

    def parse(self, response):
        """Yield one ``{'name': ...}`` item per result card in ``response``.

        The name is stripped of surrounding whitespace; it is ``None`` when
        the card has no span whose class contains "hotel".
        """
        for c in response.xpath(self.card_xpath):
            title = c.xpath('.//span[contains(@class,"hotel")]/text()').get()
            # .get() returns None when nothing matched; calling .strip() on it
            # would raise AttributeError and abort the remaining cards.
            yield {'name': title.strip() if title is not None else None}
...