504 Timeout exception when using scrapy-splash with crawlera
1 vote
/ May 26, 2020

I tried scrapy-splash with http://www.google.com, followed all the required steps listed in the GitHub repository https://github.com/scrapy-plugins/scrapy-splash, and managed to render the Google page.

However, when I tried the same http://www.google.com after integrating crawlera with scrapy-splash, as described in the GitHub repository https://github.com/scrapinghub/sample-projects/tree/master/splash_crawlera_example, I always get a 504 Timeout exception.

The default example URL http://quotes.toscrape.com/js/ mentioned in splash_crawlera_example is crawled successfully, but Google is not. Is there anything in the script that needs to change in order to render the Google page?
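For context, the plain scrapy-splash setup (without crawlera) that did render Google followed the scrapy-splash README. A minimal sketch of it is below; the spider name and callback are illustrative placeholders, not part of the actual project:

import scrapy
from scrapy_splash import SplashRequest


class GoogleRenderSpider(scrapy.Spider):
    # Illustrative spider, not part of splash_crawlera_example.
    name = 'google-render'

    def start_requests(self):
        # Default endpoint is 'render.html': Splash simply returns the rendered HTML.
        yield SplashRequest(
            url='http://www.google.com',
            callback=self.parse,
            args={'wait': 0.5},
        )

    def parse(self, response):
        self.logger.info('Rendered %d bytes of HTML', len(response.text))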

Here is quotes-js.py:

from pkgutil import get_data
import scrapy
from scrapy_splash import SplashRequest
from w3lib.http import basic_auth_header


class QuotesJsSpider(scrapy.Spider):
    name = 'quotes-js'

    def __init__(self, *args, **kwargs):
        # to be able to load the Lua script on Scrapy Cloud, make sure your
        # project's setup.py file contains the "package_data" setting, similar
        # to this project's setup.py
        self.LUA_SOURCE = get_data(
            'splash_crawlera_example', 'scripts/crawlera.lua'
        ).decode('utf-8')
        super(QuotesJsSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        yield SplashRequest(
            # url='http://quotes.toscrape.com/js/',
            url='http://www.google.com',
            endpoint='execute',
            splash_headers={
                'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
            },
            args={
                'lua_source': self.LUA_SOURCE,
                'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                'wait': 0.5, 'viewport': '1024x2480', 'images': 0, 'timeout': 90
            },
            # tell Splash to cache the lua script, to avoid sending it for every request
            cache_args=['lua_source'],
        )

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
        next_page = response.css('li.next > a::attr(href)').extract_first()
        if next_page:
            yield SplashRequest(
                url=response.urljoin(next_page),
                endpoint='execute',
                splash_headers={
                    'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
                },
                args={
                    'lua_source': self.LUA_SOURCE,
                    'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                },
                cache_args=['lua_source'],
            )
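One way to narrow the problem down is to make the target URL a spider argument, so the exact same Splash + Crawlera pipeline can be pointed at the working quotes.toscrape.com URL and at Google from the command line. This is a hypothetical diagnostic variant of start_requests (a drop-in replacement inside the spider above, reusing its existing imports), not part of the original project:

    # Diagnostic variant:
    #   scrapy crawl quotes-js -a target_url=http://quotes.toscrape.com/js/
    #   scrapy crawl quotes-js -a target_url=http://www.google.com
    def start_requests(self):
        # Spider arguments passed with -a become attributes on the spider.
        target = getattr(self, 'target_url', 'http://www.google.com')
        yield SplashRequest(
            url=target,
            endpoint='execute',
            splash_headers={
                'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
            },
            args={
                'lua_source': self.LUA_SOURCE,
                'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                'wait': 0.5, 'viewport': '1024x2480', 'images': 0, 'timeout': 90,
            },
            cache_args=['lua_source'],
        )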

settings.py

# -*- coding: utf-8 -*-

BOT_NAME = 'splash_crawlera_example'
SPIDER_MODULES = ['splash_crawlera_example.spiders']
NEWSPIDER_MODULE = 'splash_crawlera_example.spiders'

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawlera.CrawleraMiddleware': 300,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

CRAWLERA_APIKEY = ''  # Your crawlera API key

# Splash settings
SPLASH_URL = 'http://localhost:8050/'     # Splash instance URL from Scrapy Cloud
SPLASH_APIKEY = ''  # Your API key for the Splash instance hosted on Scrapy Cloud
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'


CONCURRENT_REQUESTS = 100
CONCURRENT_REQUESTS_PER_DOMAIN = 100
AUTOTHROTTLE_ENABLED = False
DOWNLOAD_TIMEOUT = 1800
DOWNLOAD_DELAY = 1

DEFAULT_HEADERS = {
    'X-Crawlera-Max-Retries': 0
}
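To rule out the Crawlera credentials themselves, one standalone check (not part of the project; the target URL is just the working example site) is to send a single request straight through the Crawlera proxy with plain requests, bypassing Splash entirely:

import requests

CRAWLERA_APIKEY = ''  # same value as in settings.py

# Crawlera is used as an HTTP proxy: API key as username, empty password.
proxies = {
    'http': 'http://{}:@proxy.crawlera.com:8010'.format(CRAWLERA_APIKEY),
}

resp = requests.get('http://quotes.toscrape.com/js/', proxies=proxies, timeout=60)
print(resp.status_code, resp.headers.get('X-Crawlera-Error'))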

crawlera.lua

function use_crawlera(splash)
    -- Make sure you pass your Crawlera API key in the 'crawlera_user' arg.
    -- Have a look at the file spiders/quotes-js.py to see how to do it.
    -- Find your Crawlera credentials in https://app.scrapinghub.com/
    local user = splash.args.crawlera_user

    local host = 'proxy.crawlera.com'
    local port = 8010
    local session_header = 'X-Crawlera-Session'
    local session_id = 'create'

    splash:on_request(function (request)
        -- The commented code below can be used to speed up the crawling
        -- process. They filter requests to undesired domains and useless
        -- resources. Uncomment the ones that make sense to your use case
        -- and add your own rules.

        -- Discard requests to advertising and tracking domains.
        -- if string.find(request.url, 'doubleclick%.net') or
        --    string.find(request.url, 'analytics%.google%.com') then
        --     request.abort()
        --     return
        -- end

        -- Avoid using Crawlera for subresources fetching to increase crawling
        -- speed. The example below avoids using Crawlera for URLS starting
        -- with 'static.' and the ones ending with '.png'.
        -- if string.find(request.url, '://static%.') ~= nil or
        --    string.find(request.url, '%.png$') ~= nil then
        --     return
        -- end

        request:set_header('X-Crawlera-Cookies', 'disable')
        request:set_header(session_header, session_id)
        request:set_proxy{host, port, username=user, password=''}
    end)

    splash:on_response_headers(function (response)
        -- note: compare the header value itself, not type(), which never returns nil
        if response.headers[session_header] ~= nil then
            session_id = response.headers[session_header]
        end
    end)
end

function main(splash)
    use_crawlera(splash)
    splash:go(splash.args.url)
    return splash:html()
end
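Finally, to see whether the 504 originates in Splash/Crawlera rather than in Scrapy, here is a minimal sketch that posts this same Lua script directly to Splash's /execute HTTP endpoint. It assumes the same SPLASH_URL, SPLASH_APIKEY and CRAWLERA_APIKEY values as in settings.py, and that crawlera.lua is readable at the path used by the project's get_data() call:

import requests

SPLASH_URL = 'http://localhost:8050/'   # same as SPLASH_URL in settings.py
SPLASH_APIKEY = ''                      # same as in settings.py
CRAWLERA_APIKEY = ''                    # same as in settings.py

# Path as referenced by get_data('splash_crawlera_example', 'scripts/crawlera.lua').
with open('splash_crawlera_example/scripts/crawlera.lua') as f:
    lua_source = f.read()

resp = requests.post(
    SPLASH_URL + 'execute',
    auth=(SPLASH_APIKEY, ''),           # Splash basic auth: API key as username
    json={
        'url': 'http://www.google.com',
        'lua_source': lua_source,
        'crawlera_user': CRAWLERA_APIKEY,
        'timeout': 90,
    },
    timeout=120,
)
print(resp.status_code)
print(resp.text[:500])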