I tried scrapy-splash with http://www.google.com, followed all the steps listed in the following GitHub repository https://github.com/scrapy-plugins/scrapy-splash, and was able to render the Google page.
However, when I tried the same http://www.google.com after integrating Crawlera with scrapy-splash as described in the following GitHub repository https://github.com/scrapinghub/sample-projects/tree/master/splash_crawlera_example, I always get a 504 Timeout exception.
The default example URL http://quotes.toscrape.com/js/ mentioned in splash_crawlera_example is crawled successfully, but Google is not. Is there something that needs to change in the script to render the Google page?
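For what it is worth, a minimal sanity check (my own sketch, not from either repository) is to fetch Google through Crawlera directly, without Splash, to see whether the proxy side alone times out; <APIKEY> is a placeholder for the Crawlera API key:

    import requests

    # Sketch: plain-HTTP fetch of google.com through the Crawlera proxy,
    # bypassing Splash entirely. allow_redirects=False avoids following the
    # redirect to HTTPS, which would also require Crawlera's CA certificate.
    proxies = {'http': 'http://<APIKEY>:@proxy.crawlera.com:8010/'}
    r = requests.get('http://www.google.com', proxies=proxies,
                     timeout=60, allow_redirects=False)
    print(r.status_code, r.headers.get('X-Crawlera-Error'))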
Here is quotes-js.py:
from pkgutil import get_data
import scrapy
from scrapy_splash import SplashRequest
from w3lib.http import basic_auth_header


class QuotesJsSpider(scrapy.Spider):
    name = 'quotes-js'

    def __init__(self, *args, **kwargs):
        # to be able to load the Lua script on Scrapy Cloud, make sure your
        # project's setup.py file contains the "package_data" setting, similar
        # to this project's setup.py
        self.LUA_SOURCE = get_data(
            'splash_crawlera_example', 'scripts/crawlera.lua'
        ).decode('utf-8')
        super(QuotesJsSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        yield SplashRequest(
            # url='http://quotes.toscrape.com/js/',
            url='http://www.google.com',
            endpoint='execute',
            splash_headers={
                'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
            },
            args={
                'lua_source': self.LUA_SOURCE,
                'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                'wait': 0.5, 'viewport': '1024x2480', 'images': 0, 'timeout': 90,
            },
            # tell Splash to cache the lua script, to avoid sending it for every request
            cache_args=['lua_source'],
        )

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
        next_page = response.css('li.next > a::attr(href)').extract_first()
        if next_page:
            yield SplashRequest(
                url=response.urljoin(next_page),
                endpoint='execute',
                splash_headers={
                    'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
                },
                args={
                    'lua_source': self.LUA_SOURCE,
                    'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                },
                cache_args=['lua_source'],
            )
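For debugging, one option (my own sketch, not part of the sample project; splash_errback is a name I made up) is to add an errback method to QuotesJsSpider and pass errback=self.splash_errback on each SplashRequest, so the body of the 504 response gets logged instead of being discarded:

    # needs: from scrapy.spidermiddlewares.httperror import HttpError
    def splash_errback(self, failure):
        # Splash sends a JSON body describing the error (e.g. a render
        # timeout inside the Lua script) along with 4xx/5xx statuses.
        if failure.check(HttpError):
            response = failure.value.response
            self.logger.error('Splash error %s: %s',
                              response.status, response.text[:500])
        else:
            self.logger.error(repr(failure))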
And here is settings.py:
# -*- coding: utf-8 -*-
BOT_NAME = 'splash_crawlera_example'
SPIDER_MODULES = ['splash_crawlera_example.spiders']
NEWSPIDER_MODULE = 'splash_crawlera_example.spiders'

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawlera.CrawleraMiddleware': 300,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

CRAWLERA_APIKEY = ''  # Your Crawlera API key

# Splash settings
SPLASH_URL = 'http://localhost:8050/'  # Splash instance URL from Scrapy Cloud
SPLASH_APIKEY = ''  # Your API key for the Splash instance hosted on Scrapy Cloud
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

CONCURRENT_REQUESTS = 100
CONCURRENT_REQUESTS_PER_DOMAIN = 100
AUTOTHROTTLE_ENABLED = False
DOWNLOAD_TIMEOUT = 1800
DOWNLOAD_DELAY = 1
DEFAULT_HEADERS = {
    'X-Crawlera-Max-Retries': 0
}
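One thing worth noting (my own note, not from the example project): the spider passes 'timeout': 90 to Splash, but Splash caps that argument at its --max-timeout option, which defaults to 60, so a self-hosted instance has to be started with a higher cap for 90 to be accepted, for example:

    docker run -p 8050:8050 scrapinghub/splash --max-timeout 300

Scrapy's DOWNLOAD_TIMEOUT (1800 above) already exceeds the Splash-side timeout, so it should be Splash rather than Scrapy that reports the 504.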
Finally, here is crawlera.lua:
function use_crawlera(splash)
    -- Make sure you pass your Crawlera API key in the 'crawlera_user' arg.
    -- Have a look at the file spiders/quotes-js.py to see how to do it.
    -- Find your Crawlera credentials in https://app.scrapinghub.com/
    local user = splash.args.crawlera_user
    local host = 'proxy.crawlera.com'
    local port = 8010
    local session_header = 'X-Crawlera-Session'
    local session_id = 'create'

    splash:on_request(function (request)
        -- The commented code below can be used to speed up the crawling
        -- process. These rules filter requests to undesired domains and
        -- useless resources. Uncomment the ones that make sense for your
        -- use case and add your own.

        -- Discard requests to advertising and tracking domains.
        -- if string.find(request.url, 'doubleclick%.net') or
        --    string.find(request.url, 'analytics%.google%.com') then
        --     request.abort()
        --     return
        -- end

        -- Avoid using Crawlera for subresource fetches to increase crawling
        -- speed. The example below avoids using Crawlera for URLs starting
        -- with 'static.' and ones ending with '.png'.
        -- if string.find(request.url, '://static%.') ~= nil or
        --    string.find(request.url, '%.png$') ~= nil then
        --     return
        -- end

        request:set_header('X-Crawlera-Cookies', 'disable')
        request:set_header(session_header, session_id)
        request:set_proxy{host, port, username=user, password=''}
    end)

    splash:on_response_headers(function (response)
        -- Update the session id from the response header when present.
        if response.headers[session_header] ~= nil then
            session_id = response.headers[session_header]
        end
    end)
end
function main(splash)
    use_crawlera(splash)
    splash:go(splash.args.url)
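    -- The spider passes a 'wait' arg, but nothing in this script consumes
    -- it, so Splash may return before Google's JS has finished rendering.
    -- Tentative addition (assumes 'wait' is present in splash.args):
    splash:wait(splash.args.wait or 0.5)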
    return splash:html()
end