from scrapy.spiders import Spider
from scrapy_splash import SplashRequest

from ..items import Tutorial2Item


class MySpider(Spider):
    name = 'splashspider'
    start_urls = ['https://www.livescore.bet3000.com']  # FIRST LEVEL

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse,
                                meta={'splash': {'endpoint': 'render.js',
                                                 'args': {'wait': 0.5}}})
    # 1. SCRAPING
    def parse(self, response):
        for game in response.xpath("//div[@id='srlive_matchlist']"):
            item = Tutorial2Item()  # fresh item for each yielded result
            # ".//" keeps the query relative to the current `game` node;
            # a leading "//" would search the whole document again
            item["home_team"] = game.xpath(".//td[contains(@class,'hometeam team home')][contains(text(),'San Marcos Arica')]/text()").extract_first()
            item["away_team"] = game.xpath(".//td[contains(@class,'awayteam team away')][contains(text(),'Boston River')]/text()").extract_first()
            yield item
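For comparison, scrapy-splash's SplashRequest also accepts the endpoint and args as keyword arguments, and the Splash HTTP API documents render.html, render.png, render.json, execute, and run, but no render.js, which may be what Splash is answering 404 to. A minimal sketch (spider and class names are made up for illustration):

from scrapy.spiders import Spider
from scrapy_splash import SplashRequest

class EndpointSketchSpider(Spider):
    name = 'splashspider_sketch'  # hypothetical name
    start_urls = ['https://www.livescore.bet3000.com']

    def start_requests(self):
        for url in self.start_urls:
            # endpoint/args can be passed straight to SplashRequest;
            # 'render.html' is a documented Splash endpoint, 'render.js' is not
            yield SplashRequest(url=url, callback=self.parse,
                                endpoint='render.html', args={'wait': 0.5})

    def parse(self, response):
        self.logger.info("rendered %s (%s)", response.url, response.status)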
And settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for tutorial2 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'tutorial2'
SPIDER_MODULES = ['tutorial2.spiders']
NEWSPIDER_MODULE = 'tutorial2.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial2 (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
#handle_httpstatus_list = [404]
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tutorial2.middlewares.Tutorial2SpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tutorial2.middlewares.Tutorial2DownloaderMiddleware': 543,
#}
#DOWNLOADER_MIDDLEWARES = {
# 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
# 'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'tutorial2.pipelines.Tutorial2Pipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPLASH_URL = 'http://localhost:8050'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
#HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
#USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.7'
#DOWNLOAD_DELAY = 0.25
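Before rerunning the spider, it may be worth checking the Splash instance behind SPLASH_URL directly, bypassing Scrapy entirely. A sketch using only the standard library (it assumes Splash is running locally on the default port):

import urllib.request, urllib.error

def splash_status(endpoint):
    # hit the local Splash instance directly with the same target URL
    url = ("http://localhost:8050/%s?url=https://www.livescore.bet3000.com&wait=0.5"
           % endpoint)
    try:
        with urllib.request.urlopen(url) as resp:
            return resp.status
    except urllib.error.HTTPError as e:
        return e.code

print(splash_status("render.html"))  # documented endpoint; expect 200
print(splash_status("render.js"))    # expect 404, matching the crawl log below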
I have been trying for many days, but I cannot find a solution, and it keeps giving me this error. Either there is some mistake in my code, or there is some other problem that I cannot figure out.
(scrapy-projects) danish-khan@danishkhan-VirtualBox:~/PycharmProjects/scrapy-projects/tutorial2$ scrapy crawl splashspider
2020-03-30 16:35:19 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: tutorial2)
2020-03-30 16:35:20 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.7.6 (default, Jan 8 2020, 19:59:22) - [GCC 7.3.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d 10 Sep 2019), cryptography 2.8, Platform Linux-4.15.0-91-generic-x86_64-with-debian-stretch-sid
2020-03-30 16:35:20 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'tutorial2', 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter', 'NEWSPIDER_MODULE': 'tutorial2.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['tutorial2.spiders']}
2020-03-30 16:35:20 [scrapy.extensions.telnet] INFO: Telnet Password: b43580967da382d6
2020-03-30 16:35:21 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.logstats.LogStats']
2020-03-30 16:35:21 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy_splash.SplashCookiesMiddleware',
'scrapy_splash.SplashMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-03-30 16:35:21 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy_splash.SplashDeduplicateArgsMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-03-30 16:35:21 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2020-03-30 16:35:21 [scrapy.core.engine] INFO: Spider opened
2020-03-30 16:35:21 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-03-30 16:35:21 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-03-30 16:35:23 [scrapy.core.engine] DEBUG: Crawled (404) <GET https://www.livescore.bet3000.com/robots.txt> (referer: None)
2020-03-30 16:35:23 [scrapy.core.engine] DEBUG: Crawled (404) <GET http://localhost:8050/robots.txt> (referer: None)
2020-03-30 16:35:23 [scrapy.core.engine] DEBUG: Crawled (404) <GET https://www.livescore.bet3000.com via http://localhost:8050/render.js> (referer: None)
2020-03-30 16:35:24 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <404 https://www.livescore.bet3000.com>: HTTP status code is not handled or not allowed
2020-03-30 16:35:24 [scrapy.core.engine] INFO: Closing spider (finished)
2020-03-30 16:35:24 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 970,
'downloader/request_count': 3,
'downloader/request_method_count/GET': 2,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 1116,
'downloader/response_count': 3,
'downloader/response_status_count/404': 3,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 3, 30, 11, 35, 24, 28203),
'httperror/response_ignored_count': 1,
'httperror/response_ignored_status_count/404': 1,
'log_count/DEBUG': 3,
'log_count/INFO': 10,
'memusage/max': 54149120,
'memusage/startup': 54149120,
'response_received_count': 3,
'robotstxt/request_count': 2,
'robotstxt/response_count': 2,
'robotstxt/response_status_count/404': 2,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'splash/render.js/request_count': 1,
'splash/render.js/response_count/404': 1,
'start_time': datetime.datetime(2020, 3, 30, 11, 35, 21, 853911)}
2020-03-30 16:35:24 [scrapy.core.engine] INFO: Spider closed (finished)
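One detail in the stats may help narrow this down: the line 'splash/render.js/response_count/404': 1 shows the 404 is returned by Splash's render.js endpoint itself, not by the target site. To inspect the body of such responses instead of having HttpErrorMiddleware drop them ("Ignoring response <404 ...>"), Scrapy spiders support a handle_httpstatus_list attribute; a sketch with a made-up spider name:

from scrapy.spiders import Spider

class DebugSpider(Spider):
    name = 'splashspider_debug'  # hypothetical name, for illustration only
    # let HttpErrorMiddleware pass 404 responses through to the callback
    # instead of dropping them, so the error body can be inspected
    handle_httpstatus_list = [404]

    def parse(self, response):
        self.logger.info("got %s from %s", response.status, response.url)
        self.logger.info(response.text[:500])  # start of the error page body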