scrapy-splash gives me this error: "HTTP status code is not handled or not allowed"
0 votes / March 31, 2020
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest

from ..items import Tutorial2Item

class MySpider(Spider):
    name = 'splashspider'
    start_urls = ['https://www.livescore.bet3000.com'] #FIRST LEVEL

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(
                url=url,
                callback=self.parse,
                meta={'splash': {'endpoint': 'render.js', 'args': {'wait': 0.5}}},
            )

    # 1. SCRAPING
    def parse(self, response):
        item = Tutorial2Item()

        for game in response.xpath("//div[@id='srlive_matchlist']"):
            item["home_team"] = game.xpath("//div[@id='srlive_matchlist']//td[contains(@class,'hometeam team home')][contains(text(),'San Marcos Arica')]").extract_first()
            item["away_team"] = game.xpath("//div[@id='srlive_matchlist']//td[contains(@class,'awayteam team away')][contains(text(),'Boston River')]").extract_first()
            yield item
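
A side note for anyone comparing this against the scrapy-splash docs: endpoint and args can be passed to SplashRequest directly instead of going through meta['splash'], and Splash's rendering endpoints are render.html, render.png, render.jpeg, render.har, render.json, execute, and run; render.js is not one of them. A minimal sketch of the same request, assuming the default render.html endpoint is what was intended (the spider name here is hypothetical, for illustration only):

from scrapy.spiders import Spider
from scrapy_splash import SplashRequest

class SketchSpider(Spider):
    # Hypothetical spider name, for illustration only.
    name = 'sketchspider'
    start_urls = ['https://www.livescore.bet3000.com']

    def start_requests(self):
        for url in self.start_urls:
            # endpoint/args go directly on SplashRequest; 'render.html' is
            # Splash's HTML endpoint ('render.js' does not exist and 404s).
            yield SplashRequest(
                url=url,
                callback=self.parse,
                endpoint='render.html',
                args={'wait': 0.5},
            )

    def parse(self, response):
        self.logger.info('Rendered %s (%d bytes)', response.url, len(response.body))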

and settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for tutorial2 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'tutorial2'

SPIDER_MODULES = ['tutorial2.spiders']
NEWSPIDER_MODULE = 'tutorial2.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial2 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
#handle_httpstatus_list = [404]
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'tutorial2.middlewares.Tutorial2SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'tutorial2.middlewares.Tutorial2DownloaderMiddleware': 543,
#}

#DOWNLOADER_MIDDLEWARES = {
#    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
#    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'tutorial2.pipelines.Tutorial2Pipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPLASH_URL = 'http://localhost:8050'

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
#HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
#USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.7'
#DOWNLOAD_DELAY = 0.25
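
Two notes on this file. The commented-out handle_httpstatus_list = [404] near the top is a spider class attribute, not a setting, so uncommenting it here would have no effect. If the goal is to let the 404 response reach parse() for debugging rather than having HttpErrorMiddleware drop it, a sketch of both options, assuming 404 is the only status of interest:

# settings.py sketch: project-wide, let 404 responses reach spider
# callbacks instead of being dropped by HttpErrorMiddleware.
HTTPERROR_ALLOWED_CODES = [404]

# Equivalent per-spider form (goes on the spider class, not in settings.py):
# class MySpider(Spider):
#     handle_httpstatus_list = [404]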

I have been trying for many days, but I cannot find a solution, and it keeps giving me this error. Either there is some mistake in my code, or there is some other problem that I cannot figure out. Here is the crawl output:

(scrapy-projects) danish-khan@danishkhan-VirtualBox:~/PycharmProjects/scrapy-projects/tutorial2$ scrapy crawl splashspider
2020-03-30 16:35:19 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: tutorial2)
2020-03-30 16:35:20 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.7.6 (default, Jan  8 2020, 19:59:22) - [GCC 7.3.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Linux-4.15.0-91-generic-x86_64-with-debian-stretch-sid
2020-03-30 16:35:20 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'tutorial2', 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter', 'NEWSPIDER_MODULE': 'tutorial2.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['tutorial2.spiders']}
2020-03-30 16:35:20 [scrapy.extensions.telnet] INFO: Telnet Password: b43580967da382d6
2020-03-30 16:35:21 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2020-03-30 16:35:21 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy_splash.SplashCookiesMiddleware',
 'scrapy_splash.SplashMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-03-30 16:35:21 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy_splash.SplashDeduplicateArgsMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-03-30 16:35:21 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2020-03-30 16:35:21 [scrapy.core.engine] INFO: Spider opened
2020-03-30 16:35:21 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-03-30 16:35:21 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-03-30 16:35:23 [scrapy.core.engine] DEBUG: Crawled (404) <GET https://www.livescore.bet3000.com/robots.txt> (referer: None)
2020-03-30 16:35:23 [scrapy.core.engine] DEBUG: Crawled (404) <GET http://localhost:8050/robots.txt> (referer: None)
2020-03-30 16:35:23 [scrapy.core.engine] DEBUG: Crawled (404) <GET https://www.livescore.bet3000.com via http://localhost:8050/render.js> (referer: None)
2020-03-30 16:35:24 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <404 https://www.livescore.bet3000.com>: HTTP status code is not handled or not allowed
2020-03-30 16:35:24 [scrapy.core.engine] INFO: Closing spider (finished)
2020-03-30 16:35:24 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 970,
 'downloader/request_count': 3,
 'downloader/request_method_count/GET': 2,
 'downloader/request_method_count/POST': 1,
 'downloader/response_bytes': 1116,
 'downloader/response_count': 3,
 'downloader/response_status_count/404': 3,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2020, 3, 30, 11, 35, 24, 28203),
 'httperror/response_ignored_count': 1,
 'httperror/response_ignored_status_count/404': 1,
 'log_count/DEBUG': 3,
 'log_count/INFO': 10,
 'memusage/max': 54149120,
 'memusage/startup': 54149120,
 'response_received_count': 3,
 'robotstxt/request_count': 2,
 'robotstxt/response_count': 2,
 'robotstxt/response_status_count/404': 2,
 'scheduler/dequeued': 2,
 'scheduler/dequeued/memory': 2,
 'scheduler/enqueued': 2,
 'scheduler/enqueued/memory': 2,
 'splash/render.js/request_count': 1,
 'splash/render.js/response_count/404': 1,
 'start_time': datetime.datetime(2020, 3, 30, 11, 35, 21, 853911)}
2020-03-30 16:35:24 [scrapy.core.engine] INFO: Spider closed (finished)
...
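
The log already points at the likely culprit: the request went via http://localhost:8050/render.js and Splash itself answered 404, because render.js is not a Splash endpoint. One quick way to verify this outside Scrapy, assuming Splash is running on localhost:8050, is to call the Splash HTTP API directly; a sketch using requests:

import requests

# render.html is Splash's HTML-rendering endpoint; render.js does not
# exist, so Splash answers it with 404 (the status Scrapy then ignores).
for endpoint in ('render.js', 'render.html'):
    resp = requests.get(
        'http://localhost:8050/' + endpoint,
        params={'url': 'https://www.livescore.bet3000.com', 'wait': 0.5},
    )
    print(endpoint, resp.status_code)

The two robots.txt 404s in the log, by contrast, are harmless: Scrapy treats a missing robots.txt as allow-all, so ROBOTSTXT_OBEY = True is not what blocks this crawl.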