После запуска моего Scrapy-паука для Amazon на сервере я получаю ошибку 503 Service Unavailable
0 голосов
/ 21 января 2020

После развертывания на сервере моего проекта Scrapy для Amazon я получаю эту ошибку [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://www.amazon.com/s?k=rings&i=aps&ref=nb_sb_noss_2&&rh=> (failed 2 times): 503 Service Unavailable

При запуске проекта на моей локальной машине эта ошибка отсутствует, все работает нормально. Появляется только после загрузки на сервер. Сам проект развернут в контейнере docker.

У меня также настроена ротация прокси, но я все еще не могу понять, почему этот проект работает локально, но после запуска на сервере я получаю эту ошибку.

Пожалуйста, помогите разобраться, в чём причина этой ошибки, или подскажите, что я делаю не так.

 # -*- coding: utf-8 -*-

    # Scrapy settings for amazon_filters project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://docs.scrapy.org/en/latest/topics/settings.html
    #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    import random

    from config.config import MongoConfiguration

    BOT_NAME = 'amazon'

    SPIDER_MODULES = ['amazon_filters.spiders']
    NEWSPIDER_MODULE = 'amazon_filters.spiders'

DEFAULT_REQUEST_HEADERS = {
    'Referer': 'https://www.google.com/search?q=amazon',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}


    USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = True

    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    CONCURRENT_REQUESTS = 16

    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    DOWNLOAD_DELAY = 5
    # The download delay setting will honor only one of:
    CONCURRENT_REQUESTS_PER_DOMAIN = 1
    CONCURRENT_REQUESTS_PER_IP = 1

    # Disable cookies (enabled by default)
    COOKIES_ENABLED = True

    # Disable Telnet Console (enabled by default)
    # TELNETCONSOLE_ENABLED = False

    # Override the default request headers:


    # Enable or disable spider middlewares
    # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    SPIDER_MIDDLEWARES = {
       'amazon_filters.middlewares.AmazonFiltersDownloaderMiddleware': 543,
    }
    # RETRY_HTTP_CODES = [503, 301]

    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html

    DOWNLOADER_MIDDLEWARES = {



        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
        'scrapy_rotated_proxy.downloadmiddlewares.proxy.RotatedProxyMiddleware': 750,
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
        'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
        'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
        'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 401
    }

    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    # EXTENSIONS = {
    # #    'scrapy.extensions.telnet.TelnetConsole': None,
    # # }

    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
        'amazon_filters.pipelines.MongoDbPipeline': 300,
    }

    ROTATED_PROXY_ENABLED = True
    PROXY_STORAGE = 'scrapy_rotated_proxy.extensions.file_storage.FileProxyStorage'
    # When set PROXY_FILE_PATH='', scrapy-rotated-proxy
    # will use proxy in Spider Settings default.
    PROXY_FILE_PATH = ''
    HTTP_PROXIES = [
      'http://1b9f350133:eQSR0tlB@23.229.73.155:4444'
    ]



    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    # AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    # AUTOTHROTTLE_START_DELAY = 10
    # The maximum download delay to be set in case of high latencies
    # AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    # AUTOTHROTTLE_TARGET_CONCURRENCY = 0.5
    # Enable showing throttling stats for every response received:
    # AUTOTHROTTLE_DEBUG = True


    # Enable and configure HTTP caching (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    # HTTPCACHE_ENABLED = True
    # HTTPCACHE_EXPIRATION_SECS = 0
    # HTTPCACHE_DIR = 'httpcache'
    # HTTPCACHE_IGNORE_HTTP_CODES = []
    # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

    # -----------------------------------------------------------------------------
    # ROTATED PROXY SETTINGS (Spider Settings Backend)
    # -----------------------------------------------------------------------------
    RANDOM_UA_PER_PROXY = True
    ROTATED_PROXY_ENABLED = True

    #
    # # When set PROXY_FILE_PATH='', scrapy-rotated-proxy
    # # will use proxy in Spider Settings default.
    # PROXY_FILE_PATH = 'proxy.txt'
    RETRY_TIMES = 5
    DOWNLOAD_TIMEOUT = 5

    # PROXY_STORAGE = 'scrapy_rotated_proxy.extensions.file_storage.FileProxyStorage'
    # HTTP_PROXIES = [
    #     'http://1b9f350133:eQSR0tlB@104.144.22.240:4444',
    #     'http://1b9f350133:eQSR0tlB@138.128.11.75:4444',
    #     'http://1b9f350133:eQSR0tlB@23.236.221.83:4444',
    #     'http://1b9f350133:eQSR0tlB@172.245.64.226:4444',
    #     'http://1b9f350133:eQSR0tlB@198.12.80.181:4444',
    #     ,
    #     'http://1b9f350133:eQSR0tlB@23.254.17.84:4444',
    #     'http://1b9f350133:eQSR0tlB@45.72.71.138:4444'
    # ]

    # TIMEZONE = 'America/Los_Angeles'
    # ROTATING_PROXY_BAN_POLICY = 'amazon_filters.policy.MyPolicy'
    # PROXY_STORAGE = 'scrapy_rotated_proxy.extensions.mongodb_storage.MongoDBProxyStorage'


    # PROXY_MONGODB_HOST = MongoConfiguration.MONGO_DB
    # PROXY_MONGODB_PORT = MongoConfiguration.MONGO_PORT
    # PROXY_MONGODB_USERNAME = MongoConfiguration.MONGO_USERNAME
    # PROXY_MONGODB_PASSWORD = MongoConfiguration.MONGO_PASSWORD
    #
    # PROXY_MONGODB_AUTH_DB = MongoConfiguration.MONGO_USERNAME
    # PROXY_MONGODB_DB = 'vps_management'
    # PROXY_MONGODB_COLL = 'service'
...