Constantly blocked by a captcha when requesting the Zillow website

I keep hitting a captcha on every request to Zillow. Does anyone have experience scraping Zillow, or perhaps some suggestions for me?

Below are my code and my settings.

scraper.py

import scrapy


class Zillow(scrapy.Spider):
    name = "zillow"
    # Domains only, no trailing slash, or OffsiteMiddleware won't match
    allowed_domains = ["zillow.com"]

    def start_requests(self):
        urls = [
            'https://www.zillow.com/homes/for_rent/90001_rb/',
            'https://www.zillow.com/homes/for_rent/90002_rb/',
        ]

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        body = response.xpath(
            "//ul[contains(@class, 'photo-cards_wow')]"
        )
        for card in body:
            # extract() returns a list of hrefs, so yield each one
            for url in card.css("a::attr(href)").extract():
                #yield scrapy.Request(url=url)
                yield {
                    'url': url,
                }
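
For reference, this is roughly how I confirm that the response really is the captcha page rather than an empty one (a rough sketch: the b"captcha" marker and the status codes are my assumptions about what the block looks like):

# Let blocked statuses reach parse instead of being dropped by
# HttpErrorMiddleware (goes on the spider class next to allowed_domains).
handle_httpstatus_list = [403, 429]

def parse(self, response):
    # Detect the captcha/block page before trying to parse listings.
    if response.status in (403, 429) or b"captcha" in response.body.lower():
        self.logger.warning("Blocked with captcha on %s", response.url)
        return
    # ... normal parsing continues here ...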

settings.py

DOWNLOAD_DELAY = 35
DOWNLOAD_TIMEOUT = 25
ROBOTSTXT_OBEY = False

CONCURRENT_REQUESTS = 4
CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_REQUESTS_PER_IP = 1
COOKIES_ENABLED = False
TELNETCONSOLE_ENABLED = False

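# Lower order values run their process_request() earlier (closer to the
# engine); setting a built-in middleware to None disables it.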
DOWNLOADER_MIDDLEWARES = {
    'zillow.middlewares.ZillowDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
    'zillow.smartproxy_auth.ProxyMiddleware': 100,
}

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://zillow.com/',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
}

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 15
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

HTTPCACHE_ENABLED = False
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
...
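
In case it matters, zillow.smartproxy_auth.ProxyMiddleware just follows the usual pattern of setting request.meta['proxy'] on each request. A minimal sketch of that pattern (the gateway endpoint and the credentials below are placeholders, not my real values):

# smartproxy_auth.py -- minimal sketch of the proxy middleware wired up in
# DOWNLOADER_MIDDLEWARES above; endpoint and credentials are placeholders.
from w3lib.http import basic_auth_header

class ProxyMiddleware:
    def process_request(self, request, spider):
        # Route every outgoing request through the rotating proxy gateway.
        request.meta['proxy'] = 'http://gate.smartproxy.com:7000'
        request.headers['Proxy-Authorization'] = basic_auth_header(
            'PROXY_USER', 'PROXY_PASS')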