• 1000
У кого-нибудь есть опыт скрапинга Zillow? Или, может быть, у вас есть какие-нибудь предложения для меня?
Ниже мой код и мои настройки
scraper.py
class Zillow(scrapy.Spider):
    """Spider that collects link hrefs from Zillow rental search pages."""

    name = "zillow"
    # Bare domain name only. With a trailing slash ("zillow.com/") the
    # offsite filter can never match the request hostname, so requests to
    # www.zillow.com are treated as offsite.
    allowed_domains = ["zillow.com"]

    def start_requests(self):
        """Yield one request per hard-coded rental search page."""
        urls = [
            'https://www.zillow.com/homes/for_rent/90001_rb/',
            'https://www.zillow.com/homes/for_rent/90002_rb/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield one item per ``<ul>`` whose class contains 'photo-cards_wow'.

        The ``url`` value of each item is the list of all ``href``
        attributes found on ``<a>`` elements inside that ``<ul>``.
        """
        body = response.xpath(
            "//ul[contains(@class, 'photo-cards_wow')]"
        )
        for card in body:
            # NOTE: .extract() returns a list of strings, so `url` cannot be
            # passed to scrapy.Request directly; iterate it to follow links.
            url = card.css("a::attr(href)").extract()
            yield {
                'url': url,
            }
settings.py
# --- Request pacing and concurrency -----------------------------------------
CONCURRENT_REQUESTS = 4
CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_REQUESTS_PER_IP = 1
DOWNLOAD_DELAY = 35
DOWNLOAD_TIMEOUT = 25

# --- AutoThrottle -----------------------------------------------------------
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 15
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# --- Crawl behaviour --------------------------------------------------------
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = False
TELNETCONSOLE_ENABLED = False

# --- Downloader middlewares (listed by ascending order value) ---------------
DOWNLOADER_MIDDLEWARES = {
    "scrapy.downloadermiddlewares.retry.RetryMiddleware": 90,
    "zillow.smartproxy_auth.ProxyMiddleware": 100,
    "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 110,
    "scrapy_fake_useragent.middleware.RandomUserAgentMiddleware": 400,
    "zillow.middlewares.ZillowDownloaderMiddleware": 543,
    # None disables the stock user-agent middleware; the random
    # user-agent middleware above is registered in its place.
    "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
}

# --- Headers sent with every request (key order kept as originally given) ---
DEFAULT_REQUEST_HEADERS = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/webp,*/*;q=0.8"
    ),
    "Accept-Language": "en",
    "accept-encoding": "gzip, deflate, br",
    "Referer": "https://zillow.com/",
    "cache-control": "max-age=0",
    "upgrade-insecure-requests": "1",
}

# --- HTTP cache (switched off; remaining values kept for completeness) ------
HTTPCACHE_ENABLED = False
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = "httpcache"
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"