I am trying to scrape a site with Scrapy. Sometimes my code works, but sometimes it doesn't. The reason it fails is that the URL gets redirected: for example, when I requested www.example.com/page=1, it was redirected to www.verify.example.com. It looks like the site moves me to a verification URL to check that I am human. I have tried using proxies together with many user agents, but it still doesn't work. I think the problem is in how I issue the request:
res = scrapy.Request(
    url=the_url,
    meta={
        "proxy": "https://" + proxy,
    },
    callback=self.next_page,
)
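When the request does go through, the bounce to the verification page can at least be detected in the callback and retried through a different proxy. A minimal sketch of what I mean (the "verify.example.com" check and the self.proxies attribute are assumptions for illustration, not part of my code):

import random

def next_page(self, response):
    # If we landed on the verification page, retry the original URL
    # through a different proxy instead of parsing.
    if "verify.example.com" in response.url:
        yield response.request.replace(
            meta={"proxy": "https://" + random.choice(self.proxies)},
            dont_filter=True,  # the URL was already seen once, skip the dupefilter
        )
        return
    # ... normal parsing of the real page would continue here ...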
I get the proxies from a free proxy list page. Here is the code:
# Imports needed at the top of the module:
import requests
from lxml.html import fromstring

def get_proxies(self, empty):
    url = 'https://free-proxy-list.net/'
    response = requests.get(url)
    parser = fromstring(response.text)
    proxies = set()
    for row in parser.xpath('//tbody/tr')[:100]:
        # Keep only the rows the table marks as HTTPS-capable
        if row.xpath('.//td[7][contains(text(),"yes")]'):
            # Grab the IP and its corresponding port
            proxy = ":".join([row.xpath('.//td[1]/text()')[0],
                              row.xpath('.//td[2]/text()')[0]])
            proxies.add(proxy)
    return list(proxies)
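This is roughly how the list gets wired into the spider. A sketch (the page range and URL pattern are illustrative, based on the example URL above):

import random

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        # Fetch the proxy pool once, then pick a random proxy per request.
        self.proxies = self.get_proxies(None)
        for page in range(1, 10):
            yield scrapy.Request(
                url=f"http://www.example.com/page={page}",
                meta={"proxy": "https://" + random.choice(self.proxies)},
                callback=self.next_page,
            )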
I found the get_proxies function on the internet. And in settings.py I have this:
# -*- coding: utf-8 -*-
# Scrapy settings for example project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'example'
SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'example (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
LOG_LEVEL = 'INFO'
LOG_ENABLED = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS = 4
CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_REQUESTS_PER_IP = 1
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# DEFAULT_REQUEST_HEADERS = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
#     'Cookie': 's_ViewType=10; _lxsdk_s=16eb7c37081-08b-750-06a%7C%7C200; _lxsdk_cuid=16eb8d09cf5c8-05ca75fa4b8f66-14291003-100200-16eb8d09cf5c8; _lxsdk=16eb8d09cf5c8-05ca75fa4b8f66-14291003-100200-16eb8d09cf5c8; _hc.v=8f832efa-3775-aeec-1189-37c6d815ecb0.1575058712',
#     'Cookie': 'ALF=1573490222; _T_WM=148616430989eb5a607186e5ccc1bd5a4754b18b318; SCF=AlnIW_LpGKigwbo6ysyWWwVvTa_owlI2qJO_J1CxkMPEGosrBTKlnFR2fkvt82OUSt8ALZaDaUYGDQ9K0TtOAK8.; SUB=_2A25wpnFhDeRhGeFN6loW9CrPyDiIHXVQaR8prDV6PUJbktBeLRahkW1NQEqapQmLSHVEFcFyeIkn1OBaYewoa96T; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFEx4XGa4r.ozPRKdw4_Wmx5JpX5K-hUgL.FoM0eKnNShB0e0B2dJLoIpRLxK-LB-BL1KBLxK-LBKBLB-zLxK-LB-BL1K5peo-t; SUHB=0TPVAlkzF8-YRN; SSOLoginState=1570898225; MLOGIN=1; _T_WL=1',
# }
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'example.middlewares.DianpingScrapeSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'example.middlewares.DianpingScrapeDownloaderMiddleware': 543,
    'example.middlewares.RotateUserAgentMiddleware': 543,
    # 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    # 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'example.pipelines.DianpingScrapePipeline': 300,
    # 'example.downloadermiddlewares.cookies.CookiesMiddleware': 700,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPERROR_ALLOWED_CODES = [403]
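Since the core problem is the redirect, I also wondered about stopping Scrapy from silently following it, so the spider can see the 3xx response and react. A sketch of a per-request opt-out (dont_redirect and handle_httpstatus_list are standard Request.meta keys):

yield scrapy.Request(
    url=the_url,
    meta={
        "proxy": "https://" + proxy,
        "dont_redirect": True,                 # surface the 3xx response itself
        "handle_httpstatus_list": [301, 302],  # let the callback inspect it
    },
    callback=self.next_page,
)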
And I added a new downloader middleware class to middlewares.py to rotate the user agent, like this:
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class RotateUserAgentMiddleware(UserAgentMiddleware):
    # The default user_agent_list covers Chrome, IE, Firefox, Mozilla,
    # Opera and Netscape; for more user agent strings, see
    # http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            # Assign instead of setdefault(): by the time this runs at
            # priority 543, Scrapy's built-in UserAgentMiddleware (priority
            # 400) has already set User-Agent from the USER_AGENT setting,
            # so setdefault() would never replace it.
            request.headers['User-Agent'] = ua
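For the rotation to stick regardless of that assignment detail, the stock middleware can also be switched off in settings so nothing else touches the header. A sketch (the 544 priority is an arbitrary choice to keep the two custom middlewares distinct):

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    'example.middlewares.DianpingScrapeDownloaderMiddleware': 543,
    'example.middlewares.RotateUserAgentMiddleware': 544,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}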
And I still get various errors:
twisted.internet.error.TCPTimedOutError: TCP connection timed out: 110: Connection timed out.
[scrapy.core.scraper] ERROR: Error downloading <GET http://www.example.com/>
twisted.web._newclient.ResponseNeverReceived: [<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion: Connection lost.>]
Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
Crawled 13 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
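The TCP timeouts and lost connections look like what dead free proxies typically produce, so it may be worth pre-checking each proxy before handing it to Scrapy. A rough sketch (is_alive is a hypothetical helper, and httpbin.org is used only as a reachable test endpoint, not part of my project):

import requests

def is_alive(proxy, timeout=5):
    # Return True if the proxy answers at all within the timeout.
    try:
        requests.get(
            "https://httpbin.org/ip",
            proxies={"http": "http://" + proxy,
                     "https": "http://" + proxy},
            timeout=timeout,
        )
        return True
    except requests.RequestException:
        return False

# e.g. inside the spider:
# self.proxies = [p for p in self.get_proxies(None) if is_alive(p)]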