Я написал паука, у которого есть проблемы по той или иной причине.
Я новичок в очистке, но, судя по журналу, кажется, что страница успешно загружается?Я проверил свои селекторы XPath в браузере, и они работают нормально.Я посмотрел на файл craigslist.org/robots.txt
, и он явно не запрещает то, что я делаю.
Кто-нибудь знает, что происходит?
Может быть, это как-то связано сстрока user-agent?Является ли другая версия страницы обслуживаемой пауком?
Паук
import scrapy
class RentalsCrawler(scrapy.Spider):
name = "rentals"
allowed_domains = [
'craigslist.org'
]
custom_settings = {
'DOWNLOAD_DELAY': 2,
'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
}
handle_httpstatus_list = [404]
def start_requests(self):
start = 0
nopgs = 1
pages = []
for i in range(0, nopgs):
i = i * 120 + start
pages.append('https://vancouver.craigslist.ca/search/apa?s=' + str(i))
for page in pages:
yield scrapy.Request(url=page, callback=self.parse)
def parse(self, response):
prc_path = '//span[@class="result-meta"]/span[@class="result-price"]/text()'
sqf_path = '//span[@class="result-meta"]/span[@class="housing"]/text()'
loc_path = '//span[@class="result-meta"]/span[@class="result-hood"]/text()'
prc_resp = response.xpath(prc_path).extract_first()
sqf_resp = response.xpath(sqf_path).extract_first()
loc_resp = response.xpath(loc_path).extract_first()
objct = { 'prc': prc_resp }
if sqf_resp:
objct['sqf'] = sqf_resp
if loc_resp:
objct['loc'] = loc_resp
yield objct
Журнал
(base) C:\Users\Anthony\tutorial\tutorial\spiders>scrapy runspider rentals.py -o rentals.json
0
2018-06-07 15:58:23 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: tutorial)
2018-06-07 15:58:23 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 17.5.0, Python 3.6.5 |Anaconda, Inc.| (default, Mar 29 2018, 13:32:41) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2o 27 Mar 2018), cryptography 2.2.2, Platform Windows-10-10.0.17134-SP0
2018-06-07 15:58:23 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'tutorial', 'FEED_FORMAT': 'json', 'FEED_URI': 'rentals.json', 'NEWSPIDER_MODULE': 'tutorial.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_LOADER_WARN_ONLY': True, 'SPIDER_MODULES': ['tutorial.spiders']}
2018-06-07 15:58:23 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.feedexport.FeedExporter',
'scrapy.extensions.logstats.LogStats']
2018-06-07 15:58:23 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2018-06-07 15:58:23 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2018-06-07 15:58:23 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2018-06-07 15:58:23 [scrapy.core.engine] INFO: Spider opened
2018-06-07 15:58:23 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-06-07 15:58:23 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
0
2018-06-07 15:58:23 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://vancouver.craigslist.ca/robots.txt> (referer: None)
2018-06-07 15:58:24 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://vancouver.craigslist.ca/search/apa?s=0> (referer: None)
2018-06-07 15:58:24 [scrapy.core.engine] INFO: Closing spider (finished)
2018-06-07 15:58:24 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 468,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 36594,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2018, 6, 7, 22, 58, 24, 237666),
'log_count/DEBUG': 3,
'log_count/INFO': 7,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2018, 6, 7, 22, 58, 23, 792075)}
2018-06-07 15:58:24 [scrapy.core.engine] INFO: Spider closed (finished)
Вывод
Пустой json
файл.
Scrapy.cfg
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = tutorial.settings
[deploy]
#url = http://localhost:6800/
project = tutorial
настроек.py
# -*- coding: utf-8 -*-
# Scrapy settings for tutorial project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'tutorial'
SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tutorial.middlewares.TutorialSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tutorial.middlewares.TutorialDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'tutorial.pipelines.TutorialPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Log (с `yield objct)
(base) C:\Users\Anthony\tutorial\tutorial\spiders>scrapy runspider rentals.py -o rentals.json
2018-06-07 17:33:16 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: tutorial)
2018-06-07 17:33:16 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 17.5.0, Python 3.6.5 |Anaconda, Inc.| (default, Mar 29 2018, 13:32:41) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2o 27 Mar 2018), cryptography 2.2.2, Platform Windows-10-10.0.17134-SP0
2018-06-07 17:33:16 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'tutorial', 'FEED_FORMAT': 'json', 'FEED_URI': 'rentals.json', 'NEWSPIDER_MODULE': 'tutorial.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_LOADER_WARN_ONLY': True, 'SPIDER_MODULES': ['tutorial.spiders']}
2018-06-07 17:33:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.feedexport.FeedExporter',
'scrapy.extensions.logstats.LogStats']
2018-06-07 17:33:16 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2018-06-07 17:33:16 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2018-06-07 17:33:16 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2018-06-07 17:33:16 [scrapy.core.engine] INFO: Spider opened
2018-06-07 17:33:16 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-06-07 17:33:16 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2018-06-07 17:33:16 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://vancouver.craigslist.ca/robots.txt> (referer: None)
2018-06-07 17:33:16 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://vancouver.craigslist.ca/search/apa?s=0> (referer: None)
2018-06-07 17:33:17 [scrapy.core.scraper] DEBUG: Scraped from <200 https://vancouver.craigslist.ca/search/apa?s=0>
{'prc': '$2400', 'sqf': '\n 1br -\n 895ft', 'loc': ' (North Vancouver)'}
2018-06-07 17:33:17 [scrapy.core.engine] INFO: Closing spider (finished)
2018-06-07 17:33:17 [scrapy.extensions.feedexport] INFO: Stored json feed (1 items) in: rentals.json
2018-06-07 17:33:17 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 468,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 37724,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2018, 6, 8, 0, 33, 17, 36724),
'item_scraped_count': 1,
'log_count/DEBUG': 4,
'log_count/INFO': 8,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2018, 6, 8, 0, 33, 16, 533959)}
2018-06-07 17:33:17 [scrapy.core.engine] INFO: Spider closed (finished)
ЗАКЛЮЧЕНИЕ
Я наконецнаписал код, который выводит то, что я ожидал.К сожалению, когда я начал работать с XPath, скрипт объединял все цены в одном списке, квадратные метры в другом списке и местоположения в другом списке.Я предпочитаю XPath, и я уверен, что есть способ сохранить XPath, но разделять каждый листинг в диктовке.
import scrapy
class RentalsCrawler(scrapy.Spider):
name = "rentals"
allowed_domains = [
'craigslist.org',
'kajiji.ca'
]
custom_settings = {
'DOWNLOAD_DELAY': 2,
'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
}
handle_httpstatus_list = [404]
def start_requests(self):
start = 0
nopgs = 1
pages = []
for i in range(0, nopgs):
i = i * 120 + start
pages.append('https://vancouver.craigslist.ca/search/apa?s=' + str(i))
for page in pages:
yield scrapy.Request(url=page, callback=self.parse)
def parse(self, response):
for li in response.css('ul.rows li p span.result-meta'):
prc = li.css('span.result-price::text').extract_first()
sqf = li.css('span.housing::text').extract_first()
loc = li.css('result-hood::text').extract_first()
objct = { 'prc': prc }
if sqf:
objct['sqf'] = sqf
if loc:
objct['loc'] = loc
yield objct