Я пробую свои силы в каком-то очень простом просмотре веб-страниц.
Сначала я проверил все свои возвращения xpath в оболочке scrapy, но при запуске их в классе моего паука я получаю следующую ошибку.
TypeError: xpath() missing 1 required positional argument: 'query'
НЕКОТОРЫЙ КОД ДЛЯ ДЕМОНСТРАЦИИ
import scrapy
from scrapy import Request #may be irrelevant
from scrapy.http import TextResponse as response #may be irrelevant
class hotpads_spider(scrapy.Spider):
name = 'hotpad_listing'
base = ["https://www.hotpads.com"]
#Get all listings that are for rent by owner
start_urls = ["https://www.hotpads.com/portland-or/for-rent-by-owner?isListedByOwner=true&lat=45.5659&lon=-122.6507&z=11"]
def parse(self, response):
links = response.xpath('//div[@class="SeoFooterLinks-link Utils-text-overflow"]//a/@href').getall()
for link in links:
absolute_url = self.base[0] + link
#print(absolute_url) #unit testing
yield scrapy.Request(absolute_url,callback=self.parse_meta)
#get the next page of results
#next_page_url = response.xpath('//a[@class="Linker PagerItem"]/@href').extract()
#next_absolute_url = response.urljoin(next_page_url)
#yield scrapy.Request(next_absolute_url, callback = self.parse)
def parse_meta(self,resposne):
#median_rent = response.xpath('//td[@class="AreaMarketInfo-table-data"]//text()').getall()[0]
#monthly_chng = response.xpath('//td[@class="AreaMarketInfo-table-data"]//text()').getall()[1]
location = response.css(" div.ExploreWrapper > div > div.Title.ExploreWrapper-title.Title-lg")[0].css("*::text").get()
#location = response.xpath('//div[@class="Title ExploreWrapper-title Title-lg"]/text()').get() ## this is just the xpath version of the previous variable for testing
yield {
'location':location
#'median_rent' : median_rent ,
#'monthly_chng':monthly_chng
}
Теперь я закомментировал достаточное количество кода, поскольку эта ошибка сохраняется во всех переменных.
Вот что я попробовал.
scrapy shell "https://www.hotpads.com/portland-or/for-rent-by-owner?isListedByOwner=true&lat=45.5659&lon=-122.6507&z=11"
response.css(" div.ExploreWrapper > div > div.Title.ExploreWrapper-title.Title-lg")[0].css("*::text").get()
>>'Portland snapshot'
Редактировать:
Вот усеченный вывод из моего сканера.
2019-04-29 12:04:29 [scrapy.core.engine] INFO: Spider opened
2019-04-29 12:04:29 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-04-29 12:04:29 [scrapy.extensions.telnet] INFO: Telnet console listening on
2019-04-29 12:04:29 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://hotpads.com/portland-or/for-rent-by-owner?isListedByOwner=true&lat=45.5659&lon=-122.6507&z=11> from <GET https://www.hotpads.com/portland-or/for-rent-by-owner?isListedByOwner=true&lat=45.5659&lon=-122.6507&z=11>
2019-04-29 12:04:34 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://hotpads.com/portland-or/for-rent-by-owner?isListedByOwner=true&lat=45.5659&lon=-122.6507&z=11> (referer: None)
2019-04-29 12:04:35 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://hotpads.com/97203/for-rent-by-owner> from <GET https://www.hotpads.com/97203/for-rent-by-owner>
2019-04-29 12:04:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://hotpads.com/97203/for-rent-by-owner> (referer: None)
2019-04-29 12:04:39 [scrapy.core.scraper] ERROR: Spider error processing <GET https://hotpads.com/97203/for-rent-by-owner> (referer: None)
Traceback (most recent call last):
File "c:\users\zdwhi\appdata\local\programs\python\python37\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "c:\users\zdwhi\appdata\local\programs\python\python37\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "c:\users\zdwhi\appdata\local\programs\python\python37\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "c:\users\zdwhi\appdata\local\programs\python\python37\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\users\zdwhi\appdata\local\programs\python\python37\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\zdwhi\appdata\local\programs\python\python37\thinkful projects\Scraping Exercise\craigslist\craigslist\spiders\hotpad_listing.py", line 33, in parse_meta
location = response.xpath('//div[@class="Title ExploreWrapper-title Title-lg"]/text()').get()
TypeError: xpath() missing 1 required positional argument: 'query'