Я должен включить опцию dont_filter=True
в каждый запрос моего паука, я уже использовал эту опцию, но я не знаю, почему в этот раз я получаю эту ошибку:
Unhandled Error
Traceback (most recent call last):
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\scrapy\commands\crawl.py", line 58, in run
self.crawler_process.start()
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\scrapy\crawler.py", line 293, in start
reactor.run(installSignalHandlers=False) # blocking call
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\twisted\internet\base.py", line 1283, in run
self.mainLoop()
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\twisted\internet\base.py", line 1292, in mainLoop
self.runUntilCurrent()
--- <exception caught here> ---
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\twisted\internet\base.py", line 913, in runUntilCurrent
call.func(*call.args, **call.kw)
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\scrapy\utils\reactor.py", line 41, in __call__
return self._func(*self._a, **self._kw)
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\scrapy\core\engine.py", line 135, in _next_request
self.crawl(request, spider)
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\scrapy\core\engine.py", line 210, in crawl
self.schedule(request, spider)
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\scrapy\core\engine.py", line 216, in schedule
if not self.slot.scheduler.enqueue_request(request):
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\scrapy\core\scheduler.py", line 54, in enqueue_request
if not request.dont_filter and self.df.request_seen(request):
builtins.AttributeError: 'dict' object has no attribute 'dont_filter'
Вот мой паук (извините, он довольно большой):
class communes_spider(scrapy.Spider):
name = "corrections"
firstSearchDate = datetime(2019, 8, 4)
crawlDate = firstSearchDate - timedelta(days=31)
path = 'D:/Données/Drivy/'
JSON = []
custom_settings = {
'ROBOTSTXT_OBEY' : True,
'DOWNLOAD_DELAY' : 6,
'CONCURRENT_REQUESTS' : 1,
'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
'AUTOTHROTTLE_ENABLED' : True,
'AUTOTHROTTLE_START_DELAY' : 6,
'LOG_STDOUT' : True,
'LOG_FILE' : 'D:/Données/Drivy/' + str(datetime.date(firstSearchDate)) + '_bis' '/' + 'log_' + str(datetime.date(crawlDate)) + '_' + str(datetime.date(firstSearchDate)) + '.txt',
'FEED_FORMAT': 'json',
'FEED_URI': 'file:///D:/Données/Drivy/' + str(datetime.date(firstSearchDate)) + '_bis' + '/' + str(datetime.date(crawlDate)) + '_' + str(datetime.date(firstSearchDate)) + '.json',
}
start_urls = "https://fr.be.getaround.com/car_models/estimated_earnings?utf8=%E2%9C%93&car_model_estimation%5Bcar_brand_id%5D={}&car_model_estimation%5Bcar_model_id%5D={}&car_model_estimation%5Brelease_year%5D={}&car_model_estimation%5Bmileage%5D={}&car_model_estimation%5Blatitude%5D={}&car_model_estimation%5Blongitude%5D={}&car_model_estimation%5Bregistration_country%5D=BE&car_model_estimation%5Bwith_open_landing_multiplier%5D={}"
def start_requests(self):
with open('C:/Users/coppe/drivy/carBrands.json') as json_file:
brands = json.load(json_file)
with open(self.path + str(datetime.date(self.firstSearchDate)) + '/' + str(datetime.date(self.crawlDate)) + '_' + str(datetime.date(self.firstSearchDate)) + '.json') as json_file:
cars = json.load(json_file)
for car in cars:
if car['carBrand'] == "Citroën":
car['carBrand'] = car['carBrand'].replace('ë','e')
if car['carBrandID'] == 'other' or car['carModelID'] == 'other':
for brand in brands:
if car['carBrand'].lower() == brand['brandName'].lower():
car['carBrandID'] = brand['brandID']
for model in brand['models']:
if car['carModel'].lower() == model['modelNameFrench'].lower() or car['carModel'].lower() == model['modelNameDutch'].lower():
car['carModelID'] = model['modelID']
else:
pass
else:
pass
if car['mileageCode']=='6':
url = self.start_urls.format(car['carBrandID'],car['carModelID'],car['immatricYear'],5,car['carLat'],car['carLong'],car['open'])
else:
url = self.start_urls.format(car['carBrandID'],car['carModelID'],car['immatricYear'],car['mileageCode'],car['carLat'],car['carLong'],car['open'])
yield scrapy.Request(
url=url,
callback=self.parse_sugPrice,
meta={'car':car},
dont_filter=True,
)
elif datetime.date(datetime.strptime(car['crawlDate'],'%Y-%m-%d')).year == 2020:
if car['mileageCode']=='6':
url = self.start_urls.format(car['carBrandID'],car['carModelID'],car['immatricYear'],5,car['carLat'],car['carLong'],car['open'])
else:
url = self.start_urls.format(car['carBrandID'],car['carModelID'],car['immatricYear'],car['mileageCode'],car['carLat'],car['carLong'],car['open'])
yield scrapy.Request(
url=url,
callback=self.parse_sugPrice,
meta={'car':car},
dont_filter=True,
)
else:
yield car
def parse_sugPrice(self, response):
data = json.loads(response.text, encoding="utf8")
selector = Selector(data['html'])
eligibleObj = json.loads(selector.css('a::attr(data-estimated-earnings)').get())
openEligible = response.meta['car']['openEligible']
if response.meta['car']['carBrandID'] == 'other' or response.meta['car']['carModelID'] == 'other':
response.meta['car']['suggestedPrice'] = -1 # No estimation available
else:
if response.meta['car']['open'] == False and openEligible == True:
estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
correctedEstimEarnings = estimEarnings/1.25
response.meta['car']['suggestedPrice'] = correctedEstimEarnings/20 # Suggested price based current earnings condition on open decision (open is true or false)
elif response.meta['car']['open'] == True and openEligible == False:
estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
correctedEstimEarnings = estimEarnings*1.25
response.meta['car']['suggestedPrice'] = correctedEstimEarnings/15 # Suggested price based current earnings condition on open decision (open is true or false)
elif response.meta['car']['open'] == True and openEligible == True:
estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
response.meta['car']['suggestedPrice'] = estimEarnings/20 # Suggested price based current earnings condition on open decision (open is true or false)
else:
estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
response.meta['car']['suggestedPrice'] = estimEarnings/15 # Suggested price based current earnings condition on open decision (open is true or false)
if response.meta['car']['numEvalCar'] > 0:
if response.meta['car']['firstReviewYear'] != datetime.now().year:
estimMembership = datetime.now().year - response.meta['car']['firstReviewYear'] # in years
correctedYear = response.meta['car']['immatricYear'] + estimMembership
if response.meta['car']['mileageCode']=='6':
suggestedPriceLink = "https://fr.be.getaround.com/car_models/estimated_earnings?utf8=%E2%9C%93&car_model_estimation%5Bcar_brand_id%5D={}&car_model_estimation%5Bcar_model_id%5D={}&car_model_estimation%5Brelease_year%5D={}&car_model_estimation%5Bmileage%5D={}&car_model_estimation%5Blatitude%5D={}&car_model_estimation%5Blongitude%5D={}&car_model_estimation%5Bregistration_country%5D=BE&car_model_estimation%5Bwith_open_landing_multiplier%5D={}".format(response.meta['car']['carBrandID'],response.meta['car']['carModelID'],correctedYear,'5',response.meta['car']['carLat'],response.meta['car']['carLong'],response.meta['car']['open'])
else:
suggestedPriceLink = "https://fr.be.getaround.com/car_models/estimated_earnings?utf8=%E2%9C%93&car_model_estimation%5Bcar_brand_id%5D={}&car_model_estimation%5Bcar_model_id%5D={}&car_model_estimation%5Brelease_year%5D={}&car_model_estimation%5Bmileage%5D={}&car_model_estimation%5Blatitude%5D={}&car_model_estimation%5Blongitude%5D={}&car_model_estimation%5Bregistration_country%5D=BE&car_model_estimation%5Bwith_open_landing_multiplier%5D={}".format(response.meta['car']['carBrandID'],response.meta['car']['carModelID'],correctedYear,response.meta['car']['mileageCode'],response.meta['car']['carLat'],response.meta['car']['carLong'],response.meta['car']['open'])
yield scrapy.Request(
url=suggestedPriceLink,
callback=self.parse_correctSugPrice,
meta={'car':response.meta['car']},
dont_filter=True,
)
else:
yield response.meta['car']
else:
yield response.meta['car']
def parse_correctSugPrice(self, response):
data = json.loads(response.text, encoding="utf8")
selector = Selector(data['html'])
eligibleObj = json.loads(selector.css('a::attr(data-estimated-earnings)').get())
openEligible = response.meta['car']['openEligible']
if response.meta['car']['carBrandID'] == 'other' or response.meta['car']['carModelID'] == 'other':
response.meta['car']['correctSuggestedPrice'] = -1 # No estimation available
else:
if response.meta['car']['open'] == False and openEligible == True:
estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
correctedEstimEarnings = estimEarnings/1.25
response.meta['car']['correctSuggestedPrice'] = correctedEstimEarnings/20 # Suggested price based corrected earnings condition on open decision (open is true or false) and that this decision was the same at firstReviewYear.
elif response.meta['car']['open'] == True and openEligible == False:
estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
correctedEstimEarnings = estimEarnings*1.25
response.meta['car']['correctSuggestedPrice'] = correctedEstimEarnings/15 # Suggested price based corrected earnings condition on open decision (open is true or false) and that this decision was the same at firstReviewYear.
elif response.meta['car']['open'] == True and openEligible == True:
estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
response.meta['car']['correctSuggestedPrice'] = estimEarnings/20 # Suggested price based corrected earnings condition on open decision (open is true or false) and that this decision was the same at firstReviewYear.
else:
estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
response.meta['car']['correctSuggestedPrice'] = estimEarnings/15 # Suggested price based corrected earnings condition on open decision (open is true or false) and that this decision was the same at firstReviewYear.
yield response.meta['car']
Я что-то пропустил?