Scrapy - проблемы с опцией dont_filter для запросов - PullRequest
0 голосов
/ 31 января 2020

Я должен включить опцию dont_filter=True в каждый запрос моего паука, я уже использовал эту опцию, но я не знаю, почему в этот раз я получаю эту ошибку:

Unhandled Error
Traceback (most recent call last):
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\scrapy\commands\crawl.py", line 58, in run
self.crawler_process.start()
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\scrapy\crawler.py", line 293, in start
reactor.run(installSignalHandlers=False)  # blocking call
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\twisted\internet\base.py", line 1283, in run
self.mainLoop()
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\twisted\internet\base.py", line 1292, in mainLoop
self.runUntilCurrent()
--- <exception caught here> ---
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\twisted\internet\base.py", line 913, in runUntilCurrent
call.func(*call.args, **call.kw)
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\scrapy\utils\reactor.py", line 41, in __call__
return self._func(*self._a, **self._kw)
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\scrapy\core\engine.py", line 135, in _next_request
self.crawl(request, spider)
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\scrapy\core\engine.py", line 210, in crawl
self.schedule(request, spider)
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\scrapy\core\engine.py", line 216, in schedule
if not self.slot.scheduler.enqueue_request(request):
File "C:\Users\coppe\Anaconda3\envs\scrapyEnv\lib\site-packages\scrapy\core\scheduler.py", line 54, in enqueue_request
if not request.dont_filter and self.df.request_seen(request):
builtins.AttributeError: 'dict' object has no attribute 'dont_filter'

Вот мой паук (извините, он довольно большой):

class communes_spider(scrapy.Spider):
name = "corrections"

firstSearchDate = datetime(2019, 8, 4)

crawlDate = firstSearchDate - timedelta(days=31)

path = 'D:/Données/Drivy/'

JSON = []

custom_settings = {
    'ROBOTSTXT_OBEY' : True,
    'DOWNLOAD_DELAY' : 6,
    'CONCURRENT_REQUESTS' : 1,
    'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
    'AUTOTHROTTLE_ENABLED' : True,
    'AUTOTHROTTLE_START_DELAY' : 6,
    'LOG_STDOUT' : True,
    'LOG_FILE' : 'D:/Données/Drivy/' + str(datetime.date(firstSearchDate)) + '_bis' '/' + 'log_' + str(datetime.date(crawlDate)) + '_' + str(datetime.date(firstSearchDate)) + '.txt',
    'FEED_FORMAT': 'json',
    'FEED_URI': 'file:///D:/Données/Drivy/' + str(datetime.date(firstSearchDate)) + '_bis' + '/' + str(datetime.date(crawlDate)) + '_' + str(datetime.date(firstSearchDate)) + '.json',
}


start_urls = "https://fr.be.getaround.com/car_models/estimated_earnings?utf8=%E2%9C%93&car_model_estimation%5Bcar_brand_id%5D={}&car_model_estimation%5Bcar_model_id%5D={}&car_model_estimation%5Brelease_year%5D={}&car_model_estimation%5Bmileage%5D={}&car_model_estimation%5Blatitude%5D={}&car_model_estimation%5Blongitude%5D={}&car_model_estimation%5Bregistration_country%5D=BE&car_model_estimation%5Bwith_open_landing_multiplier%5D={}"

def start_requests(self):
    with open('C:/Users/coppe/drivy/carBrands.json') as json_file:  
        brands = json.load(json_file)
    with open(self.path + str(datetime.date(self.firstSearchDate)) + '/' + str(datetime.date(self.crawlDate)) + '_' + str(datetime.date(self.firstSearchDate)) + '.json') as json_file:  
        cars = json.load(json_file)
    for car in cars:
        if car['carBrand'] == "Citroën":
            car['carBrand'] = car['carBrand'].replace('ë','e')
        if car['carBrandID'] == 'other' or car['carModelID'] == 'other':
            for brand in brands:
                if car['carBrand'].lower() == brand['brandName'].lower():
                    car['carBrandID'] = brand['brandID']
                    for model in brand['models']:
                        if car['carModel'].lower() == model['modelNameFrench'].lower() or car['carModel'].lower() == model['modelNameDutch'].lower():
                            car['carModelID'] = model['modelID']
                        else:
                            pass
                else:
                    pass   
            if car['mileageCode']=='6':
                url = self.start_urls.format(car['carBrandID'],car['carModelID'],car['immatricYear'],5,car['carLat'],car['carLong'],car['open'])
            else:
                url = self.start_urls.format(car['carBrandID'],car['carModelID'],car['immatricYear'],car['mileageCode'],car['carLat'],car['carLong'],car['open'])
            yield scrapy.Request(
                url=url,
                callback=self.parse_sugPrice,
                meta={'car':car},
                dont_filter=True,
            )
        elif datetime.date(datetime.strptime(car['crawlDate'],'%Y-%m-%d')).year == 2020:
            if car['mileageCode']=='6':
                url = self.start_urls.format(car['carBrandID'],car['carModelID'],car['immatricYear'],5,car['carLat'],car['carLong'],car['open'])
            else:
                url = self.start_urls.format(car['carBrandID'],car['carModelID'],car['immatricYear'],car['mileageCode'],car['carLat'],car['carLong'],car['open'])
            yield scrapy.Request(
                url=url,
                callback=self.parse_sugPrice,
                meta={'car':car},
                dont_filter=True,
            )
        else:
            yield car

def parse_sugPrice(self, response):
    data = json.loads(response.text, encoding="utf8")
    selector = Selector(data['html'])
    eligibleObj = json.loads(selector.css('a::attr(data-estimated-earnings)').get())
    openEligible = response.meta['car']['openEligible']

    if response.meta['car']['carBrandID'] == 'other' or response.meta['car']['carModelID'] == 'other':
        response.meta['car']['suggestedPrice'] = -1 # No estimation available
    else:
        if response.meta['car']['open'] == False and openEligible == True:
            estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
            correctedEstimEarnings = estimEarnings/1.25
            response.meta['car']['suggestedPrice'] = correctedEstimEarnings/20 # Suggested price based current earnings condition on open decision (open is true or false)
        elif response.meta['car']['open'] == True and openEligible == False:
            estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
            correctedEstimEarnings = estimEarnings*1.25
            response.meta['car']['suggestedPrice'] = correctedEstimEarnings/15 # Suggested price based current earnings condition on open decision (open is true or false)
        elif response.meta['car']['open'] == True and openEligible == True: 
            estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
            response.meta['car']['suggestedPrice'] = estimEarnings/20 # Suggested price based current earnings condition on open decision (open is true or false)
        else:
            estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
            response.meta['car']['suggestedPrice'] = estimEarnings/15 # Suggested price based current earnings condition on open decision (open is true or false)

    if response.meta['car']['numEvalCar'] > 0:
        if response.meta['car']['firstReviewYear'] != datetime.now().year:
            estimMembership = datetime.now().year - response.meta['car']['firstReviewYear'] # in years
            correctedYear = response.meta['car']['immatricYear'] + estimMembership
            if response.meta['car']['mileageCode']=='6':
                suggestedPriceLink = "https://fr.be.getaround.com/car_models/estimated_earnings?utf8=%E2%9C%93&car_model_estimation%5Bcar_brand_id%5D={}&car_model_estimation%5Bcar_model_id%5D={}&car_model_estimation%5Brelease_year%5D={}&car_model_estimation%5Bmileage%5D={}&car_model_estimation%5Blatitude%5D={}&car_model_estimation%5Blongitude%5D={}&car_model_estimation%5Bregistration_country%5D=BE&car_model_estimation%5Bwith_open_landing_multiplier%5D={}".format(response.meta['car']['carBrandID'],response.meta['car']['carModelID'],correctedYear,'5',response.meta['car']['carLat'],response.meta['car']['carLong'],response.meta['car']['open'])
            else:
                suggestedPriceLink = "https://fr.be.getaround.com/car_models/estimated_earnings?utf8=%E2%9C%93&car_model_estimation%5Bcar_brand_id%5D={}&car_model_estimation%5Bcar_model_id%5D={}&car_model_estimation%5Brelease_year%5D={}&car_model_estimation%5Bmileage%5D={}&car_model_estimation%5Blatitude%5D={}&car_model_estimation%5Blongitude%5D={}&car_model_estimation%5Bregistration_country%5D=BE&car_model_estimation%5Bwith_open_landing_multiplier%5D={}".format(response.meta['car']['carBrandID'],response.meta['car']['carModelID'],correctedYear,response.meta['car']['mileageCode'],response.meta['car']['carLat'],response.meta['car']['carLong'],response.meta['car']['open'])
            yield scrapy.Request(
                url=suggestedPriceLink,
                callback=self.parse_correctSugPrice,
                meta={'car':response.meta['car']},
                dont_filter=True,
            )
        else:
            yield response.meta['car']
    else:
        yield response.meta['car']

def parse_correctSugPrice(self, response):          
    data = json.loads(response.text, encoding="utf8")
    selector = Selector(data['html'])
    eligibleObj = json.loads(selector.css('a::attr(data-estimated-earnings)').get())
    openEligible = response.meta['car']['openEligible']

    if response.meta['car']['carBrandID'] == 'other' or response.meta['car']['carModelID'] == 'other':
        response.meta['car']['correctSuggestedPrice'] = -1 # No estimation available
    else:
        if response.meta['car']['open'] == False and openEligible == True:
            estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
            correctedEstimEarnings = estimEarnings/1.25
            response.meta['car']['correctSuggestedPrice'] = correctedEstimEarnings/20 # Suggested price based corrected earnings condition on open decision (open is true or false) and that this decision was the same at firstReviewYear.
        elif response.meta['car']['open'] == True and openEligible == False:
            estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
            correctedEstimEarnings = estimEarnings*1.25
            response.meta['car']['correctSuggestedPrice'] = correctedEstimEarnings/15 # Suggested price based corrected earnings condition on open decision (open is true or false) and that this decision was the same at firstReviewYear.
        elif response.meta['car']['open'] == True and openEligible == True: 
            estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
            response.meta['car']['correctSuggestedPrice'] = estimEarnings/20 # Suggested price based corrected earnings condition on open decision (open is true or false) and that this decision was the same at firstReviewYear.
        else:
            estimEarnings = int(re.sub("\D",'',selector.css('span.car_model_estimation_result_amount::text').get()))
            response.meta['car']['correctSuggestedPrice'] = estimEarnings/15 # Suggested price based corrected earnings condition on open decision (open is true or false) and that this decision was the same at firstReviewYear.

    yield response.meta['car']

Я что-то пропустил?

1 Ответ

0 голосов
/ 01 февраля 2020

dont_filter=True должно быть внутри scrapy.Request meta dict:
meta = {'dont_filter': True , 'car':...}

...