Конвейер элементов (item pipeline) Scrapy выдаёт ошибку TypeError: 'NoneType' object is not subscriptable (объект 'NoneType' не поддерживает обращение по индексу) - PullRequest
0 голосов
/ 26 мая 2020

Я знаю, что мои селекторы работают, поскольку я тестировал их в оболочке Scrapy (scrapy shell). Однако, когда я ставлю в отладчике точку останова на первой строке кода конвейера, item, переданный в функцию process_item, равен None, хотя я уверен, что паук получает правильные значения. Я не могу понять, что происходит и почему.

Ниже трассировка, которую я получаю:

Traceback (most recent call last):
  File "/.local/share/virtualenvs/scrapers-scrapy-JOa2pjsw/lib/python3.7/site-packages/twisted/internet/defer.py", line 654, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/.local/share/virtualenvs/scrapers-scrapy-JOa2pjsw/lib/python3.7/site-packages/scrapy/utils/defer.py", line 154, in f
    return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
  File "/Documents/scrapers-scrapy/scrapers/scrapers/pipelines.py", line 260, in process_item
    review_dict['car_brand_model'] = item['car_brand_model']
TypeError: 'NoneType' object is not subscriptable

Итак, ниже мой паук, элемент (item) и конвейер (pipeline):

Паук

class EdmundsSpider(Spider):
    """Scrape consumer reviews from an Edmunds review listing.

    The first page to crawl is supplied through the ``start_url`` spider
    argument.  Every review found on a page is loaded into an
    ``EdmundsReviewItem`` and yielded; every pagination link is then
    followed and parsed by the same callback.

    CSS selectors are looked up by field name in ``edmunds_review_selectors``,
    which is defined outside this class.
    """

    name = "edmunds"
    allowed_domains = ["edmunds.com"]

    custom_settings = {
        'ITEM_PIPELINES': {
            'scrapers.pipelines.DuplicatesPipeline': 100,
            'scrapers.pipelines.EdmundsReviewsPipeline':200,
        },
        # Politeness delay between requests.  This replaces the blocking
        # time.sleep() calls that used to live in parse(): sleeping inside a
        # callback stalls the whole Twisted reactor (downloads, pipelines,
        # everything) and does not actually space the requests out, because
        # they are only scheduled after the callback yields them.
        # With RANDOMIZE_DOWNLOAD_DELAY the effective wait is a random value
        # between 0.5x and 1.5x DOWNLOAD_DELAY.
        'DOWNLOAD_DELAY': 15,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    # Fields read from each review node with the selector registered under
    # the same name, in the order they are added to the loader.
    _PER_REVIEW_CSS_FIELDS = (
        'review_title',
        'review_rating',
        'review_date',
        'reviewer_username',
        'reviewed_trim',
        'helpful_votes',
        'review_text',
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider and seed ``start_urls`` from ``start_url``.

        NOTE: when the ``start_url`` argument is not supplied,
        ``kwargs.get`` returns ``None`` and ``start_urls`` becomes
        ``[None]`` -- callers are expected to always pass it.
        """
        super().__init__(*args, **kwargs)

        starting_url = kwargs.get('start_url')
        self.start_urls = [starting_url]

    def parse(self, response):
        """Yield one loaded item per review, then follow pagination links.

        Args:
            response: the downloaded page handed over by Scrapy.

        Yields:
            The loaded review items, followed by one request per pagination
            link, each calling back into this method.
        """
        # Logging user agent just to make sure that user agents are rotating
        user_agent = response.request.headers['User-Agent']
        self.logger.debug(f"User Agent is {user_agent}")

        # Review nodes; each one becomes the loader's selector below.
        reviews = response.css(edmunds_review_selectors['reviews'])

        # Page-level values shared by every review on the page.  The raw
        # selector results are passed on as-is; extracting the text is left
        # to the item's input processors.
        car_brand_model = response.css(edmunds_review_selectors['car_brand_model'])
        average_user_rating = response.css(edmunds_review_selectors['average_user_rating'])

        # enumerate() gives the position directly instead of re-scanning the
        # list with reviews.index(review) on every iteration.
        for position, review in enumerate(reviews):
            self.logger.info(f"Processing review {position}...")
            loader = ItemLoader(item=EdmundsReviewItem(), selector=review)
            loader.add_css(field_name='review_id',
                           css=edmunds_review_selectors['review_id'])
            loader.add_value(field_name='car_brand_model',
                             value=car_brand_model)
            loader.add_value(field_name='average_user_rating',
                             value=average_user_rating)
            for field in self._PER_REVIEW_CSS_FIELDS:
                loader.add_css(field_name=field,
                               css=edmunds_review_selectors[field])
            yield loader.load_item()

        for link in response.css("div .ui_pagination a"):
            yield response.follow(link, callback=self.parse)

Элемент

def _edmunds_second_dash_segment(raw):
    """Return the piece of *raw* between its first and second ``-``."""
    return raw.split("-")[1]


def _edmunds_selector_get(selector):
    """Return the result of calling ``get()`` on *selector*."""
    return selector.get()


def _edmunds_to_float(text):
    """Convert *text* to ``float``."""
    return float(text)


def _edmunds_first_space_token(text):
    """Return everything in *text* before its first space."""
    return text.split(" ")[0]


class EdmundsReviewItem(Item):
    """Fields collected for a single Edmunds review.

    Each field declares how the values gathered by the item loader are
    transformed (``input_processor``) and which of them is kept
    (``output_processor``).
    """

    # Taken from a "-"-separated value: the second segment is kept.
    review_id = Field(
        input_processor=MapCompose(_edmunds_second_dash_segment),
        output_processor=TakeFirst()
    )
    # Receives selector objects; the text is pulled out with get().
    car_brand_model = Field(
        input_processor=MapCompose(_edmunds_selector_get),
        output_processor=TakeFirst()
    )
    # Receives selector objects; the extracted text is converted to float.
    average_user_rating = Field(
        input_processor=MapCompose(_edmunds_selector_get, _edmunds_to_float),
        output_processor=TakeFirst()
    )
    review_title = Field(
        output_processor=TakeFirst()
    )
    # Only the part before the first space is kept.
    review_rating = Field(
        input_processor=MapCompose(_edmunds_first_space_token),
        output_processor=TakeFirst()
    )
    # TakeThird is defined elsewhere in the project.
    review_date = Field(
        output_processor=TakeThird()
    )
    reviewer_username = Field(
        output_processor=TakeFirst()
    )
    # TakeFourth is defined elsewhere in the project.
    reviewed_trim = Field(
        output_processor=TakeFourth()
    )
    helpful_votes = Field(
        output_processor=TakeFirst()
    )
    review_text = Field(
        output_processor=TakeFirst()
    )

Конвейер

class EdmundsReviewsPipeline(object):
    """Item pipeline that stores each review in the ``edmunds`` collection.

    The connection comes from ``db_connect()`` (defined elsewhere); the
    ``scrapers-tests`` database and its ``edmunds`` collection are looked up
    on it by subscription.
    """

    # Item fields copied into the stored document, in insertion order.
    _STORED_FIELDS = (
        'review_id',
        'car_brand_model',
        'average_user_rating',
        'review_title',
        'review_rating',
        'review_date',
        'reviewer_username',
        'reviewed_trim',
        'helpful_votes',
        'review_text',
    )

    def __init__(self):
        """Open the connection and keep handles to the database and collection."""
        connection = db_connect()
        database = connection['scrapers-tests']

        self.client = connection
        self.db = database
        self.collection = database['edmunds']

    def process_item(self, item, spider):
        """Copy the review fields of *item* into a dict and insert it.

        Args:
            item: mapping-like object holding every field listed in
                ``_STORED_FIELDS``.
            spider: the spider that produced the item (unused).

        Returns:
            The same *item*, unchanged, so that later pipelines receive it.

        Raises:
            KeyError: if *item* lacks one of the fields.
            TypeError: if *item* is ``None`` (for example when an earlier
                pipeline did not return the item).
        """
        document = {field: item[field] for field in self._STORED_FIELDS}
        self.collection.insert_one(document)
        return item

Решено. У меня был конвейер DuplicatesPipeline, который не был реализован и вызывался первым. Поскольку он не был реализован, он возвращал None, и следующий конвейер получал None вместо элемента. Я реализовал его, и проблема решена.

...