Я знаю, что мои селекторы работают, поскольку я тестировал их в оболочке Scrapy (scrapy shell). Однако, когда я ставлю в отладчике точку останова на первой строке кода конвейера (pipeline), аргумент item, переданный в функцию process_item, равен None, хотя я уверен, что паук получает правильные значения. Я не могу понять, почему так происходит.
Ниже приведена трассировка, которую я получаю:
Traceback (most recent call last):
File "/.local/share/virtualenvs/scrapers-scrapy-JOa2pjsw/lib/python3.7/site-packages/twisted/internet/defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/.local/share/virtualenvs/scrapers-scrapy-JOa2pjsw/lib/python3.7/site-packages/scrapy/utils/defer.py", line 154, in f
return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
File "/Documents/scrapers-scrapy/scrapers/scrapers/pipelines.py", line 260, in process_item
review_dict['car_brand_model'] = item['car_brand_model']
TypeError: 'NoneType' object is not subscriptable
Ниже приведены мой паук (spider), элемент (item) и конвейер (pipeline):
Паук
class EdmundsSpider(Spider):
    """Spider that scrapes user reviews from edmunds.com review pages.

    The first URL to crawl is supplied at run time through the
    ``start_url`` spider argument. Every review found on a page is loaded
    into an ``EdmundsReviewItem`` and yielded; pagination links are then
    followed back into :meth:`parse`.
    """

    name = "edmunds"
    allowed_domains = ["edmunds.com"]
    # Items pass through the pipelines in ascending priority order, so
    # DuplicatesPipeline (100) runs before EdmundsReviewsPipeline (200).
    custom_settings = {
        'ITEM_PIPELINES': {
            'scrapers.pipelines.DuplicatesPipeline': 100,
            'scrapers.pipelines.EdmundsReviewsPipeline': 200,
        }
    }

    def __init__(self, *args, **kwargs):
        """Store the ``start_url`` keyword argument as the only start URL."""
        super(EdmundsSpider, self).__init__(*args, **kwargs)
        starting_url = kwargs.get('start_url')
        self.start_urls = [starting_url]

    def parse(self, response):
        """Yield one item per review on the page, then follow pagination.

        Args:
            response: the downloaded page to extract reviews from.

        Yields:
            Loaded ``EdmundsReviewItem`` objects, followed by one request
            per pagination link (handled by this same callback).
        """
        # Logging user agent just to make sure that user agents are rotating
        user_agent = response.request.headers['User-Agent']
        self.logger.debug(f"User Agent is {user_agent}")

        # Getting the reviews from the response object.
        # We will iterate over each review and fill the item
        reviews = response.css(edmunds_review_selectors['reviews'])

        # Getting the car brand, model and rating from the response.
        # These are page-level values shared by every review on the page.
        car_brand_model = response.css(edmunds_review_selectors['car_brand_model'])
        average_user_rating = response.css(edmunds_review_selectors['average_user_rating'])

        # enumerate() supplies the position directly instead of searching
        # the selector list with reviews.index(review) on every iteration.
        for position, review in enumerate(reviews):
            self.logger.info(f"Processing review {position}...")
            loader = ItemLoader(item=EdmundsReviewItem(), selector=review)
            loader.add_css(field_name='review_id',
                           css=edmunds_review_selectors['review_id'])
            # The raw selectors are passed as values; the item's input
            # processors call .get() on them to extract the text.
            loader.add_value(field_name='car_brand_model',
                             value=car_brand_model)
            loader.add_value(field_name='average_user_rating',
                             value=average_user_rating)
            loader.add_css(field_name='review_title',
                           css=edmunds_review_selectors['review_title'])
            loader.add_css(field_name='review_rating',
                           css=edmunds_review_selectors['review_rating'])
            loader.add_css(field_name='review_date',
                           css=edmunds_review_selectors['review_date'])
            loader.add_css(field_name='reviewer_username',
                           css=edmunds_review_selectors['reviewer_username'])
            loader.add_css(field_name='reviewed_trim',
                           css=edmunds_review_selectors['reviewed_trim'])
            loader.add_css(field_name='helpful_votes',
                           css=edmunds_review_selectors['helpful_votes'])
            loader.add_css(field_name='review_text',
                           css=edmunds_review_selectors['review_text'])
            yield loader.load_item()
            # NOTE(review): presumably time.sleep -- a blocking sleep inside a
            # callback stalls the whole crawler, not just this spider.
            # Consider the DOWNLOAD_DELAY / AutoThrottle settings instead.
            sleep(2)

        sleep_time = randint(10, 20)
        self.logger.info(f"Sleeping for {sleep_time} seconds...")
        sleep(sleep_time)

        for a in response.css("div .ui_pagination a"):
            yield response.follow(a, callback=self.parse)
Элемент (item)
def _part_after_first_dash(raw):
    """Return the second chunk of *raw* when split on "-"."""
    return raw.split("-")[1]


def _selector_text(selector):
    """Return whatever the selector's ``get()`` call yields."""
    return selector.get()


def _leading_token(raw):
    """Return the first space-separated token of *raw*."""
    return raw.split(" ")[0]


class EdmundsReviewItem(Item):
    """Container for a single scraped review.

    Each field declares how collected values are cleaned (input processor)
    and which of the collected values is kept (output processor).
    """

    # Identifier taken from the part after the first dash of the raw value.
    review_id = Field(
        input_processor=MapCompose(_part_after_first_dash),
        output_processor=TakeFirst(),
    )

    # Page-level values: the spider hands over selectors, text is pulled here.
    car_brand_model = Field(
        input_processor=MapCompose(_selector_text),
        output_processor=TakeFirst(),
    )
    average_user_rating = Field(
        input_processor=MapCompose(_selector_text, float),
        output_processor=TakeFirst(),
    )

    review_title = Field(output_processor=TakeFirst())

    # Only the first space-separated token of the raw rating is kept.
    review_rating = Field(
        input_processor=MapCompose(_leading_token),
        output_processor=TakeFirst(),
    )

    # TakeThird / TakeFourth are defined outside this block; presumably they
    # pick the third / fourth collected value -- confirm against their source.
    review_date = Field(output_processor=TakeThird())
    reviewer_username = Field(output_processor=TakeFirst())
    reviewed_trim = Field(output_processor=TakeFourth())

    helpful_votes = Field(output_processor=TakeFirst())
    review_text = Field(output_processor=TakeFirst())
Конвейер (pipeline)
class EdmundsReviewsPipeline(object):
    """Item pipeline that stores each scraped review in the ``edmunds``
    collection of the ``scrapers-tests`` database."""

    # Item fields copied into the stored document, in insertion order.
    _REVIEW_FIELDS = (
        'review_id',
        'car_brand_model',
        'average_user_rating',
        'review_title',
        'review_rating',
        'review_date',
        'reviewer_username',
        'reviewed_trim',
        'helpful_votes',
        'review_text',
    )

    def __init__(self):
        """Open the database connection and select the target collection."""
        self.client = db_connect()
        self.db = self.client['scrapers-tests']
        self.collection = self.db['edmunds']

    def close_spider(self, spider):
        """Release the database connection when the spider finishes.

        NOTE(review): assumes the object returned by db_connect() exposes
        close(), as a pymongo MongoClient does -- confirm.
        """
        self.client.close()

    def process_item(self, item, spider):
        """Insert a copy of the review's fields into the collection.

        Args:
            item: mapping-like review item holding every field listed in
                ``_REVIEW_FIELDS``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.

        Raises:
            TypeError: if ``item`` is None, which means an earlier pipeline's
                ``process_item`` did not return the item.
            KeyError: if ``item`` lacks one of the expected fields.
        """
        if item is None:
            # Fail with an actionable message instead of the opaque
            # "'NoneType' object is not subscriptable".
            raise TypeError(
                "process_item received None instead of an item; an earlier "
                "pipeline's process_item must return the item or raise DropItem"
            )

        # Build a separate dict so the item itself is left untouched by
        # whatever insert_one does to the document it is given.
        review_dict = {field: item[field] for field in self._REVIEW_FIELDS}
        self.collection.insert_one(review_dict)
        return item
Решено. Мой DuplicatesPipeline не был реализован, а вызывается он первым (приоритет 100). Поскольку его process_item ничего не возвращал, следующий конвейер получал None вместо элемента. Я реализовал DuplicatesPipeline, и проблема решена.