Ниже приведено выполнение скрапа для заполнения динамод с URL-адресами в результате скрапа. Я получаю сообщение об ошибке:
AttributeError: объект 'dict' не имеет атрибута 'urljoin'
однако, и неясно почему.
##############################################
# Script: Prep storage for chemtrail #
# Author: James #
# Purpose: #
# Version: #
# #
##############################################
import boto3
import json
import scrapy
class ChemPrepSpider(scrapy.Spider):
name = "xxxxxx"
def start_requests(self):
urls = [
'https://www.xxxxxxx.com.au'
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self,response):
dynamodb = boto3.resource('dynamodb', region_name='ap-southeast-2')
table = dynamodb.Table('chemTrailStorage')
category_links = response.css('li').xpath('a/@href').getall()
category_links_filtered = [x for x in category_links if 'shop-online' in x] # remove non category links
category_links_filtered = list(dict.fromkeys(category_links_filtered)) # remove duplicates
for category_link in category_links_filtered:
print('raw category -> ' + category_link)
next_category = response.urljoin(category_link) + '?size=99999'
print('DynamoDb insert for category: ' + next_category)
response = table.put_item(
Item={
'CategoryPath': next_category,
'ItemCount':"99999",
'JobStat':"NOT_STARTED",
'PickupDateTime':"NA",
'CompletionDateTime':"NA"
}
)
print('Response from put....')
print(response)