Я пытаюсь добавить местоположение для моих данных с другой страницы. В настоящее время я получаю только следующий ответ, ожидая, например, «Париж, Франция»
{'pk': 1100, 'name': 'Car Ralley 42 2016', 'start': '2016-11-25', **'location': <GET https://www.somepage.com/festivals/carralley-november/42-2016/>**}
spider.py
class SomeSpider(Spider):
if not DEBUG:
custom_settings = {
"DOWNLOAD_DELAY": "3",
}
handle_httpstatus_list = [404]
name = "somedomain"
_http404status_count = 0
def start_requests(self):
"""Initialize DynamoDB and get the last PK which was crawled."""
self.database = DynamoDB(self.settings, DYNAMODB_TABLE, DYNAMODB_ITEM)
self._pk = self.database.get_pk()
yield scrapy.Request(
f"https://www.somedomain.com/api-internal/v1/festivals/{self._pk}/?format=json"
)
def parse_event_location(self, response):
location = response.xpath('//li[@class="styles__c-h0jg7b-2 jhidiL"]/text()').get()
yield location
def parse(self, response):
if self._http404status_count == HTTP404STATUS_MAX:
raise CloseSpider(f"Reached {HTTP404STATUS_MAX} 404 responses in a row.")
if response.status == 404:
self._http404status_count += 1
elif response.status == 200:
self._http404status_count = 0 # Reset _http404status_count
body = json.loads(response.body)
festicket_detail_page = body.get("url")
if festicket_detail_page:
location = response.follow(
festicket_detail_page,
callback=self.parse_event_location
)
yield {
"pk": body.get("pk"),
"name": body.get("name"),
"start": body.get("start"),
"location": location
}