Итак, я попытался применить примеры из по этой ссылке , но мой результат выглядит странно:
Мой элемент:
class Artiste(scrapy.Item):
url = scrapy.Field()
nom = scrapy.Field()
styles = scrapy.Field()
Мой класс Scrapy:
class AnnuSpider(scrapy.Spider):
name = "annu"
start_urls = [
'https://www.livetonight.fr/groupe-musique-dj',
]
def parse(self, response):
doc = Artiste()
for artiste in response.css('.card-musician'):
details_partial_link = artiste.css('a::attr(href)').get()
doc['nom'] = artiste.css('.card-musician-title-wrapper').xpath('normalize-space(./h4/text())').get()
doc['url'] = details_partial_link
details_link = response.urljoin(details_partial_link)
request = scrapy.Request(details_link, callback=self.parse_details)
request.meta['item'] = doc
print "NOM", doc['nom']
yield request
def parse_details(self, response):
doc = response.meta['item']
doc['styles'] = response.css('.show-overview-info').xpath('normalize-space(./p/text())')[0].get()
return doc
Итак, вместо того, чтобы давать мне 21 строку, каждая из которых имеет свои собственные nom
, url
, styles
, я получаю 21 строку с одинаковыми (последняя в списке) nom
и url
и правый styles
.
Вот полный вывод:
[
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz / Folk / Rock"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Rock / Pop / Folk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Soul / Pop / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Soul / Pop / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz / Pop"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Rock / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Rock / Jazz"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Rock / Pop / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Rock / Pop / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Rock / Blues / Soul"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Rock / Blues / Soul"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Funk / Soul / Pop"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Folk / Soul"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Jazz / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Jazz / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz / Swing / Musique du monde"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Guinguette / Swing"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Guinguette / Swing"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz / Swing / Pop"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Funk / Dj"}
]
Что мне странно, так это то, что если я возьму запрос, мой результат будет идеальным . Как с этим кодом:
class AnnuSpider(scrapy.Spider):
name = "annu"
start_urls = [
'https://www.livetonight.fr/groupe-musique-dj',
]
def parse(self, response):
doc = Artiste()
for artiste in response.css('.card-musician'):
details_partial_link = artiste.css('a::attr(href)').get()
doc['nom'] = artiste.css('.card-musician-title-wrapper').xpath('normalize-space(./h4/text())').get()
doc['url'] = details_partial_link
details_link = response.urljoin(details_partial_link)
yield doc