Для каждого из моих элементов я пытаюсь проанализировать 2 внешние ссылки и заполнить атрибуты своего элемента их данными.
Я использую рекурсивные запросы следующим образом:
Мой предмет:
class Artiste(scrapy.Item):
url = scrapy.Field()
nom = scrapy.Field()
styles = scrapy.Field()
lien_youtube = scrapy.Field()
lien_soundcloud = scrapy.Field()
youtube_followers = scrapy.Field()
soundclound_followers = scrapy.Field()
Мои классы:
class AnnuSpider(scrapy.Spider):
name = "annu"
start_urls = [
'https://www.livetonight.fr/groupe-musique-dj',
]
def parse(self, response):
for artiste in response.css('.card-musician'):
doc = Artiste()
details_partial_link = artiste.css('a::attr(href)').get()
doc['nom'] = artiste.css('.card-musician-title-wrapper').xpath('normalize-space(./h4/text())').get()
doc['url'] = details_partial_link
details_link = response.urljoin(details_partial_link)
request = scrapy.Request(details_link, callback=self.parse_details)
request.meta['item'] = doc
yield request
def parse_details(self, response):
doc = response.meta['item']
for liens in response.css('.display-flex'):
links = liens.css('a::attr(href)').get()
if links != None:
#print("youtube link", links)
link = re.search('https?://([A-Za-z_0-9.-]+).*', links)
print("link.group:", link.group(1))
if link.group(1) == "www.youtube.com":
print(links)
print("is youtube")
doc['lien_youtube'] = links
request = scrapy.Request(links, callback=self.parse_youtube)
request.meta['item'] = doc
return request
if link.group(1) == "soundcloud.com":
print("is soundclound", links)
doc['lien_soundcloud'] = links
request = scrapy.Request(links, callback=self.parse_soundcloud)
request.meta['item'] = doc
return request
doc['styles'] = response.css('.show-overview-info').xpath('normalize-space(./p/text())')[0].get()
return doc
def parse_youtube(self, response):
doc = response.meta['item']
doc['youtube_followers'] = response.css('.yt-subscription-button-subscriber-count-branded-horizontal::text').get()
print("followers youtube", doc['youtube_followers'])
return doc
def parse_soundcloud(self, response):
doc = response.meta['item']
doc['soundclound_followers'] = response.xpath('//meta[@property="soundcloud:follower_count"]').xpath('@content').get()
print("followers soundclound", doc['soundclound_followers'])
return doc
По некоторым причинам мой вывод содержит нужное количество элементов, но анализируется и включается только YouTube, за исключением одного элемента:
[
{"url": "/groupe-musique-dj/1598-whisper-note", "nom": "Whisper Note", "lien_youtube": "https://www.youtube.com/channel/UC_oi6rwsSHvFS1t66bi1ZXw", "youtube_followers": "19"},
{"url": "/groupe-musique-dj/4109-sax-in-the-wood", "nom": "Sax in the Wood", "lien_youtube": "https://www.youtube.com/channel/UCwOCslvDxzdFWAtLXpRaiGw", "youtube_followers": "19"},
{"url": "/groupe-musique-dj/434-la-bande-originale", "nom": "La Bande Originale", "lien_youtube": "https://www.youtube.com/channel/UCkadwAQpBAdXDSfmTBlbExg", "youtube_followers": "28"},
{"url": "/groupe-musique-dj/130-octave-et-anatole", "nom": "Octave et Anatole", "lien_youtube": "https://www.youtube.com/channel/UC127U1U8b7U_HUzmfwz6W9g", "youtube_followers": "204"},
{"url": "/groupe-musique-dj/2299-poptime", "nom": "POPTIME", "lien_youtube": "https://www.youtube.com/channel/UCD_H7wwwgu9Q4DRXsRu8XIQ", "youtube_followers": null},
{"url": "/groupe-musique-dj/5897-swingin-the-pop", "nom": "Swingin' the Pop", "lien_youtube": "https://www.youtube.com/channel/UCnsK2SSxbybEiWGhnMUdOJw", "youtube_followers": "32"},
{"url": "/groupe-musique-dj/919-shiny-music", "nom": "Shiny Music", "lien_youtube": "https://www.youtube.com/channel/UCFRVA9hmuEMSL3rScw3rlQQ", "youtube_followers": "41"},
{"url": "/groupe-musique-dj/433-cover-club", "nom": "Cover Club", "lien_youtube": "https://www.youtube.com/channel/UC9BNx9smL6nx-OzvjLu9NwA", "youtube_followers": "1.14K"},
{"url": "/groupe-musique-dj/841-be-combo", "nom": "BE COMBO", "lien_youtube": "https://www.youtube.com/watch?v=OsrJsHRP8JY&list=PLhgIJgJETMHN1wl_VzziDXrs1SEOgDSY8", "youtube_followers": "102"},
{"url": "/groupe-musique-dj/470-cover-session", "nom": "Cover Session", "lien_youtube": "https://www.youtube.com/channel/UC0CXNJ87seS-zGaG-kbFkxQ", "youtube_followers": "1"},
{"url": "/groupe-musique-dj/1106-le-groupe-moon", "nom": "Le Groupe Moon", "lien_youtube": "https://www.youtube.com/channel/UC7IZa06lXVE4eB8IUZSFebg/featured?disable_polymer=1", "youtube_followers": "134"},
{"url": "/groupe-musique-dj/311-singe", "nom": "SINGE", "lien_youtube": "https://www.youtube.com/channel/UCZGgFZOQuRNiaY6I_n0ztQg?view_as=subscriber", "youtube_followers": null},
{"url": "/groupe-musique-dj/397-buskers", "nom": "Buskers", "lien_youtube": "https://www.youtube.com/channel/UC1MiUY2y0aseQHfGL8iD_Tg", "youtube_followers": "125"},
{"url": "/groupe-musique-dj/2707-kdessa", "nom": "Kdessa", "styles": "Soul / Pop / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "soundclound_followers": null, "lien_soundcloud": "https://soundcloud.com/remidugue/les-ptits-trou-sgainsbourg"},
{"url": "/groupe-musique-dj/7955-pocket-music", "nom": "Pocket Music", "lien_youtube": "https://www.youtube.com/channel/UC2F7pZ_8RgUGz0TN_X6L-fw", "youtube_followers": "66"},
{"url": "/groupe-musique-dj/3716-a-rise-to-kats", "nom": "A Rise To KaTs", "lien_youtube": "https://www.youtube.com/channel/UCNz9oCThymmrrPspcSAeo8g", "youtube_followers": "401"},
{"url": "/groupe-musique-dj/3386-komodo-sale", "nom": "Komodo Sal\u00e9", "styles": "Pop / Funk / Dj"},
{"url": "/groupe-musique-dj/5345-old-john-s-radio", "nom": "Old John's Radio", "lien_youtube": "https://www.youtube.com/channel/UCifIHECM59YNNhRPu9xovAQ", "youtube_followers": "63"},
{"url": "/groupe-musique-dj/1527-john-sende", "nom": "John Sende", "lien_youtube": "https://www.youtube.com/channel/UCmUGEwh-T7BPI9zvcqhGqNA", "youtube_followers": "31"},
{"url": "/groupe-musique-dj/5098-minka", "nom": "Minka", "lien_youtube": "https://www.youtube.com/channel/UC-LUTKNkni8CiglxIfTNfRg", "youtube_followers": "30"}
]
Если я изменю свой код, чтобы не запрашивать внешние страницы, например:
def parse_details(self, response):
doc = response.meta['item']
for liens in response.css('.display-flex'):
links = liens.css('a::attr(href)').get()
if links != None:
#print("youtube link", links)
link = re.search('https?://([A-Za-z_0-9.-]+).*', links)
print("link.group:", link.group(1))
if link.group(1) == "www.youtube.com":
print(links)
print("is youtube")
doc['lien_youtube'] = links
#request = scrapy.Request(links, callback=self.parse_youtube)
#request.meta['item'] = doc
#return request
if link.group(1) == "soundcloud.com":
print("is soundclound", links)
doc['lien_soundcloud'] = links
#request = scrapy.Request(links, callback=self.parse_soundcloud)
#request.meta['item'] = doc
#return request
doc['styles'] = response.css('.show-overview-info').xpath('normalize-space(./p/text())')[0].get()
return doc
Я получаю ожидаемый вывод элементов со ссылками на YouTube и soundcloud, если они существуют:
[
{"url": "/groupe-musique-dj/1106-le-groupe-moon", "nom": "Le Groupe Moon", "styles": "Rock / Pop / Folk", "lien_soundcloud": "https://soundcloud.com/romain-lebihan", "lien_youtube": "https://www.youtube.com/channel/UC7IZa06lXVE4eB8IUZSFebg/featured?disable_polymer=1"},
{"url": "/groupe-musique-dj/2707-kdessa", "nom": "Kdessa", "styles": "Soul / Pop / Funk"},
{"url": "/groupe-musique-dj/1598-whisper-note", "nom": "Whisper Note", "styles": "Jazz / Pop", "lien_soundcloud": "https://soundcloud.com/user-799755333/sets/whisper-note-about-jazz", "lien_youtube": "https://www.youtube.com/channel/UC_oi6rwsSHvFS1t66bi1ZXw"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz / Swing / Musique du monde", "lien_soundcloud": "https://soundcloud.com/remidugue/les-ptits-trou-sgainsbourg"},
{"url": "/groupe-musique-dj/3386-komodo-sale", "nom": "Komodo Sal\u00e9", "styles": "Pop / Funk / Dj"},
{"url": "/groupe-musique-dj/7955-pocket-music", "nom": "Pocket Music", "styles": "Pop / Dj / Jazz", "lien_youtube": "https://www.youtube.com/channel/UC2F7pZ_8RgUGz0TN_X6L-fw"},
{"url": "/groupe-musique-dj/3716-a-rise-to-kats", "nom": "A Rise To KaTs", "styles": "Guinguette / Swing", "lien_soundcloud": "https://soundcloud.com/user-184119966", "lien_youtube": "https://www.youtube.com/channel/UCNz9oCThymmrrPspcSAeo8g"},
{"url": "/groupe-musique-dj/397-buskers", "nom": "Buskers", "styles": "Jazz / Folk / Rock", "lien_soundcloud": "https://soundcloud.com/buskers-paris", "lien_youtube": "https://www.youtube.com/channel/UC1MiUY2y0aseQHfGL8iD_Tg"},
{"url": "/groupe-musique-dj/5098-minka", "nom": "Minka", "styles": "Pop / Jazz / Funk", "lien_youtube": "https://www.youtube.com/channel/UC-LUTKNkni8CiglxIfTNfRg"},
{"url": "/groupe-musique-dj/1527-john-sende", "nom": "John Sende", "styles": "Pop / Folk / Soul", "lien_soundcloud": "https://soundcloud.com/rythme-voice", "lien_youtube": "https://www.youtube.com/channel/UCmUGEwh-T7BPI9zvcqhGqNA"},
{"url": "/groupe-musique-dj/5345-old-john-s-radio", "nom": "Old John's Radio", "styles": "Rock / Blues / Soul", "lien_youtube": "https://www.youtube.com/channel/UCifIHECM59YNNhRPu9xovAQ"},
{"url": "/groupe-musique-dj/4109-sax-in-the-wood", "nom": "Sax in the Wood", "styles": "Jazz / Pop / Saxophoniste", "lien_youtube": "https://www.youtube.com/channel/UCwOCslvDxzdFWAtLXpRaiGw"},
{"url": "/groupe-musique-dj/5897-swingin-the-pop", "nom": "Swingin' the Pop", "styles": "Jazz / Swing / Pop", "lien_youtube": "https://www.youtube.com/channel/UCnsK2SSxbybEiWGhnMUdOJw"},
{"url": "/groupe-musique-dj/2299-poptime", "nom": "POPTIME", "styles": "Pop / Rock / Funk", "lien_soundcloud": "https://soundcloud.com/welikepoptime", "lien_youtube": "https://www.youtube.com/channel/UCD_H7wwwgu9Q4DRXsRu8XIQ"},
{"url": "/groupe-musique-dj/919-shiny-music", "nom": "Shiny Music", "styles": "Pop / Rock / Jazz", "lien_soundcloud": "https://soundcloud.com/come-shine-music", "lien_youtube": "https://www.youtube.com/channel/UCFRVA9hmuEMSL3rScw3rlQQ"},
{"url": "/groupe-musique-dj/433-cover-club", "nom": "Cover Club", "styles": "Rock / Pop / Folk", "lien_youtube": "https://www.youtube.com/channel/UC9BNx9smL6nx-OzvjLu9NwA"},
{"url": "/groupe-musique-dj/130-octave-et-anatole", "nom": "Octave et Anatole", "styles": "Jazz", "lien_youtube": "https://www.youtube.com/channel/UC127U1U8b7U_HUzmfwz6W9g"},
{"url": "/groupe-musique-dj/470-cover-session", "nom": "Cover Session", "styles": "Pop / Funk / Rock", "lien_soundcloud": "https://soundcloud.com/cover-session-music", "lien_youtube": "https://www.youtube.com/channel/UC0CXNJ87seS-zGaG-kbFkxQ"},
{"url": "/groupe-musique-dj/434-la-bande-originale", "nom": "La Bande Originale", "styles": "Pop / Jazz / Funk", "lien_youtube": "https://www.youtube.com/channel/UCkadwAQpBAdXDSfmTBlbExg"},
{"url": "/groupe-musique-dj/841-be-combo", "nom": "BE COMBO", "styles": "Funk / Soul / Pop", "lien_youtube": "https://www.youtube.com/watch?v=OsrJsHRP8JY&list=PLhgIJgJETMHN1wl_VzziDXrs1SEOgDSY8"},
{"url": "/groupe-musique-dj/311-singe", "nom": "SINGE", "styles": "Rock / Pop / Funk", "lien_soundcloud": "https://soundcloud.com/singe/sets/soul-camembert", "lien_youtube": "https://www.youtube.com/channel/UCZGgFZOQuRNiaY6I_n0ztQg?view_as=subscriber"}
]
Так что, скорее всего, что-то есть с использованием return
, я пробовал использовать yield
, но либо элементы выходит из строя, или я получаю line 86
return doc
SyntaxError: 'return' with argument inside generator
Мне еще предстоит понять разницу между yield и return на практике ...