несколько запросов scrapy в цикле for - PullRequest
0 голосов
/ 26 мая 2020

Для каждого из моих элементов я пытаюсь проанализировать 2 внешние ссылки и заполнить атрибуты своего элемента их данными.

Я использую рекурсивные запросы следующим образом:

Мой предмет:

class Artiste(scrapy.Item):
    url = scrapy.Field()
    nom = scrapy.Field()
    styles = scrapy.Field()
    lien_youtube = scrapy.Field()
    lien_soundcloud = scrapy.Field()
    youtube_followers = scrapy.Field()
    soundclound_followers = scrapy.Field()

Мои классы:

class AnnuSpider(scrapy.Spider):
    name = "annu"
    start_urls = [
        'https://www.livetonight.fr/groupe-musique-dj',
    ]

    def parse(self, response):

        for artiste in response.css('.card-musician'):
            doc = Artiste()
            details_partial_link = artiste.css('a::attr(href)').get()
            doc['nom'] = artiste.css('.card-musician-title-wrapper').xpath('normalize-space(./h4/text())').get()
            doc['url'] = details_partial_link
            details_link = response.urljoin(details_partial_link)
            request = scrapy.Request(details_link, callback=self.parse_details)
            request.meta['item'] = doc
            yield request



    def parse_details(self, response):

        doc = response.meta['item']
        for liens in response.css('.display-flex'):

            links = liens.css('a::attr(href)').get()
            if links != None:
                #print("youtube link", links)
                link = re.search('https?://([A-Za-z_0-9.-]+).*', links)
                print("link.group:", link.group(1))
                if link.group(1) == "www.youtube.com":
                    print(links)
                    print("is youtube")
                    doc['lien_youtube'] = links
                    request = scrapy.Request(links, callback=self.parse_youtube)
                    request.meta['item'] = doc

                    return request
                if link.group(1) == "soundcloud.com":
                    print("is soundclound", links)
                    doc['lien_soundcloud'] = links
                    request = scrapy.Request(links, callback=self.parse_soundcloud)
                    request.meta['item'] = doc

                    return request
        doc['styles'] = response.css('.show-overview-info').xpath('normalize-space(./p/text())')[0].get()
        return doc

    def parse_youtube(self, response):
        doc = response.meta['item']    
        doc['youtube_followers'] = response.css('.yt-subscription-button-subscriber-count-branded-horizontal::text').get()
        print("followers youtube", doc['youtube_followers'])
        return doc    

    def parse_soundcloud(self, response):
        doc = response.meta['item']    
        doc['soundclound_followers'] = response.xpath('//meta[@property="soundcloud:follower_count"]').xpath('@content').get()
        print("followers soundclound", doc['soundclound_followers'])
        return doc  

По некоторым причинам мой вывод содержит нужное количество элементов, но анализируется и включается только YouTube, за исключением одного элемента:

[
{"url": "/groupe-musique-dj/1598-whisper-note", "nom": "Whisper Note", "lien_youtube": "https://www.youtube.com/channel/UC_oi6rwsSHvFS1t66bi1ZXw", "youtube_followers": "19"},
{"url": "/groupe-musique-dj/4109-sax-in-the-wood", "nom": "Sax in the Wood", "lien_youtube": "https://www.youtube.com/channel/UCwOCslvDxzdFWAtLXpRaiGw", "youtube_followers": "19"},
{"url": "/groupe-musique-dj/434-la-bande-originale", "nom": "La Bande Originale", "lien_youtube": "https://www.youtube.com/channel/UCkadwAQpBAdXDSfmTBlbExg", "youtube_followers": "28"},
{"url": "/groupe-musique-dj/130-octave-et-anatole", "nom": "Octave et Anatole", "lien_youtube": "https://www.youtube.com/channel/UC127U1U8b7U_HUzmfwz6W9g", "youtube_followers": "204"},
{"url": "/groupe-musique-dj/2299-poptime", "nom": "POPTIME", "lien_youtube": "https://www.youtube.com/channel/UCD_H7wwwgu9Q4DRXsRu8XIQ", "youtube_followers": null},
{"url": "/groupe-musique-dj/5897-swingin-the-pop", "nom": "Swingin' the Pop", "lien_youtube": "https://www.youtube.com/channel/UCnsK2SSxbybEiWGhnMUdOJw", "youtube_followers": "32"},
{"url": "/groupe-musique-dj/919-shiny-music", "nom": "Shiny Music", "lien_youtube": "https://www.youtube.com/channel/UCFRVA9hmuEMSL3rScw3rlQQ", "youtube_followers": "41"},
{"url": "/groupe-musique-dj/433-cover-club", "nom": "Cover Club", "lien_youtube": "https://www.youtube.com/channel/UC9BNx9smL6nx-OzvjLu9NwA", "youtube_followers": "1.14K"},
{"url": "/groupe-musique-dj/841-be-combo", "nom": "BE COMBO", "lien_youtube": "https://www.youtube.com/watch?v=OsrJsHRP8JY&list=PLhgIJgJETMHN1wl_VzziDXrs1SEOgDSY8", "youtube_followers": "102"},
{"url": "/groupe-musique-dj/470-cover-session", "nom": "Cover Session", "lien_youtube": "https://www.youtube.com/channel/UC0CXNJ87seS-zGaG-kbFkxQ", "youtube_followers": "1"},
{"url": "/groupe-musique-dj/1106-le-groupe-moon", "nom": "Le Groupe Moon", "lien_youtube": "https://www.youtube.com/channel/UC7IZa06lXVE4eB8IUZSFebg/featured?disable_polymer=1", "youtube_followers": "134"},
{"url": "/groupe-musique-dj/311-singe", "nom": "SINGE", "lien_youtube": "https://www.youtube.com/channel/UCZGgFZOQuRNiaY6I_n0ztQg?view_as=subscriber", "youtube_followers": null},
{"url": "/groupe-musique-dj/397-buskers", "nom": "Buskers", "lien_youtube": "https://www.youtube.com/channel/UC1MiUY2y0aseQHfGL8iD_Tg", "youtube_followers": "125"},
{"url": "/groupe-musique-dj/2707-kdessa", "nom": "Kdessa", "styles": "Soul / Pop / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "soundclound_followers": null, "lien_soundcloud": "https://soundcloud.com/remidugue/les-ptits-trou-sgainsbourg"},
{"url": "/groupe-musique-dj/7955-pocket-music", "nom": "Pocket Music", "lien_youtube": "https://www.youtube.com/channel/UC2F7pZ_8RgUGz0TN_X6L-fw", "youtube_followers": "66"},
{"url": "/groupe-musique-dj/3716-a-rise-to-kats", "nom": "A Rise To KaTs", "lien_youtube": "https://www.youtube.com/channel/UCNz9oCThymmrrPspcSAeo8g", "youtube_followers": "401"},
{"url": "/groupe-musique-dj/3386-komodo-sale", "nom": "Komodo Sal\u00e9", "styles": "Pop / Funk / Dj"},
{"url": "/groupe-musique-dj/5345-old-john-s-radio", "nom": "Old John's Radio", "lien_youtube": "https://www.youtube.com/channel/UCifIHECM59YNNhRPu9xovAQ", "youtube_followers": "63"},
{"url": "/groupe-musique-dj/1527-john-sende", "nom": "John Sende", "lien_youtube": "https://www.youtube.com/channel/UCmUGEwh-T7BPI9zvcqhGqNA", "youtube_followers": "31"},
{"url": "/groupe-musique-dj/5098-minka", "nom": "Minka", "lien_youtube": "https://www.youtube.com/channel/UC-LUTKNkni8CiglxIfTNfRg", "youtube_followers": "30"}
]

Если я изменю свой код, чтобы не запрашивать внешние страницы, например:

    def parse_details(self, response):

        doc = response.meta['item']
        for liens in response.css('.display-flex'):

            links = liens.css('a::attr(href)').get()
            if links != None:
                #print("youtube link", links)
                link = re.search('https?://([A-Za-z_0-9.-]+).*', links)
                print("link.group:", link.group(1))
                if link.group(1) == "www.youtube.com":
                    print(links)
                    print("is youtube")
                    doc['lien_youtube'] = links
                    #request = scrapy.Request(links, callback=self.parse_youtube)
                    #request.meta['item'] = doc

                    #return request
                if link.group(1) == "soundcloud.com":
                    print("is soundclound", links)
                    doc['lien_soundcloud'] = links
                    #request = scrapy.Request(links, callback=self.parse_soundcloud)
                    #request.meta['item'] = doc

                    #return request
        doc['styles'] = response.css('.show-overview-info').xpath('normalize-space(./p/text())')[0].get()
        return doc

Я получаю ожидаемый вывод элементов со ссылками на YouTube и soundcloud, если они существуют:

[
{"url": "/groupe-musique-dj/1106-le-groupe-moon", "nom": "Le Groupe Moon", "styles": "Rock / Pop / Folk", "lien_soundcloud": "https://soundcloud.com/romain-lebihan", "lien_youtube": "https://www.youtube.com/channel/UC7IZa06lXVE4eB8IUZSFebg/featured?disable_polymer=1"},
{"url": "/groupe-musique-dj/2707-kdessa", "nom": "Kdessa", "styles": "Soul / Pop / Funk"},
{"url": "/groupe-musique-dj/1598-whisper-note", "nom": "Whisper Note", "styles": "Jazz / Pop", "lien_soundcloud": "https://soundcloud.com/user-799755333/sets/whisper-note-about-jazz", "lien_youtube": "https://www.youtube.com/channel/UC_oi6rwsSHvFS1t66bi1ZXw"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz / Swing / Musique du monde", "lien_soundcloud": "https://soundcloud.com/remidugue/les-ptits-trou-sgainsbourg"},
{"url": "/groupe-musique-dj/3386-komodo-sale", "nom": "Komodo Sal\u00e9", "styles": "Pop / Funk / Dj"},
{"url": "/groupe-musique-dj/7955-pocket-music", "nom": "Pocket Music", "styles": "Pop / Dj / Jazz", "lien_youtube": "https://www.youtube.com/channel/UC2F7pZ_8RgUGz0TN_X6L-fw"},
{"url": "/groupe-musique-dj/3716-a-rise-to-kats", "nom": "A Rise To KaTs", "styles": "Guinguette / Swing", "lien_soundcloud": "https://soundcloud.com/user-184119966", "lien_youtube": "https://www.youtube.com/channel/UCNz9oCThymmrrPspcSAeo8g"},
{"url": "/groupe-musique-dj/397-buskers", "nom": "Buskers", "styles": "Jazz / Folk / Rock", "lien_soundcloud": "https://soundcloud.com/buskers-paris", "lien_youtube": "https://www.youtube.com/channel/UC1MiUY2y0aseQHfGL8iD_Tg"},
{"url": "/groupe-musique-dj/5098-minka", "nom": "Minka", "styles": "Pop / Jazz / Funk", "lien_youtube": "https://www.youtube.com/channel/UC-LUTKNkni8CiglxIfTNfRg"},
{"url": "/groupe-musique-dj/1527-john-sende", "nom": "John Sende", "styles": "Pop / Folk / Soul", "lien_soundcloud": "https://soundcloud.com/rythme-voice", "lien_youtube": "https://www.youtube.com/channel/UCmUGEwh-T7BPI9zvcqhGqNA"},
{"url": "/groupe-musique-dj/5345-old-john-s-radio", "nom": "Old John's Radio", "styles": "Rock / Blues / Soul", "lien_youtube": "https://www.youtube.com/channel/UCifIHECM59YNNhRPu9xovAQ"},
{"url": "/groupe-musique-dj/4109-sax-in-the-wood", "nom": "Sax in the Wood", "styles": "Jazz / Pop / Saxophoniste", "lien_youtube": "https://www.youtube.com/channel/UCwOCslvDxzdFWAtLXpRaiGw"},
{"url": "/groupe-musique-dj/5897-swingin-the-pop", "nom": "Swingin' the Pop", "styles": "Jazz / Swing / Pop", "lien_youtube": "https://www.youtube.com/channel/UCnsK2SSxbybEiWGhnMUdOJw"},
{"url": "/groupe-musique-dj/2299-poptime", "nom": "POPTIME", "styles": "Pop / Rock / Funk", "lien_soundcloud": "https://soundcloud.com/welikepoptime", "lien_youtube": "https://www.youtube.com/channel/UCD_H7wwwgu9Q4DRXsRu8XIQ"},
{"url": "/groupe-musique-dj/919-shiny-music", "nom": "Shiny Music", "styles": "Pop / Rock / Jazz", "lien_soundcloud": "https://soundcloud.com/come-shine-music", "lien_youtube": "https://www.youtube.com/channel/UCFRVA9hmuEMSL3rScw3rlQQ"},
{"url": "/groupe-musique-dj/433-cover-club", "nom": "Cover Club", "styles": "Rock / Pop / Folk", "lien_youtube": "https://www.youtube.com/channel/UC9BNx9smL6nx-OzvjLu9NwA"},
{"url": "/groupe-musique-dj/130-octave-et-anatole", "nom": "Octave et Anatole", "styles": "Jazz", "lien_youtube": "https://www.youtube.com/channel/UC127U1U8b7U_HUzmfwz6W9g"},
{"url": "/groupe-musique-dj/470-cover-session", "nom": "Cover Session", "styles": "Pop / Funk / Rock", "lien_soundcloud": "https://soundcloud.com/cover-session-music", "lien_youtube": "https://www.youtube.com/channel/UC0CXNJ87seS-zGaG-kbFkxQ"},
{"url": "/groupe-musique-dj/434-la-bande-originale", "nom": "La Bande Originale", "styles": "Pop / Jazz / Funk", "lien_youtube": "https://www.youtube.com/channel/UCkadwAQpBAdXDSfmTBlbExg"},
{"url": "/groupe-musique-dj/841-be-combo", "nom": "BE COMBO", "styles": "Funk / Soul / Pop", "lien_youtube": "https://www.youtube.com/watch?v=OsrJsHRP8JY&list=PLhgIJgJETMHN1wl_VzziDXrs1SEOgDSY8"},
{"url": "/groupe-musique-dj/311-singe", "nom": "SINGE", "styles": "Rock / Pop / Funk", "lien_soundcloud": "https://soundcloud.com/singe/sets/soul-camembert", "lien_youtube": "https://www.youtube.com/channel/UCZGgFZOQuRNiaY6I_n0ztQg?view_as=subscriber"}
]

Так что, скорее всего, что-то есть с использованием return, я пробовал использовать yield, но либо элементы выходит из строя, или я получаю line 86 return doc SyntaxError: 'return' with argument inside generator

Мне еще предстоит понять разницу между yield и return на практике ...

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...