How to get data out of a callback function in Scrapy - PullRequest
0 votes
/ 06 April 2020

I am using Scrapy to crawl a website. In the code I use more than one callback function, and the data belonging to a single search result is collected across two of those callbacks. For example:

import os

from scrapy import Request
from scrapy.exceptions import CloseSpider
from scrapy.spiders import CrawlSpider

# CONFIG is assumed to be imported from the project's own settings module.


class PubmedProjSpider(CrawlSpider):

    name = str(CONFIG.get('project_name', 'project_name'))

    start_urls = ['https://pubmed.ncbi.nlm.nih.gov/?term=(((((((((((((((((((((((((sodium%20oxybate%5BText%20Word%5D)%20OR%20(Xyrem%5BText%20Word%5D))%20OR%20(certolizumab%20pegol%5BText%20Word%5D))%20OR%20(Cimzia%5BText%20Word%5D))%20OR%20(vancomycin%20hydrochloride%5BText%20Word%5D))%20OR%20(Vancomycin%5BText%20Word%5D))%20OR%20(Vancocin%5BText%20Word%5D))%20OR%20(atorvastatin%20calcium%20trihydrate%5BText%20Word%5D))%20OR%20(atorvastatin%5BText%20Word%5D))%20OR%20(Lipitor))%20OR%20(alprostadil%5BText%20Word%5D))%20OR%20(Caverject%5BText%20Word%5D))%20OR%20(atenolol%5BText%20Word%5D))%20OR%20(Tenormin%5BText%20Word%5D))%20OR%20(tramadol%20hydrochloride%5BText%20Word%5D))%20OR%20(tramadol%5BText%20Word%5D))%20OR%20(Maneo%5BText%20Word%5D))%20OR%20(temazepam%5BText%20Word%5D))%20OR%20(citalopram%20hydrobromide%5BText%20Word%5D))%20OR%20(citalopram%5BText%20Word%5D))%20OR%20(Cipramil%5BText%20Word%5D))%20OR%20(fluticasone%20propionate%5BText%20Word%5D))%20OR%20(fluticasone%5BText%20Word%5D))%20OR%20(Cutivate%5BText%20Word%5D)))%20AND%20((%222020%2F03%2F03%22%5BDate%20-%20Create%5D%20%3A%20%222020%2F03%2F05%22%5BDate%20-%20Create%5D))&filter=simsearch2.ffrft&pos=6']
    path = r"C:\Users\vighnesh.paramasivam\Documents\pubmed_organised_copy\pubmed_organised\pubmed\pubmed\output_pdf_files"

    def __init__(self):
        super().__init__()
        self.file_storage_location = CONFIG.get('storage_location', 'text_storage_destination')

    def parse(self, response):
        try:
            # One request per search result on this page.
            titles = response.xpath("//div[@class='docsum-wrap']//div[@class='docsum-content']")
            for title in titles:
                for href in title.xpath("a/@href").extract():
                    yield Request(
                        url=response.urljoin(href),
                        callback=self.parse_article
                    )

            # Follow pagination once per page (this was nested inside the
            # result loop above, which scheduled the same next-page request
            # once per result).
            if response.xpath("//button[@class='load-button next-page']"):
                temp_url = response.xpath("//div[@data-next-page-url]/@data-next-page-url").get()
                next_page_url = response.urljoin(temp_url).replace('/more', '')
                yield Request(
                    url=next_page_url,
                    callback=self.parse
                )
        except Exception as message:
            raise CloseSpider(str(message))

    def parse_article(self, response):
        try:
            w = {}
            # str() around encoded bytes would produce "b'...'" in Python 3, so keep the text as str.
            w['title'] = ' '.join(response.xpath('.//h1[@class="heading-title"]')[0].xpath(".//text()").getall()).strip()
            w['url'] = response.url
            w['pmcid'] = response.xpath(".//ul/li/span[@class='identifier pubmed']/strong[@title='PubMed ID']/text()").get()
            w['authors'] = response.xpath('//div[@class="inline-authors"]/div[@class="authors"]/div[@class="authors-list"]/span/a/text()').getall()
            abstract = {'Free-Text':[]}
            w['pdf_downloaded'] = 'No'
            w['pdf_links'] = ''
            q = response.xpath("//div[@class='abstract'][@id='abstract']").getall()

            if response.xpath("//div[@class='full-text-links-list']/a/@href"):
                w['pdf_links'] = list(set(response.xpath("//div[@class='full-text-links-list']/a/@href").getall()))


            if q:
                for i in response.xpath("//div[@class='abstract'][@id='abstract']/div[@class='abstract-content selected']/p"):
                    strong_format = i.xpath("./strong//text()").getall()
                    bold_format = i.xpath("./b/text()").getall()
                    if strong_format:
                        # Reuse the already-extracted heading text as the key.
                        heading = strong_format[0].strip().strip(':')
                        abstract[heading] = ' '.join(i.xpath("./text()").getall()).strip()
                    elif bold_format:
                        headings = response.xpath("//div[@class='abstract'][@id='abstract']/div[@class='abstract-content selected']/p/b/text()").getall()
                        # The full paragraph as one string, reused in the XPath expressions below.
                        para = 'string(//div[@class="abstract"][@id="abstract"]/div[@class="abstract-content selected"]/p)'
                        if headings:
                            free_text = response.xpath('normalize-space(substring-before({0},//b[.="{1}"]))'.format(para, headings[0])).getall()
                            if free_text:
                                abstract['Free-Text'] = free_text[0]
                        for num, header in enumerate(headings):
                            if num != len(headings) - 1:
                                abstract[header] = response.xpath('normalize-space(substring-before(substring-after({0},//b[.="{1}"]),//b[.="{2}"]))'.format(para, headings[num], headings[num + 1])).getall()[0]
                            else:
                                abstract[header] = response.xpath('normalize-space(substring-after({0},//b[.="{1}"]))'.format(para, headings[num])).getall()[0]
                    else:
                        abstract['Free-Text'].append((' '.join(i.xpath(".//text()").getall()).lstrip().rstrip()))

                if response.xpath("//div[@class='abstract'][@id='abstract']/p/strong[contains(text(), 'Keywords:')]"):
                    abstract['Keywords']=' '.join(response.xpath("//div[@class='abstract'][@id='abstract']/p/text()").getall()).strip()
                w['abstract'] = abstract


            path = os.path.join(self.file_storage_location, 'PMCID_' + w['pmcid'] + '.txt')

            with open(path, 'w', encoding='utf-8') as e:
                for p in w.items():
                    e.write("%s:%s\n\n" % p)

            if 'PMC' in response.xpath(".//div[@class='full-text-links-list']/a/@data-ga-action").getall():
                pdf_url = response.xpath(".//div[@class='full-text-links-list']/a[@data-ga-action='PMC']/@href").get()

                yield Request(
                    url=response.urljoin(pdf_url),
                    callback=self.link, meta={'hero_item': w['pmcid']}
                )

            yield w
        except Exception as message:
            raise CloseSpider(str(message))


    def link(self, response):
        try:
            pdf_href = response.xpath('.//div[@class="format-menu"]/ul/li/a[contains(text(), "PDF")]/@href').get()
            if pdf_href:
                item = response.meta.get('hero_item')
                yield Request(
                    url=response.urljoin(pdf_href),
                    callback=self.save_pdf, meta={'hero_item': item}
                )
        except Exception as message:
            # A bare pass hides failures; at least log them.
            self.logger.error('link callback failed: %s', message)

    def save_pdf(self, response):
        try:
            item = response.meta.get('hero_item')
            path = os.path.join(self.path, 'PMCID_' + item + '.pdf')

            self.logger.info('Saving PDF %s', path)
            with open(path, 'wb') as f:
                f.write(response.body)
        except Exception as message:
            self.logger.error('save_pdf callback failed: %s', message)

As shown in the code above, all of the details are extracted in parse_article, but one piece of information, whether pdf_downloaded should be flipped to 'Yes', is only known inside save_pdf, a later callback in the chain.

So the data for one search result now lives in two callback functions. How can I merge it into a single item before it is scraped? Any help is appreciated!
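One common pattern (a minimal sketch, not necessarily the only way) is to stop yielding the item in parse_article and instead hand the partially-built dict down the callback chain with cb_kwargs (available since Scrapy 1.7; meta works the same way on older versions). The item is yielded only by the last callback that runs, where pdf_downloaded is finally known; a hypothetical pdf_failed errback makes sure the item is still yielded when a PDF request fails:

class PubmedProjSpider(CrawlSpider):
    # ... name, start_urls, path and __init__ exactly as in the question ...

    def parse_article(self, response):
        w = {}    # fill title, url, authors, abstract, ... as in the question
        w['pmcid'] = response.xpath(".//ul/li/span[@class='identifier pubmed']/strong[@title='PubMed ID']/text()").get()
        w['pdf_downloaded'] = 'No'

        pdf_url = response.xpath(".//div[@class='full-text-links-list']/a[@data-ga-action='PMC']/@href").get()
        if pdf_url:
            # Hand the whole item to the next callback instead of yielding it here.
            yield Request(
                url=response.urljoin(pdf_url),
                callback=self.link,
                errback=self.pdf_failed,
                cb_kwargs={'item': w}
            )
        else:
            yield w    # no PDF link, so the item is already complete

    def link(self, response, item):
        pdf_href = response.xpath('.//div[@class="format-menu"]/ul/li/a[contains(text(), "PDF")]/@href').get()
        if pdf_href:
            yield Request(
                url=response.urljoin(pdf_href),
                callback=self.save_pdf,
                errback=self.pdf_failed,
                cb_kwargs={'item': item}
            )
        else:
            yield item

    def save_pdf(self, response, item):
        path = os.path.join(self.path, 'PMCID_' + item['pmcid'] + '.pdf')
        self.logger.info('Saving PDF %s', path)
        with open(path, 'wb') as f:
            f.write(response.body)
        item['pdf_downloaded'] = 'Yes'    # the flag is finally known here
        yield item                        # exactly one complete item per result

    def pdf_failed(self, failure):
        # The failed request still carries the item, so it is not lost.
        yield failure.request.cb_kwargs['item']

With this layout each search result produces exactly one item, and it reaches the pipelines only after pdf_downloaded has its final value; the meta={'hero_item': ...} bookkeeping can be dropped because cb_kwargs delivers the dict straight into the callback's signature.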

...