I am using Scrapy to crawl a website. The spider uses more than one callback, and the data belonging to a single search result ends up spread across two of those callbacks. For example:
import os

from scrapy import Request
from scrapy.exceptions import CloseSpider
from scrapy.spiders import CrawlSpider

# CONFIG is a project-level config object defined elsewhere in my code.


class PubmedProjSpider(CrawlSpider):
    name = str(CONFIG.get('project_name', 'project_name'))
    start_urls = ['https://pubmed.ncbi.nlm.nih.gov/?term=(((((((((((((((((((((((((sodium%20oxybate%5BText%20Word%5D)%20OR%20(Xyrem%5BText%20Word%5D))%20OR%20(certolizumab%20pegol%5BText%20Word%5D))%20OR%20(Cimzia%5BText%20Word%5D))%20OR%20(vancomycin%20hydrochloride%5BText%20Word%5D))%20OR%20(Vancomycin%5BText%20Word%5D))%20OR%20(Vancocin%5BText%20Word%5D))%20OR%20(atorvastatin%20calcium%20trihydrate%5BText%20Word%5D))%20OR%20(atorvastatin%5BText%20Word%5D))%20OR%20(Lipitor))%20OR%20(alprostadil%5BText%20Word%5D))%20OR%20(Caverject%5BText%20Word%5D))%20OR%20(atenolol%5BText%20Word%5D))%20OR%20(Tenormin%5BText%20Word%5D))%20OR%20(tramadol%20hydrochloride%5BText%20Word%5D))%20OR%20(tramadol%5BText%20Word%5D))%20OR%20(Maneo%5BText%20Word%5D))%20OR%20(temazepam%5BText%20Word%5D))%20OR%20(citalopram%20hydrobromide%5BText%20Word%5D))%20OR%20(citalopram%5BText%20Word%5D))%20OR%20(Cipramil%5BText%20Word%5D))%20OR%20(fluticasone%20propionate%5BText%20Word%5D))%20OR%20(fluticasone%5BText%20Word%5D))%20OR%20(Cutivate%5BText%20Word%5D)))%20AND%20((%222020%2F03%2F03%22%5BDate%20-%20Create%5D%20%3A%20%222020%2F03%2F05%22%5BDate%20-%20Create%5D))&filter=simsearch2.ffrft&pos=6']
    path = r"C:\Users\vighnesh.paramasivam\Documents\pubmed_organised_copy\pubmed_organised\pubmed\pubmed\output_pdf_files"

    def __init__(self, *args, **kwargs):
        # CrawlSpider needs its base __init__ to run, otherwise its internal
        # machinery is never set up.
        super().__init__(*args, **kwargs)
        self.file_storage_location = CONFIG.get('storage_location', 'text_storage_destination')
    def parse(self, response):
        try:
            # Follow every search-result link to its article page.
            for href in response.xpath("//div[@class='docsum-wrap']//div[@class='docsum-content']/a/@href").getall():
                yield Request(
                    url=response.urljoin(href),
                    callback=self.parse_article
                )
            # Paginate while a "next page" button is present.
            if response.xpath("//button[@class='load-button next-page']"):
                temp_url = response.xpath("//div[@data-next-page-url]/@data-next-page-url").getall()[0]
                next_page_url = response.urljoin(temp_url).replace('/more', '')
                yield Request(url=next_page_url, callback=self.parse)
        except Exception as message:
            raise CloseSpider(message)
    def parse_article(self, response):
        try:
            w = {}
            w['title'] = ' '.join(response.xpath('.//h1[@class="heading-title"]')[0].xpath('.//text()').getall()).strip()
            w['url'] = response.url
            # Note: this is the PubMed ID (PMID) shown on the article page.
            w['pmcid'] = response.xpath(".//ul/li/span[@class='identifier pubmed']/strong[@title='PubMed ID']/text()").getall()[0]
            w['authors'] = response.xpath('//div[@class="inline-authors"]/div[@class="authors"]/div[@class="authors-list"]/span/a/text()').getall()
            abstract = {'Free-Text': []}
            w['pdf_downloaded'] = 'No'  # should become 'Yes' once save_pdf succeeds
            w['pdf_links'] = ''
            q = response.xpath("//div[@class='abstract'][@id='abstract']").getall()
            if response.xpath("//div[@class='full-text-links-list']/a/@href"):
                w['pdf_links'] = list(set(response.xpath("//div[@class='full-text-links-list']/a/@href").getall()))
            if q:
                for i in response.xpath("//div[@class='abstract'][@id='abstract']/div[@class='abstract-content selected']/p"):
                    strong_format = i.xpath("./strong//text()").getall()
                    bold_format = i.xpath("./b/text()").getall()
                    if strong_format:
                        # Section heading rendered as <strong>: use it as the key.
                        abstract[strong_format[0].strip().strip(':').strip()] = ' '.join(i.xpath("./text()").getall()).strip()
                    elif bold_format:
                        # Section headings rendered as <b>: slice the paragraph
                        # text between consecutive headings.
                        headings = response.xpath("//div[@class='abstract'][@id='abstract']/div[@class='abstract-content selected']/p/b/text()").getall()
                        if headings:
                            preface = response.xpath('normalize-space(substring-before(string(//div[@class="abstract"][@id="abstract"]/div[@class="abstract-content selected"]/p),//b[.="{}"]))'.format(headings[0])).getall()
                            if preface:
                                abstract['Free-Text'] = preface[0]
                            for num, header in enumerate(headings):
                                if num != len(headings) - 1:
                                    abstract[header] = response.xpath('normalize-space(substring-before(substring-after(string(//div[@class="abstract"][@id="abstract"]/div[@class="abstract-content selected"]/p),//b[.="{}"]),//b[.="{}"]))'.format(headings[num], headings[num + 1])).getall()[0]
                                else:
                                    abstract[header] = response.xpath('normalize-space(substring-after(string(//div[@class="abstract"][@id="abstract"]/div[@class="abstract-content selected"]/p),//b[.="{}"]))'.format(headings[num])).getall()[0]
                    else:
                        abstract['Free-Text'].append(' '.join(i.xpath(".//text()").getall()).strip())
            if response.xpath("//div[@class='abstract'][@id='abstract']/p/strong[contains(text(), 'Keywords:')]"):
                abstract['Keywords'] = ' '.join(response.xpath("//div[@class='abstract'][@id='abstract']/p/text()").getall()).strip()
            w['abstract'] = abstract
            # Dump the scraped fields to a text file named after the PMID.
            path = os.path.join(self.file_storage_location, 'PMCID_' + w['pmcid'] + '.txt')
            with open(path, 'w') as e:
                for p in w.items():
                    e.write("%s:%s\n\n" % p)
            # If a PMC full-text link exists, follow it to look for a PDF.
            if 'PMC' in response.xpath(".//div[@class='full-text-links-list']/a/@data-ga-action").getall():
                pdf_url = response.xpath(".//div[@class='full-text-links-list']/a[@data-ga-action='PMC']/@href").getall()[0]
                yield Request(
                    url=response.urljoin(pdf_url),
                    callback=self.link,
                    meta={'hero_item': w['pmcid']}
                )
            yield w
        except Exception as message:
            raise CloseSpider(message)
    def link(self, response):
        try:
            # Locate the PDF link on the PMC full-text page.
            pdf_hrefs = response.xpath('.//div[@class="format-menu"]/ul/li/a[contains(text(), "PDF")]/@href').getall()
            if pdf_hrefs:
                item = response.meta.get('hero_item')
                yield Request(
                    url=response.urljoin(pdf_hrefs[0]),
                    callback=self.save_pdf,
                    meta={'hero_item': item}
                )
        except Exception:
            # A missing PDF should not stop the crawl.
            pass
    def save_pdf(self, response):
        try:
            item = response.meta.get('hero_item')
            path = os.path.join(self.path, 'PMCID_' + item + '.pdf')
            self.logger.info('Saving PDF %s', path)
            with open(path, 'wb') as f:
                f.write(response.body)
        except Exception:
            pass
As the code above shows, all the fields for one search result are extracted in parse_article, but one piece of information, whether pdf_downloaded should be set to 'Yes', only becomes known inside save_pdf, which is a later callback in the chain.
So the data for a single result now lives in two callbacks. How can I merge it into one item before it is yielded as scraped output? Any help is appreciated!!
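
A common Scrapy pattern for this is to carry the partially built item along with each Request (via meta) and yield it exactly once, from the last callback that touches it; when no follow-up request is needed, yield it immediately. Below is a minimal sketch of that pattern, not the spider above: the spider name, URL, XPath, and field names are placeholders.

import scrapy

class MergeAcrossCallbacksSpider(scrapy.Spider):
    # Hypothetical spider illustrating the meta-passing pattern.
    name = 'merge_sketch'
    start_urls = ['https://example.com/']  # placeholder

    def parse(self, response):
        w = {'pmcid': 'unknown', 'pdf_downloaded': 'No'}  # placeholder fields
        pdf_url = response.xpath("//a[contains(text(), 'PDF')]/@href").get()
        if pdf_url:
            # Defer the yield: the item travels with the request so the
            # next callback can finish it before it is scraped.
            yield scrapy.Request(
                url=response.urljoin(pdf_url),
                callback=self.save_pdf,
                meta={'item': w},
                errback=self.errback_item,  # optional: keep the item on failure
            )
        else:
            yield w  # nothing left to fetch, the item is complete here

    def save_pdf(self, response):
        item = response.meta['item']
        # ... write response.body to disk here ...
        item['pdf_downloaded'] = 'Yes'
        yield item  # the single, fully merged item

    def errback_item(self, failure):
        # If the PDF request fails, still emit the item with its default 'No'.
        yield failure.request.meta['item']

Applied to the spider above, that would mean passing the whole w dict (not just w['pmcid']) through meta from parse_article to link to save_pdf, setting w['pdf_downloaded'] = 'Yes' in save_pdf, and moving yield w out of parse_article to the end of the chain, with an else branch that yields it when no PMC link exists. On recent Scrapy versions, cb_kwargs is the preferred way to pass such data between callbacks instead of meta.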