Try this. It should help you save the PDFs.
#!/usr/bin/python
#coding=utf-8
import io, os
from simplified_scrapy.core.utils import md5
from simplified_scrapy.spider import Spider, SimplifiedDoc
class PdfSpider(Spider):
    name = 'quimicanova.sbq.org.br'

    def __init__(self):
        if not os.path.exists('pdf/'):
            os.mkdir('pdf/')
        host = 'http://www.quimicanova.sbq.org.br/default.asp?ed='
        self.start_urls = [host + str(i) for i in range(1, 285)]  # Initialize start_urls with the issue pages
        Spider.__init__(self, self.name)  # necessary

    def afterResponse(self, response, url):
        try:
            # Save PDF responses to disk; pass everything else back to the framework
            if response.code == 200 and url.find('audiencia_pdf.asp?aid2') > 0:
                name = 'pdf/' + md5(url) + '.pdf'
                with io.open(name, 'ab') as file:
                    file.write(response.read())
                return None
            else:
                return Spider.afterResponse(self, response, url)
        except Exception as err:
            print(err)

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        lstA = doc.listA(url=url['url'])  # Collect the links on the page
        urls = []
        for u in lstA:
            if u['url'].find('audiencia_pdf.asp?aid2') > 0:  # Keep only the PDF links
                urls.append(u)
        return {"Urls": urls, "Data": urls}  # Return data to the framework

from simplified_scrapy.simplified_main import SimplifiedMain
SimplifiedMain.startThread(PdfSpider())  # Start crawling
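
How it fits together: extract() runs on each crawled issue page, keeps the links containing audiencia_pdf.asp?aid2 and returns them as Urls so the framework queues them, while afterResponse() intercepts those downloads and writes them to pdf/<md5-of-url>.pdf instead of parsing them as HTML. Once the crawl finishes you can verify the results with something like the small check below (the pdf/ directory and the md5-based file names come from the spider above):

import os

# List the downloaded PDFs and their sizes; the files are created by PdfSpider above.
for f in sorted(os.listdir('pdf/')):
    path = os.path.join('pdf', f)
    print(f, os.path.getsize(path), 'bytes')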