Чтобы получить нужные мне данные с веб-сайта с использованием Scrapy, мне сначала нужно создать объект ответа, чтобы на нём можно было использовать HtmlXPathSelector. HtmlXPathSelector не принимает строку URL в качестве аргумента. В следующем примере кода переменная response2 пуста, потому что я не знаю, как это сделать.
try:
    from urllib.parse import urljoin
except ImportError:  # Python 2
    from urlparse import urljoin

from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

from tarantula.items import OlgaItem
class OlgaSpider(CrawlSpider):
    """Crawl physician search results and scrape each physician's page.

    The spider walks the paginated search results by following the ">"
    link.  For every physician listed on a results page it records the
    name and link, then requests the linked page and adds the clinics,
    addresses and phone numbers found there to the same item.
    """

    name = 'Olga'
    # NOTE(review): Scrapy normally reads settings from the project settings
    # module (or ``custom_settings`` on Scrapy >= 1.0); upper-case class
    # attributes like the ones below may have no effect -- confirm.
    DOWNLOAD_DELAY = 6  # to try to avoid being banned
    #ROBOTSTXT_OBEY = True
    #CONCURRENT_REQUESTS = 1
    FEED_URI = '/home/mercutio22/gitcode/MedicWebsites.csv'
    FEED_FORMAT = 'csv'
    USER_AGENT = "Googlebot/2.1 ( http://www.google.com/bot.html )"
    #allowed_domains = ['guiareunimedicos.med.br']
    start_urls = (
        'http://medial-saude.guiareunimedicos.med.br/index.pl?act=searc'
        'h&_id_=172&_ev_=Submit&_formSearchSubmit=%3Adefault%3A&type=0&country=0'
        '&q=oncologia#results/',
        # NOTE(review): the '#' characters below are part of the URL string
        # (the original used backslash continuations, so they were never
        # comments).  Everything after "_id_=17" is therefore a URL fragment.
        # This looks like a failed attempt to comment the URL out, or a typo
        # for "..._id_=172&...&q=cancerologia#results/" -- confirm and fix.
        # The value is kept exactly as it was.
        'http://www.guiareunimedicos.med.br/index.pl?act=search&_id_=17'
        '#2&_ev_=Submit&_formSearchSubmit=%3Adefault%3A&type=0&country=0&q=cancer'
        '#ologia#results/',
    )
    rules = (
        # Follow the "next page" (">") link and scrape every page reached.
        Rule(SgmlLinkExtractor(allow=r"V=", restrict_xpaths='//a[text()=">"]'),
             callback='parse_item', follow=True),
    )

    def parse_start_url(self, response):
        """Scrape the first results page too.

        CrawlSpider only invokes rule callbacks on pages reached through
        extracted links; responses for ``start_urls`` are handed to this
        hook instead, so without it the first page would be skipped.
        """
        return self.parse_item(response)

    def parse_item(self, response):
        """Yield one follow-up request per physician on a results page.

        A response object cannot be created synchronously from a URL inside
        a callback.  Instead, a ``Request`` for the physician's page is
        handed back to the engine, and the partially filled item travels
        with it in ``meta`` until ``parse_physician`` completes it.

        :param response: response for a search-results page.
        :returns: generator of ``Request`` objects, one per physician.
        """
        hxs = HtmlXPathSelector(response)
        mdata = hxs.select('//div[contains(@class, "mdata")]')
        links = mdata.select('./a/@href').extract()
        names = mdata.select('./a/text()').extract()
        # zip() pairs names with links without risking an IndexError when
        # the two lists differ in length.
        for physician_name, link in zip(names, links):
            item = OlgaItem()
            item['name'] = physician_name
            item['link'] = link
            # Request needs an absolute URL; urljoin leaves absolute links
            # untouched and resolves relative ones against the current page.
            yield Request(
                urljoin(response.url, link),
                callback=self.parse_physician,
                meta={'item': item},
            )

    def parse_physician(self, response):
        """Complete the item started in ``parse_item`` from the linked page.

        :param response: response for a physician's page; ``response.meta``
            carries the partially filled item under the ``'item'`` key.
        :returns: generator yielding the completed item.
        """
        item = response.meta['item']
        hxs = HtmlXPathSelector(response)
        item['clinics'] = hxs.select('//h2/a/text()').extract()
        blocks = hxs.select('//div[contains(@class, "stab data")]')
        item['addresses'] = [
            self._clean_address(block.select('./p/text()').extract())
            for block in blocks
        ]
        item['phones'] = hxs.select('//span[@id]/text()').extract()
        yield item

    @staticmethod
    def _clean_address(fragments):
        """Join the text fragments of one address block into a single string.

        The phone-label boilerplate is removed and the first two characters
        are dropped (presumably a leading line break -- TODO confirm).
        """
        text = ''.join(fragments)
        text = text.replace('Telefone(s): \r\n\r\n\r\n', '')
        return text[2:]