Во-первых, вы не делаете никаких запросов с помощью скрапа, также вы комбинируете scrapy
с requests
, что, я думаю, не лучшая идея.Попробуйте изменить __init__
на:
def start_requests(self):
l = []
url = "http://www.example.com"
l.append(url + '/sitemap.xml')
l.append(url + '/robots.txt')
for link in l:
yield Request(link, callback=self._parse_sitemap)
Кроме того, ваш self._parse_sitemap
СЛЕДУЕТ вернуть dict-like
или Request
(не только ваш self._parse_sitemap
, каждая функция в вашемпаук, см. документы ):
def _parse_sitemap(self, response):
# handle here status responses(200,401,etc)
body = self._get_sitemap_body(response)
if body is None:
self.logger.info('Ignoring invalid sitemap: %s', response.url)
return
s = Sitemap(body)
sites = {} # You should return a dict-like item!
if s.type == 'sitemapindex':
for loc in iterloc(s, self.sitemap_alternate_links):
if any(x.search(loc) for x in self._follow):
yield Request(loc, callback=self._parse_sitemap)
elif s.type == 'urlset':
for loc in iterloc(s):
for r, c in self._cbs:
if r.search(loc):
sites.append(loc)
break
yield sites # Change print to yield!, this is the way to populate your .csv file
Весь файл (возможно, не работает, но объясняет идею):
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
import re
import requests
class GetpagesfromsitemapSpider(SitemapSpider):
name = "myspider"
handle_httpstatus_list = [404]
def parse(self, response):
print(response.url)
def _parse_sitemap(self, response):
# handle here status responses(200,401,etc)
body = self._get_sitemap_body(response)
if body is None:
self.logger.info('Ignoring invalid sitemap: %s', response.url)
return
s = Sitemap(body)
sites = {} # You should return a dict-like item!
if s.type == 'sitemapindex':
for loc in iterloc(s, self.sitemap_alternate_links):
if any(x.search(loc) for x in self._follow):
yield Request(loc, callback=self._parse_sitemap)
elif s.type == 'urlset':
for loc in iterloc(s):
for r, c in self._cbs:
if r.search(loc):
sites.append(loc)
break
yield sites # Change print to yield!, this is the way to populate your .csv file
def start_requests(self):
l = []
url = "http://www.example.com"
l.append(url + '/sitemap.xml')
l.append(url + '/robots.txt')
for link in l:
yield Request(link, callback=self._parse_sitemap)
def iterloc(it, alt=False):
for d in it:
yield d['loc']
# Also consider alternate URLs (xhtml:link rel="alternate")
if alt and 'alternate' in d:
for l in d['alternate']:
yield l