I am currently writing a scraper with Scrapy. For some sites it works just fine, but for others I get the error
Ошибка чтения файла '': не удалось загрузить внешний объект ""
Here is the code I wrote for my scraper. Don't blame me, I'm still new to Python.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
#from bs4 import BeautifulSoup
import lxml
from lxml.html.clean import Cleaner
#from scrapy.exporters import XmlItemExporter
import re
cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True
cleaner.remove_tags = ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'figure', 'small', 'blockquote', 'sub', 'em', 'hr', '!--..--', 'span', 'aside', 'a', 'svg', 'ul', 'li', 'img', 'source', 'nav', 'article', 'section', 'label', 'br', 'noscript', 'body', 'time', 'b', 'i', 'sup', 'strong', 'div']
cleaner.kill_tags = ['header', 'footer']
class MySpider(CrawlSpider):
    name = 'eship5'
    allowed_domains = [
        'ineratec.de',
    ]
    start_urls = [
        'http://ineratec.de/',
    ]

    rules = [Rule(LinkExtractor(), callback='parse_item', follow=True)]  # Follow any link scrapy finds (that is allowed).

    def parse_item(self, response):
        page = response.url.replace("/", " ").replace(":", " ")
        filename = '%s.txt' % page
        body = response.url
        clean_text = lxml.html.tostring(cleaner.clean_html(lxml.html.parse(body)))
        #clean_text = re.sub('\s+', ' ', str(clean_text, "utf-8").replace('<div>', '').replace('</div>', '')).strip()
        with open(filename, 'w') as f:
            f.write(clean_text)
When I run the code with Scrapy, the error only occurs on certain sites. Does it have anything to do with the '' and "" in the error message? Thanks for any help.
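For reference, as far as I can tell lxml.html.parse() treats a string argument as a filename or URL and asks libxml2 to download it itself, ignoring the response Scrapy already fetched. Since body = response.url, I think the failing call boils down to this standalone snippet (using the https URL from the full error below):

import lxml.html

# lxml.html.parse() accepts a filename, file object, or URL.
# A URL string makes libxml2 fetch the page itself, and libxml2
# apparently cannot download https:// URLs, which would explain
# why only some of the sites fail.
doc = lxml.html.parse('https://smight.com/en/')
# OSError: Error reading file 'https://smight.com/en/':
# failed to load external entity "https://smight.com/en/"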
EDIT1:
This is the full error:
2018-06-28 14:01:18 [scrapy.core.scraper] ERROR: Spider error processing <GET https://smight.com/en/> (referer: https://smight.com/)
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/spidermiddlewares/offsite.py", line 30, in process_spider_output
for x in result:
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/spidermiddlewares/referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/spiders/crawl.py", line 76, in _parse_response
cb_res = callback(response, **cb_kwargs) or ()
File "/Users/gnus/Desktop/scraper/scraper/spiders/scraper.py", line 33, in parse_item
clean_text = lxml.html.tostring(cleaner.clean_html(lxml.html.parse(body)))
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/lxml/html/__init__.py", line 940, in parse
return etree.parse(filename_or_url, parser, base_url=base_url, **kw)
File "src/lxml/etree.pyx", line 3426, in lxml.etree.parse
File "src/lxml/parser.pxi", line 1839, in lxml.etree._parseDocument
File "src/lxml/parser.pxi", line 1865, in lxml.etree._parseDocumentFromURL
File "src/lxml/parser.pxi", line 1769, in lxml.etree._parseDocFromFile
File "src/lxml/parser.pxi", line 1162, in lxml.etree._BaseParser._parseDocFromFile
File "src/lxml/parser.pxi", line 600, in lxml.etree._ParserContext._handleParseResultDoc
File "src/lxml/parser.pxi", line 710, in lxml.etree._handleParseResult
File "src/lxml/parser.pxi", line 637, in lxml.etree._raiseParseError
OSError: Error reading file 'https://smight.com/en/': failed to load external entity "https://smight.com/en/"
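Assuming that is what is going on, a variant of parse_item that parses the body Scrapy already downloaded, instead of handing the URL back to lxml, might look like this (just a sketch, not tested):

    def parse_item(self, response):
        page = response.url.replace("/", " ").replace(":", " ")
        filename = '%s.txt' % page
        # Parse the HTML Scrapy already downloaded rather than passing
        # the URL string to lxml, which would try to fetch it again.
        root = lxml.html.fromstring(response.body)
        clean_text = lxml.html.tostring(cleaner.clean_html(root))
        # tostring() returns bytes, so write the file in binary mode.
        with open(filename, 'wb') as f:
            f.write(clean_text)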