ошибка scrapy: ошибка чтения файла '': не удалось загрузить внешний объект "" - PullRequest
0 голосов
/ 28 июня 2018

Я сейчас пишу скребок со скрапом. Для некоторых сайтов это работает просто отлично, но для других я получаю ошибку

Ошибка чтения файла '': не удалось загрузить внешний объект ""

Вот код, который я написал для моего скребка, не вините меня, но я все еще новичок в python.

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
#from bs4 import BeautifulSoup
import lxml
from lxml.html.clean import Cleaner
#from scrapy.exporters import XmlItemExporter
import re

cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True
cleaner.remove_tags = ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'figure', 'small', 'blockquote', 'sub', 'em', 'hr', '!--..--', 'span', 'aside', 'a', 'svg', 'ul', 'li', 'img', 'source', 'nav', 'article', 'section', 'label', 'br', 'noscript', 'body', 'time', 'b', 'i', 'sup', 'strong', 'div']
cleaner.kill_tags = ['header', 'footer']

class MySpider(CrawlSpider):
    name = 'eship5'
    allowed_domains = [
    'ineratec.de',
    ]

    start_urls = [
    'http://ineratec.de/',
    ]

    rules = [Rule(LinkExtractor(), callback='parse_item', follow=True)] # Follow any link scrapy finds (that is allowed).


    def parse_item(self, response):
        page = response.url.replace("/"," ").replace(":"," ")
        filename = '%s.txt' %page
        body = response.url
        clean_text = lxml.html.tostring(cleaner.clean_html(lxml.html.parse(body)))
        #clean_text = re.sub( '\s+', ' ', str(clean_text, "utf-8").replace('<div>', '').replace('</div>', '')).strip()
        with open(filename, 'w') as f:
            f.write(clean_text)

Когда я запускаю код с scrapy, ошибка возникает только на определенных сайтах. Имеет ли это какое-либо отношение к «» и «»? Благодарю за любую помощь.

EDIT1: Это вся ошибка:

2018-06-28 14:01:18 [scrapy.core.scraper] ERROR: Spider error processing <GET https://smight.com/en/> (referer: https://smight.com/)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
    yield next(it)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/spidermiddlewares/offsite.py", line 30, in process_spider_output
    for x in result:
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/spidermiddlewares/referer.py", line 339, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/spiders/crawl.py", line 76, in _parse_response
    cb_res = callback(response, **cb_kwargs) or ()
  File "/Users/gnus/Desktop/scraper/scraper/spiders/scraper.py", line 33, in parse_item
    clean_text = lxml.html.tostring(cleaner.clean_html(lxml.html.parse(body)))
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/lxml/html/__init__.py", line 940, in parse
    return etree.parse(filename_or_url, parser, base_url=base_url, **kw)
  File "src/lxml/etree.pyx", line 3426, in lxml.etree.parse
  File "src/lxml/parser.pxi", line 1839, in lxml.etree._parseDocument
  File "src/lxml/parser.pxi", line 1865, in lxml.etree._parseDocumentFromURL
  File "src/lxml/parser.pxi", line 1769, in lxml.etree._parseDocFromFile
  File "src/lxml/parser.pxi", line 1162, in lxml.etree._BaseParser._parseDocFromFile
  File "src/lxml/parser.pxi", line 600, in lxml.etree._ParserContext._handleParseResultDoc
  File "src/lxml/parser.pxi", line 710, in lxml.etree._handleParseResult
  File "src/lxml/parser.pxi", line 637, in lxml.etree._raiseParseError
OSError: Error reading file 'https://smight.com/en/': failed to load external entity "https://smight.com/en/"
...