I'm trying to use the scrapy-puppeteer library in my project (https://pypi.org/project/scrapy-puppeteer/).
I'm wiring up PuppeteerMiddleware as described in the library's documentation.
Here is the code I run:
import asyncio
from twisted.internet import asyncioreactor
asyncioreactor.install(asyncio.get_event_loop())

import scrapy
from scrapy.crawler import CrawlerRunner
from twisted.internet import defer
from twisted.trial.unittest import TestCase

import scrapy_puppeteer


class ScrapyPuppeteerTestCase(TestCase):
    """Test case for the ``scrapy-puppeteer`` package"""

    class PuppeteerSpider(scrapy.Spider):
        name = 'puppeteer_crawl_spider'
        allowed_domains = ['codesandbox.io']
        custom_settings = {
            'DOWNLOADER_MIDDLEWARES': {
                'scrapy_puppeteer.PuppeteerMiddleware': 800
            }
        }
        items = []

        def start_requests(self):
            yield scrapy_puppeteer.PuppeteerRequest(
                'https://codesandbox.io/search?page=1',
                wait_until='networkidle2',
            )

        def parse(self, response):
            for selector_item in response.selector.xpath(
                    '//li[@class="ais-Hits-item"]'):
                self.items.append(selector_item.xpath('.//h2').extract_first())

    def setUp(self):
        """Store the Scrapy runner to use in the tests"""
        self.runner = CrawlerRunner()

    @defer.inlineCallbacks
    def test_items_number(self):
        crawler = self.runner.create_crawler(self.PuppeteerSpider)
        yield crawler.crawl()
        self.assertEqual(len(crawler.spider.items), 12)
When I run it, I get the following error:
2020-04-21 14:02:13 [scrapy.utils.log] INFO: Scrapy 1.5.1 started
(bot: test)
2020-04-21 14:02:13 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.18.0, Twisted 20.3.0, Python 3.6.9 (default, Nov 23 2019, 06:41:34) - [GCC 8.3.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1f 31 Mar 2020), cryptography 2.9, Platform Linux-4.15.0-96-generic-x86_64-with-debian-10.2
2020-04-21 14:02:13 [scrapy.crawler] INFO: Overridden settings: {'AUTOTHROTTLE_ENABLED': True, 'AUTOTHROTTLE_MAX_DELAY': 90, 'AUTOTHROTTLE_START_DELAY': 1, 'BOT_NAME': 'test', 'CONCURRENT_REQUESTS': 2, 'CONCURRENT_REQUESTS_PER_DOMAIN': 1, 'DOWNLOAD_DELAY': 0.25, 'DOWNLOAD_MAXSIZE': 36700160, 'DOWNLOAD_TIMEOUT': 90, 'DUPEFILTER_CLASS': 'test.dupefilter.RedisDupeFilter', 'LOG_LEVEL': 'INFO', 'NEWSPIDER_MODULE': 'test.spiders', 'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 403, 429], 'RETRY_TIMES': 4, 'ROBOTSTXT_OBEY': True, 'SCHEDULER': 'test.scheduler.Scheduler', 'SPIDER_MODULES': ['test.spiders']}
2020-04-21 14:02:13 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.logstats.LogStats',
'scrapy.extensions.throttle.AutoThrottle']
[W:pyppeteer.chromium_downloader] start chromium download.
Download may take a few minutes.
100%|██████████| 106826418/106826418 [00:21<00:00, 4914607.73it/s]
[W:pyppeteer.chromium_downloader]
chromium download done.
[W:pyppeteer.chromium_downloader] chromium extracted to: /root/.local/share/pyppeteer/local-chromium/575458
[I:pyppeteer.launcher] terminate chrome process...
Unhandled error in Deferred:
2020-04-21 14:02:39 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/scrapy/crawler.py", line 171, in crawl
    return self._crawl(crawler, *args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/scrapy/crawler.py", line 175, in _crawl
    d = crawler.crawl(*args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 1613, in unwindGenerator
    return _cancellableInlineCallbacks(gen)
  File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 1529, in _cancellableInlineCallbacks
    _inlineCallbacks(None, g, status)
--- <exception caught here> ---
  File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
    result = g.send(result)
  File "/usr/local/lib/python3.6/site-packages/scrapy/crawler.py", line 80, in crawl
    self.engine = self._create_engine()
  File "/usr/local/lib/python3.6/site-packages/scrapy/crawler.py", line 105, in _create_engine
    return ExecutionEngine(self, lambda _: self.stop())
  File "/usr/local/lib/python3.6/site-packages/scrapy/core/engine.py", line 69, in __init__
    self.downloader = downloader_cls(crawler)
  File "/usr/local/lib/python3.6/site-packages/scrapy/core/downloader/__init__.py", line 88, in __init__
    self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
  File "/usr/local/lib/python3.6/site-packages/scrapy/middleware.py", line 58, in from_crawler
    return cls.from_settings(crawler.settings, crawler)
  File "/usr/local/lib/python3.6/site-packages/scrapy/middleware.py", line 36, in from_settings
    mw = mwcls.from_crawler(crawler)
  File "/usr/local/lib/python3.6/site-packages/scrapy_puppeteer/middlewares.py", line 38, in from_crawler
    asyncio.ensure_future(cls._from_crawler(crawler))
  File "/usr/local/lib/python3.6/asyncio/base_events.py", line 484, in run_until_complete
    return future.result()
  File "/usr/local/lib/python3.6/site-packages/scrapy_puppeteer/middlewares.py", line 27, in _from_crawler
    middleware.browser = await launch({'logLevel': crawler.settings.get('LOG_LEVEL')})
  File "/usr/local/lib/python3.6/site-packages/pyppeteer/launcher.py", line 311, in launch
    return await Launcher(options, **kwargs).launch()
  File "/usr/local/lib/python3.6/site-packages/pyppeteer/launcher.py", line 189, in launch
    self.browserWSEndpoint = self._get_ws_endpoint()
  File "/usr/local/lib/python3.6/site-packages/pyppeteer/launcher.py", line 233, in _get_ws_endpoint
    self.proc.stdout.read().decode()
pyppeteer.errors.BrowserError: Browser closed unexpectedly:
[0421/140239.027694:ERROR:zygote_host_impl_linux.cc(89)] Running as root without --no-sandbox is not supported. See https://crbug.com/638180.
2020-04-21 14:02:39 [twisted] CRITICAL:
[... the same traceback and BrowserError are printed a second time ...]
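The last line of the traceback says Chromium refuses to start as root unless --no-sandbox is passed, and scrapy_puppeteer's middleware launches the browser without any Chromium arguments (middlewares.py line 27 above). As a standalone sanity check, outside Scrapy, I can pass the flag through pyppeteer's documented args option. This is only a minimal sketch to confirm the browser can start in my container at all, not something PuppeteerMiddleware exposes as far as I can tell:

import asyncio

from pyppeteer import launch


async def check_browser_launch():
    # Forward --no-sandbox to Chromium: required when the browser
    # runs as root inside a container (see https://crbug.com/638180)
    browser = await launch(args=['--no-sandbox'])
    page = await browser.newPage()
    await page.goto('https://codesandbox.io/search?page=1')
    print(await page.title())
    await browser.close()


asyncio.get_event_loop().run_until_complete(check_browser_launch())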
I run my application from Docker as a separate service, so everything inside the container runs as root (note the /root/.local/share/pyppeteer path in the log above).
This is my Dockerfile:
FROM python:3.6
ADD . /code/test
WORKDIR /code/test
RUN pip3 install -r requirements.txt
RUN apt-get update && apt-get install -y nmap
RUN apt-get install -y gconf-service libasound2 libatk1.0-0 libc6 libcairo2 \
    libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 \
    libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 \
    libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 \
    libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 \
    libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 \
    lsb-release xdg-utils wget
And here is the test runner output: https://imgur.com/ZrAGeJD
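The only workaround I can think of on the Docker side is to stop running as root entirely, so the sandbox restriction no longer applies. A sketch of the extra Dockerfile lines I would try (pptruser is a placeholder name of my own):

# Create an unprivileged user and hand it the app directory,
# so Chromium is no longer started as root
RUN useradd -m pptruser && chown -R pptruser:pptruser /code/test
USER pptruser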
How can I fix this?