Я пытаюсь запустить скрапа-паука по сценарию.Я следую документации по скрапингу и выполняю таким образом my_script.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
process = CrawlerProcess(get_project_settings())
process.crawl('followall', domain='scrapinghub.com')
process.start() # the script will block here until the crawling is finished
process.stop()
И я должен манипулировать следующим пауком таким образом для целей тестирования. followall.py
import re
from six.moves.urllib.parse import urlparse
import scrapy
from scrapy.http import Request, HtmlResponse
from scrapy.linkextractors import LinkExtractor
from testspiders.items import Page
class FollowAllSpider(scrapy.Spider):
name = 'followall'
def __init__(self, **kw):
super(FollowAllSpider, self).__init__(**kw)
url = kw.get('url') or kw.get('domain') or 'http://scrapinghub.com/'
if not url.startswith('http://') and not url.startswith('https://'):
url = 'http://%s/' % url
self.url = url
self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
self.link_extractor = LinkExtractor()
self.cookies_seen = set()
def start_requests(self):
return [Request(self.url, callback=self.parse, dont_filter=True)]
def parse(self, response):
print ('\n\n===============This is followall ==================\n\n')
, когда я запускаю сценарий my_script.py , он пытается запустить followall spider два раза и завершитьсяследующее исключение
C:\Users\admin\AppData\Local\Programs\Python\Python36-32\python.exe C:/Users/admin/PycharmProjects/Denave_Python_Projects/testspiders-master/testspiders/fisttest.py
2018-04-24 18:51:18 [scrapy.utils.log] **INFO: Scrapy 1.5.0 started (bot: testspiders)**
2018-04-24 18:51:18 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.5, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 17.9.0, Python 3.6.5 (v3.6.5:f59c0932b4, Mar 28 2018, 16:07:46) [MSC v.1900 32 bit (Intel)], pyOpenSSL 17.5.0 (OpenSSL 1.1.0h 27 Mar 2018), cryptography 2.2.2, Platform Windows-10-10.0.15063-SP0
2018-04-24 18:51:18 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'testspiders', 'CLOSESPIDER_PAGECOUNT': 1000, 'CLOSESPIDER_TIMEOUT': 3600, 'COOKIES_ENABLED': False, 'NEWSPIDER_MODULE': 'testspiders.spiders', 'RETRY_ENABLED': False, 'SPIDER_MODULES': ['testspiders.spiders']}
2018-04-24 18:51:18 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.closespider.CloseSpider',
'scrapy.extensions.logstats.LogStats']
2018-04-24 18:51:18 [scrapy.middleware] INFO: Enabled downloader middlewares:
['testspiders.middleware.RandomUserAgent',
'testspiders.middleware.ErrorMonkeyMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2018-04-24 18:51:18 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2018-04-24 18:51:18 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2018-04-24 18:51:18 [scrapy.core.engine] INFO: Spider opened
2018-04-24 18:51:18 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-04-24 18:51:18 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2018-04-24 18:51:18 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://scrapinghub.com/> from <GET http://scrapinghub.com/>
2018-04-24 18:51:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://scrapinghub.com/> (referer: None)
2018-04-24 18:51:19 [scrapy.core.engine] INFO: Closing spider (finished)
2018-04-24 18:51:19 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
**===============This is followall ==================**
{'downloader/request_bytes': 624,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 13326,
'downloader/response_count': 2,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/301': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2018, 4, 24, 13, 21, 19, 730915),
'log_count/DEBUG': 3,
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2018, 4, 24, 13, 21, 18, 498736)}
2018-04-24 18:51:19 [scrapy.core.engine] **INFO: Spider closed (finished)**
2018-04-24 18:51:19 [scrapy.utils.log] **INFO: Scrapy 1.5.0 started (bot: testspiders)**
2018-04-24 18:51:19 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.5, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 17.9.0, Python 3.6.5 (v3.6.5:f59c0932b4, Mar 28 2018, 16:07:46) [MSC v.1900 32 bit (Intel)], pyOpenSSL 17.5.0 (OpenSSL 1.1.0h 27 Mar 2018), cryptography 2.2.2, Platform Windows-10-10.0.15063-SP0
2018-04-24 18:51:19 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'testspiders', 'CLOSESPIDER_PAGECOUNT': 1000, 'CLOSESPIDER_TIMEOUT': 3600, 'COOKIES_ENABLED': False, 'NEWSPIDER_MODULE': 'testspiders.spiders', 'RETRY_ENABLED': False, 'SPIDER_MODULES': ['testspiders.spiders']}
2018-04-24 18:51:19 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.closespider.CloseSpider',
'scrapy.extensions.logstats.LogStats']
2018-04-24 18:51:19 [scrapy.middleware] INFO: Enabled downloader middlewares:
['testspiders.middleware.RandomUserAgent',
'testspiders.middleware.ErrorMonkeyMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2018-04-24 18:51:19 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2018-04-24 18:51:19 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2018-04-24 18:51:19 [scrapy.core.engine] INFO: Spider opened
2018-04-24 18:51:19 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-04-24 18:51:19 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
Traceback (most recent call last):
File "C:/Users/admin/PycharmProjects/Denave_Python_Projects/testspiders-master/testspiders/fisttest.py", line 6, in <module>
process.start() # the script will block here until the crawling is finished
File "C:\Users\admin\AppData\Local\Programs\Python\Python36-32\lib\site-packages\scrapy\crawler.py", line 291, in start
reactor.run(installSignalHandlers=False) # blocking call
File "C:\Users\admin\AppData\Local\Programs\Python\Python36-32\lib\site-packages\twisted\internet\base.py", line 1242, in run
self.startRunning(installSignalHandlers=installSignalHandlers)
File "C:\Users\admin\AppData\Local\Programs\Python\Python36-32\lib\site-packages\twisted\internet\base.py", line 1222, in startRunning
ReactorBase.startRunning(self)
File "C:\Users\admin\AppData\Local\Programs\Python\Python36-32\lib\site-packages\twisted\internet\base.py", line 730, in startRunning
raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
Process finished with exit code 1
Вопросы:
1.Любое решение для этой ошибки twisted.internet.error.ReactorNotRestartable в этом случае
2.Любой другой способ запуска уже написанного паука из Python Script