При выполнении данные паука извлекаются со страницы, но когда начинается конвейер, что-то идет не так ... Я получаю следующую ошибку:
Traceback (последний вызов был последним): Файл "C: \ Users \ EAgnelli \ AppData \ Local \ Continuum \ anaconda3 \ envs \ensorflow \ lib \ site-packages \ twisted \ internet \ defer.py ", строка 654, в _runCallbacks current.result = callback (current.result, * args, ** kw) TypeError: close_spider () отсутствует 1 обязательный позиционный аргумент: 'причина'
Я отправляю запрос через Scrapy Splash для выполнения некоторого Java на странице, а затем извлекаю информацию о ссылках... однако, я впервые получаю эту ошибку.
Это мой паук
import scrapy
import scrapy_splash
from scrapy.linkextractors import LinkExtractor
from cointelegraph_spider.items import CointelegraphSpiderItem
import sqlite3 as sq3
class CointelegraphspiderSpider(scrapy.Spider):
name = 'cointelegraphspider'
allowed_domains = ['cointelegraph.com']
start_urls = ['http://cointelegraph.com/']
def start_requests(self):
"""
Doc string
"""
# Execute the LUA script for "Load Mor" button
script = """
function main(splash, args)
assert(splash:go(args.url))
splash:wait(0.5)
local num_clicks = 2
local delay = 1.5
local load_more = splash:jsfunc(
[[
function ()
{
var el = document.getElementsByClassName('post-preview-list-navigation__btn post-preview-list-navigation__btn_load-more');
el[0].click();
}
]]
)
for _ = 1, num_clicks do
load_more()
splash:wait(delay)
end
return
{
html = splash:html(),
}
end
"""
for url in self.start_urls:
yield scrapy_splash.SplashRequest(
url=url,
callback=self.parse_main_page,
args={
'wait':3,
'lua_source':script,
#'timeout': 3600 # Here the max-timeout is 60 -- to increase it launch the docker with --max-timeout xxxxx
},
endpoint="execute",
)
def parse_main_page(self, response):
"""
Doc string
"""
# Convert Splash response into html response object
html = scrapy.Selector(response)
# Check DB for existing records
conn = sq3.connect("D:\\DCC\\Projects\\crypto_projects\\master_data.db")
db_links = conn.execute("select link from cointelegraph").fetchall() # list of tuples
db_links = [elem[0] for elem in db_links] # flattening list
print("DB LINKS! ", db_links)
#db_links = ["aaa",]
conn.close() # close connection
# Extract all links to be followed
news_links = LinkExtractor(restrict_xpaths=['//ul[@class="post-preview-list-cards"]/li/div/article/a', # Main Body
'//div[@class="main-news-tabs__wrp"]/ul/li/div/a'] # "Editor's Choice" & "Hot Stories"
).extract_links(html.response)
for link in news_links[:2]:
# Follow only new links
if link.url not in db_links:
yield scrapy.Request(link.url, callback=self.parse_article)
def parse_article(self, response):
"""
Doc string
"""
# Create Item for Pipeline
item = CointelegraphSpiderItem()
item['author'] = response.xpath('//div[@class="name"]/a/text()').extract_first().strip()
item['timestamp'] = response.xpath('//div/@datetime').extract_first().split('t')[0] # %Y-%m-%d
item['title'] = response.xpath('//h1[@class="header"]/text()').extract_first().strip()
item['body'] = ' '.join(response.xpath('//div[@class="post-full-text contents js-post-full-text"]/p//text()').extract())
item['quotes'] = ';;;'.join(response.xpath('//div[@class="post-full-text contents js-post-full-text"]/blockquote//text()').extract())
item['int_links'] = ';;;'.join(response.xpath('//div[@class="post-full-text contents js-post-full-text"]/p/a/@href').extract())
_tmp = [elem.replace('#','') for elem in response.xpath('//div[@class="tags"]/ul/li/a/text()').extract()]
item['tags'] = ';;;'.join([elem.replace(' ','') for elem in _tmp])
item['link'] = response.url
item['news_id'] = str(hash(item['link']))
yield item
Это мой конвейер
import sqlite3 as sq3
import sqlite3_functions as sq_f
import logging
from scrapy.exceptions import DropItem
class CointelegraphSpiderPipeline(object):
"""
Doc string
"""
def __init__(self, stats):
"""
Doc string
"""
self.stats = stats
self.db_file = 'D:\\DCC\\Projects\\crypto_projects\\master_data.db'
self.conn = sq3.connect(self.db_file)
self.table_name = 'cointelegraph'
self.commit_counter = 0
@classmethod
def from_crawler(cls, crawler):
"""
Doc string
"""
stats = crawler.stats
return stats #cls(crawler.stats)
def open_spider(self, spider):
"""
Doc string
"""
print("I'm starting the pipeline")
logging.INFO("Starting Pipeline...")
def process_item(self, item, spider):
"""
Doc string
"""
item_checked = True
try:
# Sanity Check
for key, value in item.items():
print("Inside the loop!!!")
if value == '':
item_checked = False
raise DropItem("Item '{0}:{1}' has empty data - Link: {3}".format(key, value, item['link']))
else:
logging.INFO("Item check OK")
item_checked = True
# Insert row and increase counter
if item_checked:
self.conn = sq_f.insert_row(self.db_file, table_name=self.table_name, conn=self.conn, **item)
self.commit_counter += 1
self.conn.commit()
# Commit every 500 inserted rows
if self.commit_counter % 500 == 0:
self.conn.commit()
print(item)
except Exception as e:
logging.WARNING(e)
def close_spider(self, spider):
"""
Doc string
"""
logging.INFO("Commiting rows...")
self.conn.commit()
logging.INFO("Saving spider stats...")
print(self.stats.get_stats())
logging.INFO("Closing pipeline..")
self.conn.close()
и мои настройки:
BOT_NAME = 'cointelegraph_spider'
SPIDER_MODULES = ['cointelegraph_spider.spiders']
NEWSPIDER_MODULE = 'cointelegraph_spider.spiders'
# Splash Settings
SPLASH_URL = 'http://localhost:8050'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3699.0 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
#'cointelegraph_spider.middlewares.CointelegraphSpiderSpiderMiddleware': 543,
}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
#'cointelegraph_spider.middlewares.CointelegraphSpiderDownloaderMiddleware': 543,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'cointelegraph_spider.pipelines.CointelegraphSpiderPipeline': 300,
}