Вот мой файл, который запускает Spider с помощью APScheduler
"""Run NewsSpider daily at 17:45 via APScheduler on Scrapy's Twisted reactor."""
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.twisted import TwistedScheduler

from newsville_scrapy.spiders.news import NewsSpider

configure_logging()

crawler_process = CrawlerProcess(get_project_settings())

# TwistedScheduler shares Scrapy's reactor, so the cron job and the crawler
# run in the same event loop.
scheduler = TwistedScheduler()
scheduler.add_job(
    crawler_process.crawl,
    'cron',
    args=[NewsSpider],
    day_of_week='mon-sun',
    hour=17,
    minute=45,
)
scheduler.start()

# stop_after_crawl=False keeps the reactor alive after each crawl finishes,
# so the scheduler can fire the job again the next day.
crawler_process.start(False)
Spider работает успешно, но нет вывода. Вот лог
2020-07-13T17:45:00.353215+00:00 app[clock.1]: 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
2020-07-13T17:45:00.353216+00:00 app[clock.1]: 'scrapy.spidermiddlewares.referer.RefererMiddleware',
2020-07-13T17:45:00.353216+00:00 app[clock.1]: 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
2020-07-13T17:45:00.353216+00:00 app[clock.1]: 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-07-13T17:45:00.390941+00:00 app[clock.1]: 2020-07-13 17:45:00 [scrapy.middleware] INFO: Enabled item pipelines:
2020-07-13T17:45:00.390943+00:00 app[clock.1]: ['newsville_scrapy.pipelines.NewsvilleScrapyPipeline']
2020-07-13T17:45:00.391115+00:00 app[clock.1]: 2020-07-13 17:45:00 [scrapy.core.engine] INFO: Spider opened
2020-07-13T17:45:00.423097+00:00 app[clock.1]: 2020-07-13 17:45:00 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-07-13T17:45:00.424713+00:00 app[clock.1]: 2020-07-13 17:45:00 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-07-13T17:45:00.426194+00:00 app[clock.1]: 2020-07-13 17:45:00 [apscheduler.executors.default] INFO: Job "CrawlerRunner.crawl (trigger: cron[day_of_week='mon-sun', hour='17', minute='45'], next run at: 2020-07-14 17:45:00 UTC)" executed successfully
Вот мой паук. Всё это развернуто на Heroku: задание запускается в нужный момент, но паук ничего не собирает (не скрейпит данные). Нужна помощь с этой проблемой. Кроме того, я новичок в Scrapy, поэтому не могу понять, почему он не работает.
class NewsSpider(scrapy.Spider):
    """Scrape Zee News section pages in a fixed chain:
    latest -> country -> state -> cricket -> entertainment -> world.

    Each callback fills one key of a single NewsvilleScrapyItem and forwards
    the partially-built item to the next request via ``meta``.  The item is
    yielded only once, at the end of the chain (parse_world_news), so nothing
    reaches the pipeline unless every step of the chain succeeds.
    """

    name = "news"
    start_urls = ['https://zeenews.india.com/latest-news']

    def _scrape_articles(self, response, prefix):
        """Collect headline/content/image for every ``div.section-article``.

        Returns a defaultdict mapping 1-based position -> dict with keys
        ``<prefix>_headlines``, ``<prefix>_content``, ``<prefix>_image_urls``.
        """
        news = defaultdict(dict)
        for count, data in enumerate(response.css('div.section-article'), start=1):
            news[count][f'{prefix}_headlines'] = data.css('h3.margin-bt10px a::text').get()
            content = data.css('p::text').get()
            # .get() returns None when the article has no <p> text; calling
            # .strip() on None raised AttributeError and silently aborted the
            # whole callback (and with it the entire item chain).
            news[count][f'{prefix}_content'] = content.strip() if content else content
            news[count][f'{prefix}_image_urls'] = data.css('img.img-responsive').xpath('@src').get()
        return news

    def _scrape_blocks(self, response, prefix, lead_selector):
        """Collect category blocks from every ``div.block-height``, skipping 'Videos'.

        ``lead_selector`` is the CSS selector for the block's lead text
        (state and entertainment pages use slightly different markup).
        """
        news = defaultdict(dict)
        count = 1
        for data in response.css('div.block-height'):
            category = data.css('h2.head-two a::text').get()
            if category != 'Videos':
                news[count][f'{prefix}_name'] = category
                news[count][f'{prefix}_image_urls'] = data.css('div.h-nation-lead img').xpath('@src').get()
                lead_text = ' '.join(data.css(lead_selector).extract())
                news[count][f'{prefix}_lead_text'] = lead_text.strip()
                # NOTE(review): each div.one-liner overwrites the same key, so
                # only the last block's one-liners survive — preserved as-is;
                # confirm whether a list of all one-liners was intended.
                for lines in data.css('div.one-liner'):
                    news[count][f'{prefix}_one_liners'] = lines.css('h3 a').css('::text').extract()
                count += 1
        return news

    def parse(self, response):
        """Scrape the latest-news landing page, then follow to the country section."""
        country_path = response.xpath('/html/body/div[3]/header/div[3]/div/div[2]/div[2]/ul/li[4]/a/@href').get()
        country_url = response.urljoin(country_path)
        item = NewsvilleScrapyItem()
        item['latest_news'] = self._scrape_articles(response, 'latest')
        yield response.follow(country_url, callback=self.parse_country_news, meta={'item': item})

    def parse_country_news(self, response):
        """Scrape country (India) news, then follow to the state section."""
        item = response.meta['item']
        state_path = response.xpath('/html/body/div[3]/header/div[3]/div/div[2]/div[2]/ul/li[6]/a/@href').get()
        state_url = response.urljoin(state_path)
        item['country_news'] = self._scrape_articles(response, 'country')
        yield response.follow(state_url, callback=self.parse_state_news, meta={'item': item})

    def parse_state_news(self, response):
        """Scrape per-state news blocks, then follow to the cricket section."""
        item = response.meta['item']
        cricket_path = response.xpath('/html/body/div[3]/header/div[3]/div/div[2]/div[2]/ul/li[8]/a/@href').get()
        cricket_url = response.urljoin(cricket_path)
        item['state_news'] = self._scrape_blocks(response, 'state', 'h3.nation-lead-txt::text')
        yield response.follow(cricket_url, callback=self.parse_cricket_news, meta={'item': item})

    def parse_cricket_news(self, response):
        """Scrape cricket news, then follow to the entertainment section."""
        item = response.meta['item']
        ent_path = response.xpath('/html/body/div[3]/header/div[3]/div/div[2]/div[2]/ul/li[9]/a/@href').get()
        ent_url = response.urljoin(ent_path)
        item['cricket_news'] = self._scrape_articles(response, 'cricket')
        yield response.follow(ent_url, callback=self.parse_entertainment_news, meta={'item': item})

    def parse_entertainment_news(self, response):
        """Scrape entertainment category blocks, then follow to the world section."""
        item = response.meta['item']
        world_path = response.xpath('/html/body/div[3]/header/div[3]/div/div[2]/div[2]/ul/li[11]/a/@href').get()
        world_url = response.urljoin(world_path)
        # entertainment lead text sits inside an <a>, unlike the state page
        item['ent_news'] = self._scrape_blocks(response, 'ent', 'h3.nation-lead-txt a::text')
        yield response.follow(world_url, callback=self.parse_world_news, meta={'item': item})

    def parse_world_news(self, response):
        """Scrape world news and yield the fully-assembled item (end of chain)."""
        item = response.meta['item']
        item['world_news'] = self._scrape_articles(response, 'world')
        yield item