При запуске scrapy spider с APScheduler вывод не создается - PullRequest
1 голос
/ 14 июля 2020

Вот мой файл, который запускает Spider с помощью APScheduler

from scrapy.crawler import CrawlerProcess
from newsville_scrapy.spiders.news import NewsSpider
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.twisted import TwistedScheduler
from scrapy.utils.log import configure_logging

# Install Scrapy's default logging handlers before any crawl is scheduled.
configure_logging()

# A single crawler process, configured from the project settings, is reused
# by every scheduled run.
process = CrawlerProcess(get_project_settings())

# Schedule NewsSpider with a cron trigger: every day of the week at 17:45.
sched = TwistedScheduler()
sched.add_job(
    process.crawl,
    trigger='cron',
    args=[NewsSpider],
    day_of_week='mon-sun',
    hour=17,
    minute=45,
)
sched.start()

# stop_after_crawl=False: do not stop the reactor when a crawl finishes, so
# the scheduler can fire the job again on the next day.
process.start(stop_after_crawl=False)

Spider работает успешно, но нет вывода. Вот лог

2020-07-13T17:45:00.353215+00:00 app[clock.1]:  'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
2020-07-13T17:45:00.353216+00:00 app[clock.1]:  'scrapy.spidermiddlewares.referer.RefererMiddleware',
2020-07-13T17:45:00.353216+00:00 app[clock.1]:  'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
2020-07-13T17:45:00.353216+00:00 app[clock.1]:  'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-07-13T17:45:00.390941+00:00 app[clock.1]: 2020-07-13 17:45:00 [scrapy.middleware] INFO: Enabled item pipelines:
2020-07-13T17:45:00.390943+00:00 app[clock.1]: ['newsville_scrapy.pipelines.NewsvilleScrapyPipeline']
2020-07-13T17:45:00.391115+00:00 app[clock.1]: 2020-07-13 17:45:00 [scrapy.core.engine] INFO: Spider opened
2020-07-13T17:45:00.423097+00:00 app[clock.1]: 2020-07-13 17:45:00 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-07-13T17:45:00.424713+00:00 app[clock.1]: 2020-07-13 17:45:00 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-07-13T17:45:00.426194+00:00 app[clock.1]: 2020-07-13 17:45:00 [apscheduler.executors.default] INFO: Job "CrawlerRunner.crawl (trigger: cron[day_of_week='mon-sun', hour='17', minute='45'], next run at: 2020-07-14 17:45:00 UTC)" executed successfully

Вот мой паук. Он размещён на Heroku: задание запускается в нужное время, но ничего не собирает (скрейпинг не даёт результатов). Буду благодарен за помощь. Кроме того, я новичок в Scrapy, поэтому не могу понять, почему он не работает.

class NewsSpider(scrapy.Spider):
    """Collect several news sections of zeenews.india.com into ONE item.

    The spider walks a fixed chain of pages, passing the same item along in
    ``request.meta['item']``::

        latest -> country -> state -> cricket -> entertainment -> world

    Each callback fills one field of the item and follows a link taken from
    the site's navigation bar; the last callback yields the finished item.

    If a navigation link cannot be located, the chain cannot continue. In
    that case a warning is logged and the partially filled item is yielded,
    so the failure is visible and already scraped data is not lost.
    """

    name = "news"

    start_urls = ['https://zeenews.india.com/latest-news']

    # Absolute XPath of the href of the ``index``-th entry in the navigation
    # bar. NOTE(review): absolute paths like this break as soon as the page
    # layout changes; a class- or text-based selector would be more robust.
    _NAV_LINK_XPATH = (
        '/html/body/div[3]/header/div[3]/div/div[2]/div[2]/ul/li[{index}]/a/@href'
    )

    @staticmethod
    def _collect_articles(response, prefix):
        """Extract every ``div.section-article`` block of *response*.

        Returns a ``defaultdict(dict)`` keyed by the 1-based position of the
        block. Each value holds ``<prefix>_headlines``, ``<prefix>_content``
        and ``<prefix>_image_urls``; a value is ``None`` when the matching
        element is absent from the block.
        """
        articles = defaultdict(dict)
        for position, block in enumerate(response.css('div.section-article'), start=1):
            content = block.css('p::text').get()
            articles[position] = {
                prefix + '_headlines': block.css('h3.margin-bt10px a::text').get(),
                # ``get()`` returns None when the block has no <p> text;
                # calling ``strip()`` on it unconditionally would raise.
                prefix + '_content': content.strip() if content is not None else None,
                prefix + '_image_urls': block.css('img.img-responsive').xpath('@src').get(),
            }
        return articles

    @staticmethod
    def _collect_sections(response, prefix, lead_selector):
        """Extract every ``div.block-height`` block except the 'Videos' one.

        *lead_selector* is the CSS selector of the lead text nodes; it
        differs between the pages that use this layout.

        Returns a ``defaultdict(dict)`` keyed by a 1-based counter over the
        kept blocks. Each value holds ``<prefix>_name``,
        ``<prefix>_image_urls``, ``<prefix>_lead_text`` and
        ``<prefix>_one_liners`` (the link texts of ALL ``div.one-liner``
        children, in document order; an empty list when there are none).
        """
        sections = defaultdict(dict)
        position = 1
        for block in response.css('div.block-height'):
            title = block.css('h2.head-two a::text').get()
            if title == 'Videos':
                continue

            # Accumulate instead of assigning inside the loop, otherwise
            # only the last one-liner block would be kept.
            one_liners = []
            for line in block.css('div.one-liner'):
                one_liners.extend(line.css('h3 a').css('::text').extract())

            sections[position] = {
                prefix + '_name': title,
                prefix + '_image_urls': block.css('div.h-nation-lead img').xpath('@src').get(),
                prefix + '_lead_text': ' '.join(block.css(lead_selector).extract()).strip(),
                prefix + '_one_liners': one_liners,
            }
            position += 1
        return sections

    def _next_step(self, response, item, nav_index, callback):
        """Return the request for the next page of the chain, or *item*.

        The link is read from navigation entry number *nav_index*. When it
        is missing, the item collected so far is returned instead of a
        request so the caller can yield it as the final result.
        """
        href = response.xpath(self._NAV_LINK_XPATH.format(index=nav_index)).get()
        if not href:
            # Without this guard the missing href would be joined into the
            # current page's own URL; that request is presumably discarded
            # as a duplicate, ending the crawl with no item and no error.
            self.logger.warning(
                "Navigation link li[%d] not found on %s; "
                "yielding the partially filled item",
                nav_index,
                response.url,
            )
            return item
        return response.follow(
            response.urljoin(href), callback=callback, meta={'item': item}
        )

    def parse(self, response):
        """Scrape the latest-news page, then continue to the country page."""
        item = NewsvilleScrapyItem()
        item['latest_news'] = self._collect_articles(response, 'latest')
        yield self._next_step(response, item, 4, self.parse_country_news)

    def parse_country_news(self, response):
        """Scrape the country-news page, then continue to the state page."""
        item = response.meta['item']
        item['country_news'] = self._collect_articles(response, 'country')
        yield self._next_step(response, item, 6, self.parse_state_news)

    def parse_state_news(self, response):
        """Scrape the state-news page, then continue to the cricket page."""
        item = response.meta['item']
        item['state_news'] = self._collect_sections(
            response, 'state', 'h3.nation-lead-txt::text'
        )
        # Exactly one follow-up per page, issued after ALL sections were
        # read (it must not live inside the per-section loop).
        yield self._next_step(response, item, 8, self.parse_cricket_news)

    def parse_cricket_news(self, response):
        """Scrape the cricket page, then continue to the entertainment page."""
        item = response.meta['item']
        item['cricket_news'] = self._collect_articles(response, 'cricket')
        yield self._next_step(response, item, 9, self.parse_entertainment_news)

    def parse_entertainment_news(self, response):
        """Scrape the entertainment page, then continue to the world page."""
        item = response.meta['item']
        item['ent_news'] = self._collect_sections(
            response, 'ent', 'h3.nation-lead-txt a::text'
        )
        yield self._next_step(response, item, 11, self.parse_world_news)

    def parse_world_news(self, response):
        """Scrape the world-news page and yield the completed item."""
        item = response.meta['item']
        item['world_news'] = self._collect_articles(response, 'world')
        yield item

1 Ответ

0 голосов
/ 14 июля 2020

2020-07-13T17:45:00.423097+00:00 app[clock.1]: 2020-07-13 17:45:00 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)

Значит, ни одна страница не была просканирована, поэтому и вывода нет. Проверьте, правильно ли вы указали URL-адреса.

...