Python / Quart: how to avoid timeouts on Heroku
0 votes
/ July 12, 2020

I'm working on an API that processes around 50 news articles and performs sentiment analysis on them. This usually works fine, but unfortunately it can't be deployed to Heroku, because I keep running into timeouts. I rewrote the API from Flask to Quart and everything got faster (under 30 seconds on my local machine), but it hangs on Heroku and there are still cases where I get timeouts. How can I avoid this?

Could someone also confirm whether my async code is correct? I'm new to async and to Python in general.

from quart import Quart, jsonify
from quart_cors import cors
import newspaper
from date_guesser import guess_date, Accuracy
from langdetect import detect, detect_langs
import json
import asyncio
import aiohttp
import httpx
from requests.exceptions import HTTPError
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('punkt')

app = Quart(__name__)
app = cors(app, allow_origin="http://localhost:3000/")
app.config['JSON_SORT_KEYS'] = False


@app.route('/', methods=['GET'])
def start():
    return 'Welcome!', 200


finalJSON = []
urls = []


async def main(urls):
    tasks = []
    async with aiohttp.ClientSession() as session:
        for url in urls:
            task = asyncio.ensure_future(get_article(session, url))
            tasks.append(task)
        await asyncio.wait(tasks)


async def get_article(session, url):
    if url is None:
        return 'url parameter is required', 400

    async with session.get(url) as response:
        content = await response.read()
        article = newspaper.Article(url)
        article.set_html(content)

        if response.status == 200:
            article.parse()
            article_dict = {}
            article_dict['status'] = 'ok'

            article_dict['article'] = {}
            article_dict['article']['source_url'] = article.source_url

            try:
                guess = guess_date(url=url, html=article.html)
                article_dict['article']['published'] = guess.date
                article_dict['article']['published_method_found'] = guess.method
                article_dict['article']['published_guess_accuracy'] = None
                if guess.accuracy is Accuracy.PARTIAL:
                    article_dict['article']['published_guess_accuracy'] = 'partial'
                if guess.accuracy is Accuracy.DATE:
                    article_dict['article']['published_guess_accuracy'] = 'date'
                if guess.accuracy is Accuracy.DATETIME:
                    article_dict['article']['published_guess_accuracy'] = 'datetime'
                if guess.accuracy is Accuracy.NONE:
                    article_dict['article']['published_guess_accuracy'] = None
            except:
                article_dict['article']['published'] = article.publish_date
                article_dict['article']['published_method_found'] = None
                article_dict['article']['published_guess_accuracy'] = None

            article.nlp()

            article_dict['article']['title'] = article.title
            article_dict['article']['summary'] = article.summary
            article_dict['article']['keywords'] = list(article.keywords)
            article_dict['article']['authors'] = list(article.authors)

            sentiString = article.title + article.summary
            analyzer = SentimentIntensityAnalyzer()
            vs = analyzer.polarity_scores(sentiString)

            try:
                title_lang = detect(article.title)
            except:
                title_lang = None

            try:
                text_lang = detect(article.text)
            except:
                text_lang = None

            article_dict['article']['images'] = list(article.images)
            article_dict['article']['top_image'] = article.top_image
            article_dict['article']['meta_image'] = article.meta_img
            article_dict['article']['movies'] = list(article.movies)
            article_dict['article']['meta_keywords'] = list(
                article.meta_keywords)
            article_dict['article']['tags'] = list(article.tags)
            article_dict['article']['meta_description'] = article.meta_description
            article_dict['article']['meta_lang'] = article.meta_lang
            article_dict['article']['title_lang'] = str(title_lang)
            article_dict['article']['text_lang'] = str(text_lang)
            article_dict['article']['meta_favicon'] = article.meta_favicon
            article_dict['article']['url'] = str(url)
            article_dict['article']['positive'] = vs['pos']
            article_dict['article']['neutral'] = vs['neu']
            article_dict['article']['negative'] = vs['neg']
            finalJSON.append(article_dict)
            # return article_dict
            # finalJSON = {**finalJSON, **article_dict}
        else:
            article_dict = {}
            article_dict['status'] = 'error'
            article_dict['article'] = 'An error occurred parsing this article'
            # return article_dict
            finalJSON.append(article_dict)


@app.route('/v0/goodvibes/', methods=['GET'])
async def get_reddit_urls():
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(
                'https://reddit.com/r/UpliftingNews/hot/.json?limit=40')
            print(response)
            # Access JSON Content
            for url in response.json()['data']['children']:
                urls.append(url['data']['url'])
        # Await Tasks
        await main(urls)
        # Return Final JSON Data
        return jsonify(finalJSON), 200
    except Exception as err:
        return str(err), 400


# if __name__ == '__main__':
#     app.run()
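
While looking at this I also started wondering whether the module-level finalJSON and urls lists are a good idea, since they keep growing across requests. Would something along these lines (collecting results with asyncio.gather and a semaphore to cap concurrency) be closer to idiomatic async? This is only a sketch of what I mean, assuming get_article is changed to return article_dict instead of appending to finalJSON:

# Sketch only: assumes get_article() returns its article_dict (or the error
# dict) instead of appending to the module-level finalJSON list.
async def fetch_all(urls, max_concurrency=10):
    # Cap concurrent downloads so ~50 fetches don't overwhelm a small dyno;
    # max_concurrency=10 is just a guess, not a measured value.
    semaphore = asyncio.Semaphore(max_concurrency)

    async def bounded(session, url):
        async with semaphore:
            return await get_article(session, url)

    async with aiohttp.ClientSession() as session:
        # gather preserves input order and returns the results as a list
        return await asyncio.gather(*(bounded(session, url) for url in urls))

The route would then end with return jsonify(await fetch_all(urls)), 200 instead of relying on the global list.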

I also set WEB_CONCURRENCY on Heroku to 3, but that doesn't seem to have helped much.
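
For the Heroku side, my understanding is that Quart needs an ASGI server, and Hypercorn is what the Quart documentation recommends, so the Procfile I'm planning to use looks roughly like this (assuming the module above is saved as app.py; the worker count is a guess):

web: hypercorn app:app --bind 0.0.0.0:$PORT --workers 3

As far as I understand, Heroku's router still cuts off any request that takes longer than 30 seconds, so I'm not sure tuning the server alone can fix this.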

...