Как оптимизировать веб-скрапер на aiohttp и BS4 - PullRequest
0 голосов
/ 15 ноября 2018

Привет! Я пишу веб-скрапер, который собирает данные из api.mercadolibre и с веб-страниц, потому что API отдаёт недостоверные данные. Для асинхронной части я использую aiohttp и модуль asyncio, а для разбора HTML — BeautifulSoup. Сейчас скрипт выполняется около 6 часов, а мне нужно, чтобы он работал быстрее: я обрабатываю около 320 тыс. URL, и позже их число вырастет до 1 миллиона. Буду признателен за любую помощь, вот код. Это приложение Django.

import random
import asyncio
from aiohttp import ClientSession
from products.models import Product, ProductVariation
from time import time
from bs4 import SoupStrainer, BeautifulSoup
from django.utils import timezone
from datetime import timedelta
import aiohttp
def _last_int(text, default=0):
    """Return the last whitespace-separated token of *text* that parses as int.

    Dots are removed before parsing, so "1.234" becomes 1234. Tokens that
    still fail ``int()`` (words, values containing a comma) are skipped.
    *default* is returned when no token parses.
    """
    value = default
    for token in text.split():
        try:
            value = int(token.replace('.', '').replace(',', '.'))
        except ValueError:
            continue
    return value


def _node_int(node, strip_parens=False):
    """Parse an integer out of a BeautifulSoup node's text; 0 if node is None."""
    if node is None:
        return 0
    text = node.text
    if strip_parens:
        text = text.replace('(', '').replace(')', '')
    return _last_int(text)


async def fetch(product, session):
    """Download one product page and queue a ProductVariation for today.

    Reads the module-level ``content_product``, ``today_date`` and
    ``date_before``; appends to the module-level ``list_variations`` when no
    ProductVariation exists yet for ``product`` with ``date_search=today_date``.

    Args:
        product: object exposing ``url_product``; also used as the ``product``
            filter value in the ProductVariation queries.
        session: aiohttp client session used for the GET request.

    Returns:
        The raw response body (bytes), or None when the server disconnected.
    """
    # Only the network I/O can raise ServerDisconnectedError, so the try is
    # limited to it; the connection is released before the parsing starts.
    try:
        async with session.get(product.url_product, timeout=None) as response:
            text = await response.read()
    except aiohttp.client_exceptions.ServerDisconnectedError:
        return None

    # errors='replace': a single malformed byte must not abort the whole run.
    page_content = BeautifulSoup(
        text.decode('utf-8', errors='replace'),
        "html.parser",
        parse_only=content_product,
    )
    content = page_content.find('div', attrs={"class": "item-conditions"})
    quantity_content = page_content.find('span', attrs={"class": "dropdown-quantity-available"})
    price_content = page_content.find('span', attrs={"class": "price-tag-fraction"})

    # Every counter defaults to 0, including the case where the node exists
    # but none of its tokens is numeric.
    sold_quantity = _node_int(content)
    quantity = _node_int(quantity_content, strip_parens=True)
    # NOTE(review): price is parsed but not stored on the ProductVariation
    # below - confirm whether the model should receive it.
    price = _node_int(price_content, strip_parens=True)

    # NOTE(review): these ORM calls are synchronous and block the event loop
    # while they run; prefetching both lookups in bulk before the crawl would
    # remove two queries per product.
    day_before = ProductVariation.objects.filter(product=product, date_search=(today_date - date_before))
    if day_before:
        sold_day_before = day_before[0].sold_quantity
    else:
        sold_day_before = sold_quantity

    if not ProductVariation.objects.filter(product=product, date_search=today_date).exists():
        # A negative difference is clamped to zero.
        diference_day_before = max(sold_quantity - sold_day_before, 0)
        list_variations.append(
            ProductVariation(
                product=product,
                sold_quantity=sold_quantity,
                available_quantity=quantity,
                diference_day_before=diference_day_before
            )
        )
        print("Variation append")
    else:
        print("Variation already exists")
    return text
async def bound_fetch(sem, product, session):
    """Run ``fetch(product, session)`` while holding one slot of ``sem``.

    The semaphore slot is released even if ``fetch`` raises.
    """
    await sem.acquire()
    try:
        await fetch(product, session)
    finally:
        sem.release()


async def run(r):
    """Fetch the first ``r`` products of the module-level ``p`` concurrently.

    One task per product is scheduled through ``bound_fetch``; all tasks share
    a single ClientSession and a semaphore capping in-flight fetches at 10000.

    Args:
        r: number of products to take from ``p``.
    """
    # create instance of Semaphore
    sem = asyncio.Semaphore(10000)

    # Materialize the products once. Indexing an unevaluated Django QuerySet
    # (p[i]) runs a separate LIMIT/OFFSET query for every index, which is
    # slow and, without an explicit ordering, not guaranteed to be stable.
    products = list(p[:r])

    # Create client session that will ensure we dont open new connection
    # per each request.
    async with ClientSession() as session:
        # pass Semaphore and session to every GET request
        tasks = [
            asyncio.ensure_future(bound_fetch(sem, product, session))
            for product in products
        ]
        await asyncio.gather(*tasks)
# Shared accumulator: fetch() appends an unsaved ProductVariation here for each
# product that has no row for today. Saving the list is not visible in this
# part of the file - presumably it happens further down; confirm.
list_variations = []
# Timestamp of this run; fetch() uses it as the date_search lookup value.
today_date = timezone.now()
# One day; fetch() subtracts it from today_date to look up the previous row.
date_before = timedelta(1)
# NOTE(review): not referenced anywhere in the visible code.
number = 10000
# Restricts BeautifulSoup parsing to the element(s) with id="short-desc".
content_product = SoupStrainer(id='short-desc')
# Products to scrape, filtered by channel name; run() reads this global.
p = Product.objects.filter(sub_category__category__channel__name='Mercadolibre')
# Number of matching products, passed to run() as the task count.
count = p.count()
# Drive run() to completion on the event loop.
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(count))
loop.run_until_complete(future)
...