Привет! Я пишу веб-скрапер, который собирает данные из api.mercadolibre и с веб-страниц, потому что API отдаёт мне недостоверные данные. Для асинхронной части я использую aiohttp и модуль asyncio, а для разбора HTML — BeautifulSoup. Сейчас скрипт выполняется около 6 часов, но мне нужно, чтобы он работал быстрее: я обрабатываю около 320 тыс. URL, а позже их число вырастет до 1 миллиона. Буду признателен за любую помощь; код ниже. Это приложение Django.
import random
import asyncio
from aiohttp import ClientSession
from products.models import Product, ProductVariation
from time import time
from bs4 import SoupStrainer, BeautifulSoup
from django.utils import timezone
from datetime import timedelta
import aiohttp
def _last_int(text):
    """Return the last whitespace-separated token of *text* that parses as an int.

    Dots are removed before parsing (treated as thousands separators); a token
    with a comma becomes a dotted string that ``int`` rejects and is skipped.
    Returns 0 when no token parses, so callers always receive a number.
    """
    for token in reversed(text.split()):
        try:
            return int(token.replace('.', '').replace(',', '.'))
        except ValueError:
            continue
    return 0


def _build_variation(product, sold_quantity, quantity):
    """Return an unsaved ProductVariation for today, or None if one already exists.

    This performs blocking Django ORM queries and is therefore meant to be
    executed in a worker thread, never directly on the event loop.
    """
    # Cheapest check first: nothing else is needed if today's row exists.
    if ProductVariation.objects.filter(product=product, date_search=today_date).exists():
        return None

    previous = list(
        ProductVariation.objects.filter(
            product=product, date_search=(today_date - date_before)
        )[:1]
    )
    # Without a row for the previous day the difference is defined as zero.
    sold_day_before = previous[0].sold_quantity if previous else sold_quantity

    return ProductVariation(
        product=product,
        sold_quantity=sold_quantity,
        available_quantity=quantity,
        # Sold counters should not go backwards; clamp negative differences.
        diference_day_before=max(sold_quantity - sold_day_before, 0),
    )


async def fetch(product, session):
    """Download a product page, parse its counters and queue a ProductVariation.

    Args:
        product: object exposing ``url_product``; passed to the ORM as the
            ``product`` of the variation.
        session: aiohttp client session used for the GET request.

    Returns:
        The raw response body (bytes) on success, or ``None`` when the request
        failed with an aiohttp client error or a timeout.

    Side effects:
        Appends a new, unsaved ``ProductVariation`` to the module-level
        ``list_variations`` unless one already exists for ``today_date``.
    """
    try:
        async with session.get(product.url_product, timeout=None) as response:
            text = await response.read()
    except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
        # A single unreachable page must not abort the whole crawl.
        print("Request failed for {}: {!r}".format(product.url_product, exc))
        return None

    # The connection is already released here, so parsing and database work
    # do not keep a pooled connection busy.
    page_content = BeautifulSoup(
        # errors='replace' keeps one malformed byte from failing the page.
        text.decode('utf-8', errors='replace'),
        "html.parser",
        parse_only=content_product,
    )
    sold_node = page_content.find('div', attrs={"class": "item-conditions"})
    quantity_node = page_content.find(
        'span', attrs={"class": "dropdown-quantity-available"}
    )

    sold_quantity = _last_int(sold_node.text) if sold_node is not None else 0
    if quantity_node is not None:
        quantity = _last_int(quantity_node.text.replace('(', '').replace(')', ''))
    else:
        quantity = 0

    # ORM calls are synchronous; run them in the default thread pool so other
    # downloads keep progressing while the database answers.
    loop = asyncio.get_event_loop()
    variation = await loop.run_in_executor(
        None, _build_variation, product, sold_quantity, quantity
    )

    # Mutate the shared list only from the event-loop thread.
    if variation is None:
        print("Variation already exists")
    else:
        list_variations.append(variation)
        print("Variation append")
    return text
async def bound_fetch(sem, product, session):
    """Run ``fetch(product, session)`` while holding one slot of *sem*.

    The semaphore slot is taken before the fetch starts and is given back
    when the fetch finishes, whether it returned or raised. The result of
    ``fetch`` is discarded.
    """
    await sem.acquire()
    try:
        await fetch(product, session)
    finally:
        sem.release()
async def run(r):
    """Fetch the first *r* products of the module-level ``p`` concurrently.

    Args:
        r: number of products to process, counted from the start of ``p``.

    One shared client session is used for every request, and a semaphore
    caps how many ``fetch`` calls may be in flight at once. Returns ``None``;
    an exception raised by any task propagates to the caller.
    """
    # create instance of Semaphore
    sem = asyncio.Semaphore(10000)

    # Load the products with a single query, off the event loop. Indexing a
    # lazy queryset as p[i] would issue one LIMIT/OFFSET query per product.
    loop = asyncio.get_event_loop()
    products = await loop.run_in_executor(None, list, p[:r])

    # Create client session that will ensure we dont open new connection
    # per each request.
    async with ClientSession() as session:
        # pass Semaphore and session to every GET request
        tasks = [
            asyncio.ensure_future(bound_fetch(sem, product, session))
            for product in products
        ]
        await asyncio.gather(*tasks)
# --- Module-level state shared with fetch() and run() -----------------------

# Unsaved ProductVariation objects appended by fetch().
# NOTE(review): nothing in the visible code persists this list — presumably
# it is saved (e.g. in bulk) by code not shown here; confirm.
list_variations = []
# Timestamp taken once at start-up; fetch() uses it in date_search lookups.
today_date = timezone.now()
# One-day offset subtracted from today_date to look up the previous record.
date_before = timedelta(1)
# Not referenced anywhere in the visible code.
number = 10000
# Passed to BeautifulSoup as parse_only in fetch(), so only the element with
# id 'short-desc' is parsed.
content_product = SoupStrainer(id='short-desc')
# Products whose sub_category -> category -> channel name is 'Mercadolibre'.
p = Product.objects.filter(sub_category__category__channel__name='Mercadolibre')
# Number of matching products; handed to run() as the amount to process.
count = p.count()

# --- Entry point: schedule run(count) and block until it finishes -----------
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(count))
loop.run_until_complete(future)