How to grab product URL data when pagination is present, using BeautifulSoup?
0 votes
/ June 5, 2018

I fetched the HTML from the site and am trying to extract the product URLs:

import requests
from bs4 import BeautifulSoup

def get_soup(url):
    soup = None
    response = requests.get(url)
    if response.status_code == 200:
        html = response.content
        soup = BeautifulSoup(html, "html.parser")
    return soup

def get_category_urls(url):
    soup = get_soup(url)
    cat_urls = []
    categories = soup.find('div', attrs={'id': 'menu_oc'})
    if categories is not None:
        for c in categories.findAll('a'):
            if c['href'] is not None:
                cat_urls.append(c['href'])
    return cat_urls

def get_product_urls(url):
    soup = get_soup(url)
    prod_urls = []
    if soup.find('div', attrs={'class': 'pagination'}):
        for link in soup.select('div.links a'):
            if link.string.isdecimal():  # skip the "next" and "last" links
                prod_urls.append(link['href'])
    print("Found following product urls:", prod_urls)
    return prod_urls

if __name__ == '__main__':
    category_urls = get_category_urls(URL)
    product_urls = get_product_urls(URL)

How can I efficiently detect the pagination condition at the marked spot in the code above?

Screenshots of the real site, with pagination:

[screenshot: category page with pagination]

and without pagination:

[screenshot: category page without pagination]

1 Answer

0 votes
/ June 5, 2018

This should do it: pull the pagination discovery out into its own helper, then visit every page it finds and collect the product links from each one.

from bs4 import BeautifulSoup
import requests


def get_soup(url):
    soup = None
    response = requests.get(url)
    if response.status_code == 200:
        html = response.content
        soup = BeautifulSoup(html, "html.parser")
    return soup


def get_category_urls(url):
    soup = get_soup(url)
    cat_urls = []
    categories = soup.find('div', attrs={'id': 'menu_oc'})
    if categories is not None:
        for c in categories.findAll('a'):
            if c['href'] is not None:
                cat_urls.append(c['href'])
    return cat_urls


def get_all_products(url):

    prod_urls = []

    # products on the first page
    soup = get_soup(url)
    prod_urls.extend(get_product_urls(soup))

    links = get_pagination(soup)
    print("Found those pages:", links)
    if not links:
        return prod_urls

    # visit every numbered page and collect its products too
    for link in links:
        soup = get_soup(link)
        prod_urls.extend(get_product_urls(soup))

    print("Found following product urls:", prod_urls)
    return prod_urls


def get_product_urls(soup):
    links = soup.select('div.product-list .span .name a')
    return [link['href'] for link in links]


def get_pagination(soup):
    pages = soup.select('div.pagination div.links a')
    # keep only the numbered page links, skipping "next"/"last" anchors
    return [link['href'] for link in pages
            if link.string and link.string.isdecimal()]


if __name__ == '__main__':
    URL = 'http://www.example.com/shop/index.php?route=product/category&path=63_64'
    category_urls = get_category_urls(URL)
    product_urls = get_all_products(URL)
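
One caveat: get_soup(link) assumes the pagination hrefs are absolute URLs. If the site emits relative links, a sketch of a variant that resolves them with urllib.parse.urljoin might look like this (page_url is a hypothetical extra parameter you would pass in from get_all_products, e.g. get_pagination(soup, url)):

from urllib.parse import urljoin

def get_pagination(soup, page_url):
    # Hypothetical variant: resolve each href against the page it came from.
    # urljoin leaves absolute URLs untouched and resolves relative ones.
    pages = soup.select('div.pagination div.links a')
    found = set()  # a set also guards against duplicate page links
    for link in pages:
        if link.string and link.string.isdecimal():
            found.add(urljoin(page_url, link['href']))
    return sorted(found)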