Как обойти все страницы сайта (пагинация) с помощью requests и BeautifulSoup в Python - PullRequest
0 голосов
/ 27 апреля 2020

Я пытаюсь собрать (спарсить) названия и цены занавесок для душа с этого сайта. На сайте более 200 страниц, но этот код работает только для первых 100 страниц, а затем повторно парсит те же самые 100 страниц.

import requests
from bs4 import BeautifulSoup
import re
import csv

# Template for a product image URL; the `{}` placeholder is filled with an
# image id via site.format(...).
site = "https://ih1.redbubble.net/image.{}/ur,shower_curtain_closed,square,600x600.1.jpg"

# Column headers of curtains.csv.
firstrow = ['No.', 'Name', 'Price', 'Image Url']

# Append mode keeps rows from earlier runs (and therefore adds one header line
# per run). utf-8 matches the encoding used when the data rows are appended.
# No explicit close() is needed: the with-statement closes the file on exit.
with open('curtains.csv', 'a', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(firstrow)


def main(url, first_page=1, last_page=204):
    """Scrape name, price and image URL of the products on a range of pages.

    Every page is fetched with ``requests`` and parsed with BeautifulSoup; one
    row per product is printed and appended to ``curtains.csv``.

    Args:
        url: Listing URL template with one ``{}`` placeholder that is filled
            with the page number.
        first_page: First page number to fetch (inclusive).
        last_page: Last page number to fetch (inclusive). The defaults cover
            pages 1-204, the range that used to be hard-coded.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (this
            includes a timeout when the server does not answer in 30 seconds).
        KeyError: If a matched ``<img>`` tag has no ``alt`` attribute.
        IndexError: If fewer than four ``<span>`` elements follow an image.
    """
    # Compiled once instead of on every page: captures "<digits>.<4 digits>"
    # preceded by a dot in the raw page text.
    image_id_pattern = re.compile(r'\.(\d+\.\d{4})')
    count = 0

    # The CSV is opened once for the whole run rather than once per row; the
    # with-statement flushes and closes it even if a page fails.
    with open('curtains.csv', 'a', newline='', encoding='utf-8') as csvFile:
        writer = csv.writer(csvFile)

        # NOTE(review): the site is reported to repeat the same results past
        # page 100; this loop does not detect repeated pages.
        for page in range(first_page, last_page + 1):
            final_url = url.format(page)
            print('\n', '*'*10, 'Scraping Page # {}'.format(page), '*'*10)
            print('Link # {}'.format(final_url))

            # A timeout keeps one unresponsive request from hanging the run.
            r = requests.get(final_url, timeout=30)
            soup = BeautifulSoup(r.content, 'html.parser')
            target = soup.select("img[class*=styles__rounded--1lyoH]")

            # dict.fromkeys drops duplicate ids while keeping first-seen order.
            goal = list(dict.fromkeys(image_id_pattern.findall(r.text)))

            # Images and ids are paired purely by position; zip stops at the
            # shorter list. Assumes both appear in the same order - TODO confirm.
            for tar, go in zip(target, goal):
                count += 1

                name = tar['alt']
                # The price is read from the fourth <span> after the image;
                # limit=4 stops the search as soon as that span is found.
                price = tar.find_all_next('span', limit=4)[3].text
                img = site.format(go)

                print('*'*20, count, '*'*20)
                print('Name: {}'.format(name))
                print('Price: {}'.format(price))
                print('Image Url: {}'.format(img))

                writer.writerow([count, name, price, img])


# Start the scrape; main() substitutes each page number for the `{}` placeholder.
main("https://www.redbubble.com/shop/shower-curtains?page={}")

1 Ответ

0 голосов
/ 28 апреля 2020
import requests
import csv


# JSON body sent with every POST request: a GraphQL operation name, the query
# text, and its variables. `variables.queryParams.page` selects the result page.
#
# The query text is written as adjacent string literals (one per GraphQL
# fragment) that Python concatenates into a single string. A string literal
# must not be broken across physical lines, which would be a SyntaxError.
data = {
    "operationName": "withSearchResults",
    "query": (
        # Top-level operation: pulls in all fragments defined below.
        "query withSearchResults($query: String!, $queryParams: QueryParams, $locale: String!, $country: String!, $currency: String!, $previewTypeIds: [String!], $experience: String) {\n  searchResults(query: $query, queryParams: $queryParams, locale: $locale, country: $country, currency: $currency, previewTypeIds: $previewTypeIds, experience: $experience) {\n    ...Results\n    ...TrendingResults\n    ...Metadata\n    ...Filters\n    ...Pagination\n    ...LandingPage\n    __typename\n  }\n}\n\n"
        # Results: inventory item (price, previews, ...) and work (title, ...).
        "fragment Results on SearchResults {\n  results {\n    inventoryItem(locale: $locale, country: $country, currency: $currency, previewTypeIds: $previewTypeIds) {\n      id\n      description\n      productTypeId\n      productPageUrl\n      blankItemId\n      price {\n        id\n        amount\n        currency\n        __typename\n      }\n      previewSet {\n        id\n        previews {\n          previewTypeId\n          url\n          __typename\n        }\n        __typename\n      }\n      gaCode\n      gaCategory\n      attributes {\n        name\n        value\n        attributes {\n          name\n          value\n          __typename\n        }\n        __typename\n      }\n      volumeDiscount {\n        id\n        thresholds {\n          percentOff\n          quantity\n          __typename\n        }\n        __typename\n      }\n      experiencesProductCard {\n        name\n        value\n        __typename\n      }\n      __typename\n    }\n    work(locale: $locale) {\n      id\n      title\n      artistName\n      isMatureContent\n      tags\n      __typename\n    }\n    defaultPreviewTypeId\n    groupId\n    rank\n    __typename\n  }\n  __typename\n}\n\n"
        "fragment TrendingResults on SearchResults {\n  trendingResults {\n    inventoryItem(locale: $locale, country: $country, currency: $currency, previewTypeIds: $previewTypeIds) {\n      id\n      description\n      productPageUrl\n      productTypeId\n      price {\n        id\n        amount\n        currency\n        __typename\n      }\n      previewSet {\n        id\n        previews {\n          previewTypeId\n          url\n          __typename\n        }\n        __typename\n      }\n      volumeDiscount {\n        id\n        thresholds {\n          percentOff\n          quantity\n          __typename\n        }\n        __typename\n      }\n      gaCode\n      gaCategory\n      attributes {\n        name\n        value\n        attributes {\n          name\n          value\n          __typename\n        }\n        __typename\n      }\n      experiencesProductCard {\n        name\n        value\n        __typename\n      }\n      __typename\n    }\n    work(locale: $locale) {\n      id\n      title\n      artistName\n      isMatureContent\n      tags\n      __typename\n    }\n    defaultPreviewTypeId\n    rank\n    __typename\n  }\n  __typename\n}\n\n"
        "fragment Metadata on SearchResults {\n  metadata {\n    title\n    searchContext {\n      category\n      __typename\n    }\n    resultCount\n    topic\n    searchBar {\n      iaCode\n      pillLabel\n      keywords\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\n"
        "fragment Filters on SearchResults {\n  filters {\n    resetUrl\n    staticFilters {\n      type\n      label\n      options {\n        name\n        label\n        applied\n        url\n        options {\n          name\n          label\n          applied\n          url\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    filters {\n      type\n      label\n      experiences {\n        name\n        value\n        __typename\n      }\n      options {\n        name\n        label\n        applied\n        disabled\n        url\n        hexColor\n        imageUrl\n        __typename\n      }\n      __typename\n    }\n    appliedCount\n    appliedPath\n    resets {\n      label\n      url\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\n"
        "fragment Pagination on SearchResults {\n  pagination {\n    currentPage\n    perPage\n    showPreviousPageLink\n    showNextPageLink\n    paginationLinks {\n      namedLinks {\n        previousPage {\n          rel\n          url\n          __typename\n        }\n        nextPage {\n          rel\n          url\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    fromNumber\n    toNumber\n    total\n    __typename\n  }\n  __typename\n}\n\n"
        "fragment LandingPage on SearchResults {\n  metadata {\n    formattedQuery\n    landingPage {\n      hero {\n        pitch\n        title\n        image\n        color\n        __typename\n      }\n      bubbles {\n        title\n        items {\n          title\n          image\n          realisticImage\n          url\n          isExternal\n          __typename\n        }\n        hasImages\n        __typename\n      }\n      seoMetadata {\n        pageDescription\n        robots\n        canonicalURL\n        searchTitle\n        seoImage\n        alternatePageVersions {\n          href\n          locale\n          __typename\n        }\n        relatedTagLinks {\n          title\n          href\n          text\n          __typename\n        }\n        __typename\n      }\n      footer {\n        text\n        breadcrumbs {\n          name\n          url\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    relatedTopics {\n      title\n      url\n      __typename\n    }\n    relatedProducts {\n      id\n      url\n      productTitle\n      fullTitle\n      __typename\n    }\n    searchPageType\n    resultCount\n    searchUUID\n    __typename\n  }\n  __typename\n}\n"
    ),
    "variables": {"country": "EG", "currency": "USD", "experience": "srp", "locale": "en",
                  "previewTypeIds": ["product_close", "alternate_product_close", "artwork"],
                  "query": "shower-curtains", "queryParams": {"page": 0}
                  }
}


def main(url, first_page=1, last_page=10):
    """Fetch search results page by page and write them to ``data.csv``.

    For every page the module-level ``data`` payload is POSTed as JSON to
    ``url`` with ``variables.queryParams.page`` set to the page number, and one
    CSV row (title, price amount, first preview URL) is written per result.

    Args:
        url: Endpoint that receives the JSON payload.
        first_page: First page number to request (inclusive).
        last_page: Last page number to request (inclusive). The defaults cover
            pages 1-10, the range that used to be hard-coded.

    Raises:
        requests.RequestException: Propagated from the POST request (this
            includes a timeout when the server does not answer in 30 seconds).
        KeyError, IndexError: If the response lacks one of the fields read
            below, or a result has an empty ``previews`` list.
    """
    base_variables = data['variables']

    with requests.Session() as req, \
            open("data.csv", 'w', newline="", encoding="UTF-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "Price", "IMG"])

        for page in range(first_page, last_page + 1):
            print(f"Extracting Page# {page}")

            # Build a per-request payload instead of mutating the shared
            # module-level `data` dict in place.
            payload = {
                **data,
                'variables': {
                    **base_variables,
                    'queryParams': {**base_variables['queryParams'], 'page': page},
                },
            }
            # A timeout keeps one unresponsive request from hanging the run.
            r = req.post(url, json=payload, timeout=30).json()

            # Distinct loop name: the result entry must not shadow the page counter.
            for result in r['data']['searchResults']['results']:
                inventory = result['inventoryItem']
                writer.writerow([
                    result['work']['title'],
                    inventory['price']['amount'],
                    inventory['previewSet']['previews'][0]['url'],
                ])


# Run the export; main() POSTs the query payload to this URL.
main("https://www.redbubble.com/boom/graphql")

Вывод: просмотр в режиме онлайн

Образец:

enter image description here

...