bs4 img crawler: как скачать кучу изображений в Python - PullRequest
0 голосов
/ 24 апреля 2020
# Crawl the Coupang category listing pages, print every product name and
# save each product thumbnail as its own file inside ./imagepile.
import os
import urllib.parse
import urllib.request  # "import urllib" alone does not load the request submodule

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}

# urlretrieve() expects a destination *file* path, so "./imagepile" is used
# as a directory and every image gets a unique file name inside it.
image_dir = "./imagepile"
os.makedirs(image_dir, exist_ok=True)

for idx in range(1, 17):
    url = "https://www.coupang.com/np/categories/311357?page=" + str(idx)

    print(url)
    result = requests.get(url, headers=headers)
    soup_obj = BeautifulSoup(result.content, "html.parser")

    product_list = soup_obj.find("ul", {"id": "productList"})
    if product_list is None:
        # The page has no product list, so there is nothing to download.
        continue

    for position, li in enumerate(product_list.findAll("li"), start=1):
        name = li.find("div", {"class": "name"})
        if name is not None:
            print("name: " + name.text.strip())

        # Look up the <img> tag itself; filtering on {"src": ""} would only
        # match images whose src attribute is empty.
        image_cell = li.find("dt", {"class": "image"})
        img = image_cell.find("img") if image_cell is not None else None
        src = img.get("src") if img is not None else None
        if not src:
            continue

        # urljoin resolves protocol-relative ("//host/path") and relative
        # src values against the page URL, yielding an absolute URL string.
        image_url = urllib.parse.urljoin(url, src)
        extension = os.path.splitext(urllib.parse.urlsplit(image_url).path)[1] or ".jpg"
        target = os.path.join(image_dir, f"page{idx:02d}_{position:03d}{extension}")
        urllib.request.urlretrieve(image_url, target)

Как исправить эту строку кода: urllib.request.urlretrieve(img, "./imagepile")? Пожалуйста, помогите.

1 Ответ

1 голос
/ 24 апреля 2020
import requests
from bs4 import BeautifulSoup


# Browser-like User-Agent header sent along with the page requests.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) "
        "Gecko/20100101 Firefox/75.0"
    ),
}



def main(url, pages=17):
    """Print the alt text and image URL of every product thumbnail.

    Args:
        url: Listing URL template containing one ``{}`` placeholder that is
            filled with the page number.
        pages: Number of pages to walk, starting at page 1. Defaults to 17,
            the page count that used to be hard-coded.
    """
    with requests.Session() as req:
        for page in range(1, pages + 1):
            print(f"Extracting Page# {page}")
            r = req.get(url.format(page), headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            # A separate loop name keeps the page counter from being shadowed.
            for cell in soup.select("dt.image"):
                img = cell.img
                if img is None or not img.get('src'):
                    # Cell without a usable <img src=...>; nothing to report.
                    continue
                # NOTE(review): assumes src is protocol-relative ("//host/...")
                # so that prefixing "https:" yields a full URL - confirm.
                print(img.get('alt', ''), f"https:{img['src']}")


# Start the crawl; the "{}" placeholder in the URL is filled with the page
# number by main() via str.format.
main("https://www.coupang.com/np/categories/311357?page={}")

Версия со скачиванием изображений:

import requests
from bs4 import BeautifulSoup


# Browser-like User-Agent header sent along with the page requests.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) "
        "Gecko/20100101 Firefox/75.0"
    ),
}


def main(url, pages=2):
    """Download every product thumbnail into the current directory.

    Each image is saved as ``<alt text>.jpg``, with characters that are not
    valid in file names replaced by underscores.

    Args:
        url: Listing URL template containing one ``{}`` placeholder that is
            filled with the page number.
        pages: Number of pages to walk, starting at page 1. Defaults to 2,
            the page count that used to be hard-coded.
    """
    # Characters that are path separators or otherwise rejected in file
    # names on common platforms.
    forbidden = '\\/:*?"<>|'
    with requests.Session() as req:
        for page in range(1, pages + 1):
            print(f"Extracting Page# {page}")
            r = req.get(url.format(page), headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            for index, cell in enumerate(soup.select("dt.image"), start=1):
                img = cell.img
                if img is None or not img.get("src"):
                    # Cell without a usable <img src=...>; nothing to save.
                    continue
                alt = img.get("alt", "")
                stem = "".join("_" if ch in forbidden else ch for ch in alt).strip()
                if not stem:
                    # No usable alt text: fall back to a positional name.
                    stem = f"image_{page}_{index}"
                print(f"Saving {stem}.jpg")
                # NOTE(review): assumes src is protocol-relative ("//host/...")
                # so that prefixing "https:" yields a full URL - confirm.
                image = req.get(f'https:{img["src"]}', headers=headers)
                with open(f"{stem}.jpg", 'wb') as f:
                    f.write(image.content)


# Start the download; the "{}" placeholder in the URL is filled with the page
# number by main() via str.format.
main("https://www.coupang.com/np/categories/311357?page={}")
...