Вот решение, которое разделяет работу между получением данных, извлечением и записью результата. Это также оставляет задачу записи данных CSV модулю csv.
import csv
import re
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup
def extract_brand(c):
    """Extract the brand name from a product container.

    The brand is taken from the ``alt`` text of the lazy-loaded product
    image and truncated to at most the first three words.

    Parameters:
        c: a BeautifulSoup tag for one product container.

    Returns:
        The brand string, or 'No Brand Found' when the image tag, its
        alt text, or a word match is missing.
    """
    tag = c.find('img', class_='lazy-img')
    # Guard against a missing <img> tag or a missing alt attribute;
    # the original re.match(pattern, None) would raise TypeError.
    alt_text = tag.get('alt') if tag is not None else None
    if not alt_text:
        return 'No Brand Found'
    m = re.match(r'(\w+\s?){1,3}', alt_text)
    return m.group(0).rstrip() if m else 'No Brand Found'
def extract_product(c):
    """Return the cleaned product title from a container tag.

    Commas are replaced with '|' so the title cannot split a CSV field,
    and surrounding whitespace is stripped.
    """
    link = c.find('a', class_='item-title')
    raw_title = link.string
    cleaned = raw_title.replace(',', '|')
    return cleaned.strip()
def extract_price(c):
    """Return the current price text from a container tag.

    When the price is split across child elements (``.string`` is
    empty), the tag's stripped strings are joined first. The result is
    matched as one leading character (currency symbol) followed by
    digits; '?' is returned when no price-like text is found.
    """
    holder = c.find('li', class_='price-current')
    text = holder.string or ''.join(holder.stripped_strings)
    m = re.match(r'(.\d[\d.,]+)', text.strip())
    if m:
        return m.group(0)
    return "?"
def extract_from(page):
    """Extract data for each product on the page.

    Parameters:
        page: a BeautifulSoup document for one results page.

    Returns:
        A list with one [brand, product_name, product_price] list per
        product container found on the page (empty list when none).
    """
    # List comprehension replaces the manual append loop (same order,
    # same rows).
    return [
        [extract_brand(c), extract_product(c), extract_price(c)]
        for c in page.find_all('div', class_='item-container')
    ]
def write2csv(filename, data):
    """Write scraped product rows to a CSV file with a header line.

    Parameters:
        filename: path of the CSV file to create (overwritten if it exists).
        data: iterable of [brand, product_name, product_price] rows.
    """
    # Explicit UTF-8: the international Newegg pages can contain
    # non-ASCII product names, which would raise UnicodeEncodeError
    # under a non-UTF-8 platform default encoding.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["brand", "product_name", "product_price"])
        writer.writerows(data)
def get_html_from(url, parser='html.parser'):
    """Download *url* and return it parsed as a BeautifulSoup document.

    The HTTP response is closed by the ``with`` block before parsing.
    """
    with uReq(url) as response:
        raw_html = response.read()
    return BeautifulSoup(raw_html, parser)
def main():
    """Scrape the Newegg graphics-card listing and save it to a CSV file."""
    my_url = ('https://www.newegg.com/global/lt-en/Video-Cards-Video-Devices/'
              'Category/ID-38?Tpk=graphic%20card')
    page = get_html_from(my_url)
    data = extract_from(page)
    filename = "1234.csv"
    write2csv(filename, data)


# Entry-point guard: importing this module no longer triggers a network
# scrape; running it as a script behaves exactly as before.
if __name__ == "__main__":
    main()