This is the code I wrote to collect all the reviews for a particular product and save them to a CSV file.
import csv
import time
import urllib.request
from random import randint

from bs4 import BeautifulSoup
class AmazonReviews:
    def __init__(self):
        self.csv_data = []
        self.first_three = {}
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def set_sleep_time(self):
        # Wait a random 1-20 seconds between requests to look less bot-like.
        time.sleep(randint(1, 20))
    def open_url(self, url):
        # Plain GET request; passing a data= argument (even an empty one)
        # would silently turn this into a POST.
        req = urllib.request.Request(url, headers=self.headers)
        with urllib.request.urlopen(req) as response:
            return response.read()
    def save_data(self):
        # Write real CSV rows instead of each dict's repr().
        if not self.csv_data:
            return
        with open("data.csv", "a+", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=self.csv_data[0].keys())
            writer.writeheader()
            writer.writerows(self.csv_data)
    def fetch_url(self, url):
        html = self.open_url(url)
        bsObj = BeautifulSoup(html, "html.parser")
        # print(bsObj.prettify())  # debug: inspect what Amazon actually returned
        self.first_three["asin"] = bsObj.find("div", {"class": "column col2 "}).find("td", {"class": "value"}).getText()
        self.first_three["product_name"] = bsObj.find("span", {"id": "productTitle"}).getText().strip()
        self.first_three["brand_name"] = self.first_three["product_name"].split(" ")[0]
        all_reviews_link = bsObj.find("a", {"data-hook": "see-all-reviews-link-foot"})["href"]
        self.set_sleep_time()
        self.fetch_reviews(all_reviews_link)
    def fetch_reviews(self, all_reviews_link):
        url = "https://www.amazon.in" + all_reviews_link
        html = self.open_url(url)
        bsObj = BeautifulSoup(html, "html.parser")
        reviews = bsObj.find_all("div", {"class": "a-section celwidget"})
        reviews = reviews[1:]  # skip the first match, which is not a review
        for review in reviews:
            # Create a fresh dict per review; reusing one dict would make
            # every entry in csv_data point at the same (last) row.
            row = dict(self.first_three)
            row["user_id"] = review.find("a", {"class": "a-profile"})["href"]
            row["rating"] = review.find("i", {"data-hook": "review-star-rating"}).find("span").getText()
            row["title"] = review.find("a", {"data-hook": "review-title"}).find("span").getText()
            row["date_of_review"] = review.find("span", {"data-hook": "review-date"}).getText()
            verified = review.find("span", {"data-hook": "avp-badge"})
            row["verified"] = verified.getText() if verified else "Unverified"
            row["review_text"] = review.find("span", {"data-hook": "review-body"}).find("span").getText()
            helpful = review.find("span", {"data-hook": "helpful-vote-statement"})
            row["helpful"] = helpful.getText() if helpful else 0
            self.csv_data.append(row)
        self.set_sleep_time()
        # On the last page there is no "a-last" link, so guard against
        # find() returning None instead of crashing with AttributeError.
        next_page = bsObj.find("li", {"class": "a-last"})
        next_page_link = next_page.find("a")["href"] if next_page and next_page.find("a") else None
        if next_page_link is not None:
            self.fetch_reviews(next_page_link)
        else:
            self.save_data()
review = AmazonReviews()
review.fetch_url("https://www.amazon.in/Test-Exclusive-606/dp/B07HGJK535/ref=sr_1_1?keywords=oneplus&qid=1582639903&sr=8-1")
The code worked fine at first, but suddenly Amazon seems to have blocked me from scraping any further. I tried printing bsObj, and it gives me an entirely different page.
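To illustrate what I mean, this is roughly the check I use to tell the normal page from the block page before parsing; the marker strings here are my guesses about what Amazon's block page contains, not something I have verified:

def looks_blocked(html):
    # Heuristic guess: the bot-check page tends to mention a captcha or
    # "robot check". These marker strings are assumptions, not verified.
    text = html.decode("utf-8", errors="ignore").lower()
    return "captcha" in text or "robot check" in text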
Please suggest ways I can avoid getting blocked.
I am scraping the Amazon website for a college ML project.
Thanks for any help!
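For context, here is the direction I am considering: a requests session with fuller, browser-like headers and exponential backoff between retries. This is only a sketch; the header values are illustrative assumptions, not a confirmed fix:

import time
import requests

def polite_get(url, max_retries=3):
    # A Session reuses one connection and keeps headers consistent
    # across requests, which plain urlopen calls do not.
    session = requests.Session()
    session.headers.update({
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/80.0.3987.122 Safari/537.36"),
        "Accept-Language": "en-IN,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml",
    })
    for attempt in range(max_retries):
        response = session.get(url, timeout=30)
        if response.status_code == 200:
            return response.text
        time.sleep(2 ** attempt * 10)  # back off: 10s, 20s, 40s
    return None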