Я пытаюсь определить наличие товара на Amazon. Почему этот код не работает?
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
from time import sleep
import time
from lxml import html
import json
def check(url):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
page = requests.get(url, headers = headers)
for i in range(20):
sleep(3)
doc = html.fromstring(page.content)
XPATH_AVAILABILITY = '//div[@id ="availability"]//text()'
RAw_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
AVAILABILITY = ''.join(RAw_AVAILABILITY).strip() if RAw_AVAILABILITY else None
return AVAILABILITY
file_name = raw_input("Enter file name: ")
filepath = "%s"%(file_name)
with open(filepath) as f:
listoflinks = [line.rstrip('\n') for line in f]
all_links = []
for i in listoflinks:
html = req.get(i)
doc = SimplifiedDoc(html)
amazon_links = doc.getElements('a')
amazon_links = amazon_links.containsOr(['https://www.amazon.com/','https://amzn.to/'],attr='href')
for a in amazon_links:
if a.href not in all_links:
all_links.append(a.href)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
for i in all_links:
print "LINK:"
print i
response = requests.get(i, headers=headers)
#soup = BeautifulSoup(html, "lxml")
soup = BeautifulSoup(response.content, features="lxml")
title = soup.select("#productTitle")[0].get_text().strip()
if check(i) == 'In stock.':
price = soup.select("#priceblock_saleprice")[0].get_text()
else:
price = "UNAVAILABLE"
review_count = int(soup.select("#acrCustomerReviewText")[0].get_text().split()[0])
jsonObject = {'title': title, 'price': price, 'review_count': review_count}
print json.dumps(jsonObject, indent=2)
print "////////////////////////////////////////////////"
print "..............................................."
print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
Когда я его выполняю, появляется эта ошибка:
Файл "scra.py", строка 17, в проверке
do c = html .fromstring (page.content)
AttributeError: у объекта 'unicode' нет атрибута 'fromstring'
Пожалуйста, помогите мне. Я уже пытался преобразовать page
в pagedata = page.json()
, но это только ухудшило ситуацию.