Проблема: я не могу собрать данные со страниц, кроме первой. Когда я нажимаю на вторую страницу, URL не меняется, так что при попытке собрать детали продукта я снова попадаю на первую страницу.
Код, который я использую, размещён ниже для справки. Буду благодарен за любые подсказки: я застрял на этом довольно давно, и это расстраивает. Вот код:
import csv  # fixed: the stdlib module is lowercase 'csv' -- 'import CSV' raises ModuleNotFoundError

import requests
from bs4 import BeautifulSoup
def get_page(url_1):
    """GET *url_1* and return it parsed as a BeautifulSoup tree.

    Returns None (after printing the status code) when the server answers
    with a non-2xx status, so callers must check for None.
    """
    # timeout added: requests.get() without one can block forever on a
    # stalled connection.
    response = requests.get(url_1, timeout=30)
    if not response.ok:
        print('server responded:', response.status_code)
        return None  # explicit instead of falling off the end of the function
    return BeautifulSoup(response.text, 'lxml')
def get_detail_data(soup):
    """Extract one product's details from its page soup.

    Returns a dict with keys title, ModelNumber, Width, Depth, Height,
    Vesa_wallmount and Weight (spec values are None when the row is absent),
    or None when the page is missing a required element.
    """

    def _spec(spec_area, label):
        # Return the value cell text for the spec row whose label cell equals
        # *label*, or None when no such row exists.
        #
        # NOTE: find_all(class_='row') already matches elements whose class
        # list contains 'row', including 'row bg_row', so one query covers
        # both row flavours -- the original's `(row_array1 or row_array2)`
        # only ever iterated the first non-empty list.
        for row in spec_area.find_all('div', class_='row'):
            label_cell = row.find('div', class_='col3 col-sm5')
            if label_cell is not None and label_cell.text.strip() == label:
                value_cell = row.find('div', class_='col9 col-sm7')
                return value_cell.text if value_cell is not None else None
        return None

    try:
        title = soup.find("h1", {"class": "titleItems_head"}).text.strip()
        model_number = soup.find("span", {"class": "titleItems_model_digit"}).text
        spec_area = soup.find('div', class_='speci_area')
        # The original copied `Width = None` into every missing-value branch
        # (Depth/Height/VESA/Weight were never set to None), which raised
        # UnboundLocalError that the bare `except:` silently swallowed --
        # so any page with one missing spec was dropped entirely.
        return {
            'title': title,
            'ModelNumber': model_number,
            'Width': _spec(spec_area, 'Width (mm):'),
            'Depth': _spec(spec_area, 'Depth (mm):'),
            'Height': _spec(spec_area, 'Height (mm):'),
            'Vesa_wallmount': _spec(spec_area, 'VESA Wall Mount Size (mm):'),
            'Weight': _spec(spec_area, 'Weight with stand (without stand):'),
        }
    except AttributeError:
        # A required element (title / model / spec area) is missing:
        # .find() returned None. Skip this page, as the original did.
        return None
def get_index_data(soup):
    """Return the list of href values of all product links on a listing page.

    *soup* may be None (get_page() returns None on an HTTP error), in which
    case an empty list is returned. The original's try/except only ever fired
    for that None case, so the guard is now explicit.
    """
    if soup is None:
        return []
    links = soup.find_all('a', class_='disp-block')
    return [link.get('href') for link in links]
def write_csv(data, url_1):
    """Append one product row to outputlatest29.csv.

    Fixes over the original:
      * header is written only once (when the file is new/empty), not on
        every call;
      * the data row is written once -- the original's `for j in row:
        writer.writerow(row)` duplicated it len(row) times;
      * header gained a 'productUrl' column so it matches the 8-column row;
      * newline='' prevents blank lines between rows on Windows.
    """
    header = ['productTitle', 'productModelNumber', 'productWidth',
              'productDepth', 'productHeight', 'productVesawallSize',
              'productWeight', 'productUrl']
    row = [data['title'], data['ModelNumber'], data['Width'], data['Depth'],
           data['Height'], data['Vesa_wallmount'], data['Weight'], url_1]
    with open('outputlatest29.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if csvfile.tell() == 0:  # file is empty -> emit the header first
            writer.writerow(header)
        writer.writerow(row)
def main():
    """Scrape the TV listing page and write each product's details to CSV."""
    from urllib.parse import urljoin

    # NOTE(review): this is the root cause of the "only page 1" problem.
    # Everything after '#' is a URL *fragment*; browsers never send it to the
    # server, and neither does requests -- so changing productBeginIndex here
    # has no effect and the server always returns the first page. The site
    # paginates client-side with JavaScript. To scrape later pages, open the
    # browser dev tools (Network tab), click page 2, and request the XHR/API
    # URL the site actually calls, with a real query string (?...) instead of
    # a fragment.
    url_1 = 'https://www.thegoodguys.com.au/televisions/all-tvs#facet:&productBeginIndex:0&orderBy:&pageView:grid&minPrice:&maxPrice:&pageSize:100&'
    products = get_index_data(get_page(url_1))
    for link in products:
        # hrefs may be site-relative; resolve them against the listing URL.
        data = get_detail_data(get_page(urljoin(url_1, link)))
        if data is not None:
            write_csv(data, link)


if __name__ == '__main__':
    main()