I am trying to scrape data from all of the category URLs listed on the home page (done), and then from the sub-category pages of the website and the pages they link to. The URL is here (http://www.medicalexpo.com/).
I have written a Python script for this that extracts the data in the modular structure I need: each step writes the URLs it collects to a separate file, which the next step then reads. Right now, though, I am stuck on extracting all of the page URLs from which the data will later be scraped. In addition, instead of getting data from all of the listed sub-category URLs, I only get data for the first sub-category URL.
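The overall pattern, as a minimal sketch with placeholder names and a placeholder file (the real functions and file names are in the script below), is: each step writes the URLs it collects to a .tsv file and the next step reads that file back in:

def step_one(out_file='step1_urls.tsv'):          # placeholder for e.g. send_link()
    urls = ['http://www.medicalexpo.com/']        # stand-in for the URLs scraped in this step
    with open(out_file, 'w', encoding='utf-8') as f:
        for url in urls:
            f.write(url + '\n')

def step_two(in_file='step1_urls.tsv'):           # placeholder for e.g. send_link2()
    with open(in_file, encoding='utf-8') as f:
        for line in f:
            print(line.strip())                   # stand-in for visiting and scraping the URL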
For example, with my script below, only the data from
General Practice (the main category page) - http://www.medicalexpo.com/cat/general-practice-K.html and then Stethoscope (the sub-category page) - http://www.medicalexpo.com/medical-manufacturer/stethoscope-2.html
comes through. I need the data from all of the listed sub-category links, as indicated at this link.
Any help in getting the desired output, containing the PRODUCT URLs from all of the listed sub-category pages, would be appreciated.
Below is the code:
import re
import time
import random
import selenium.webdriver.support.ui as ui
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from lxml import html
from bs4 import BeautifulSoup
from datetime import datetime
import csv
import os
from fake_useragent import UserAgent
from fake_useragent.errors import FakeUserAgentError  # needed for the except clause in send_link2()
# Function to write data to a file:
def write_to_file(file, mode, data, newline=None, with_tab=None):
    with open(file, mode, encoding='utf-8') as l:
        if with_tab == True:
            data = ''.join(data)
        if newline == True:
            data = data + '\n'
        l.write(data)
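# Illustrative call only (hypothetical file name): despite the parameter name,
# with_tab=True joins the items of `data` with no separator, and newline=True
# appends a trailing '\n' to the result:
#     write_to_file('example.tsv', 'w', ['a\n', 'b\n'], newline=True, with_tab=True)
# duplicate() further below uses exactly these flags to write its link list.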
# Function for data from Module 1 (home page -> category and sub-category URLs):
def send_link(link1):
    browser = webdriver.Chrome()
    browser.get(link1)
    current_page = browser.current_url
    print(current_page)
    soup = BeautifulSoup(browser.page_source, "lxml")
    tree = html.fromstring(str(soup))

    # Added try and except in order to skip/pass attributes without any value.
    try:
        main_category_url = browser.find_elements_by_xpath("//li[@class='univers-group-item']/span/a[1][@href]")
        main_category_url = [i.get_attribute("href") for i in main_category_url[4:]]
        print(len(main_category_url))
    except NoSuchElementException:
        main_category_url = ''

    for index, data in enumerate(main_category_url):
        with open('Module_1_OP.tsv', 'a', encoding='utf-8') as outfile:
            data = (main_category_url[index] + "\n")
            outfile.write(data)

    # Data extraction for the categories listed under the HEADERS:
    try:
        sub_category_url = browser.find_elements_by_xpath("//li[@class='category-group-item']/a[1][@href]")
        sub_category_url = [i.get_attribute("href") for i in sub_category_url]
        print(len(sub_category_url))
    except NoSuchElementException:
        sub_category_url = ''

    for index, data in enumerate(sub_category_url):
        with open('Module_1_OP.tsv', 'a', encoding='utf-8') as outfile:
            data = (sub_category_url[index] + "\n")
            outfile.write(data)

    csvfile = open("Module_1_OP.tsv")
    csvfilelist = csvfile.readlines()
    send_link2(csvfilelist)
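# At this point Module_1_OP.tsv contains the main-category URLs followed by the
# sub-category URLs, one per line; note that send_link2() below only works on
# the slice links2[7:10] of those lines.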
# Function for data from Module 2 (category pages -> sub-category / listing URLs):
def send_link2(links2):
    browser = webdriver.Chrome()
    start = 7
    end = 10
    for link2 in links2[start:end]:
        link2 = link2.strip()  # readlines() keeps the trailing '\n'
        print(link2)
        try:
            ua = UserAgent()
        except FakeUserAgentError:
            pass
        ua.random == 'Chrome'  # NOTE: this comparison has no effect; the random user agent is never applied to the driver
        proxies = []
        t0 = time.time()
        response_delay = time.time() - t0  # NOTE: measured immediately, so this is ~0 seconds
        time.sleep(10 * response_delay)
        time.sleep(random.randint(2, 5))
        browser.get(link2)
        current_page = browser.current_url
        print(current_page)
        soup = BeautifulSoup(browser.page_source, "lxml")
        tree = html.fromstring(str(soup))

        # Added try and except in order to skip/pass attributes without value.
        try:
            product_url = browser.find_elements_by_xpath('//ul[@class="category-grouplist"]/li/a[1][@href]')
            product_url = [i.get_attribute("href") for i in product_url]
            print(len(product_url))
        except NoSuchElementException:
            product_url = ''
        try:
            # Use find_elements (plural) for extracting data from multiple sections.
            product_title = browser.find_elements_by_xpath('//ul[@class="category-grouplist"]/li/a[1][@href]')
            product_title = [i.text for i in product_title]
            print(product_title)
        except NoSuchElementException:
            product_title = ''

        for index, data2 in enumerate(product_title):
            with open('Module_1_2_OP.tsv', 'a', encoding='utf-8') as outfile:
                data2 = (current_page + "\t" + product_url[index] + "\t" + product_title[index] + "\n")
                outfile.write(data2)
        for index, data3 in enumerate(product_title):
            with open('Module_1_2_OP_URL.tsv', 'a', encoding='utf-8') as outfile:
                data3 = (product_url[index] + "\n")
                outfile.write(data3)

    csvfile = open("Module_1_2_OP_URL.tsv")
    csvfilelist = csvfile.readlines()
    send_link3(csvfilelist)
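# Note: Module_1_2_OP.tsv and Module_1_2_OP_URL.tsv are opened in append mode,
# so URLs written on previous runs remain in the files and are passed to
# send_link3() again unless the files are cleared between runs.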
# Function for data from Module 3 (listing pages -> pagination URLs):
def send_link3(csvfilelist):
    browser = webdriver.Chrome()
    for link3 in csvfilelist[:3]:
        link3 = link3.strip()  # readlines() keeps the trailing '\n'
        print(link3)
        browser.get(link3)
        time.sleep(random.randint(2, 5))
        current_page = browser.current_url
        print(current_page)
        soup = BeautifulSoup(browser.page_source, "lxml")
        tree = html.fromstring(str(soup))
        try:
            pagination = browser.find_elements_by_xpath('//div[@class="pagination-wrapper"]/a[@href]')
            pagination = [i.get_attribute("href") for i in pagination]
            print(pagination)
        except NoSuchElementException:
            pagination = ''
        for index, data2 in enumerate(pagination):
            with open('Module_1_2_3_OP.tsv', 'a', encoding='utf-8') as outfile:
                data2 = (current_page + "\n" + pagination[index] + "\n")
                outfile.write(data2)

    dataset = open("Module_1_2_3_OP.tsv")
    dataset_dup = dataset.readlines()
    duplicate(dataset_dup)
# Used to remove duplicate records from a list:
def duplicate(dataset):
    dup_items = set()
    uniq_items = []
    for x in dataset:
        if x not in dup_items:
            uniq_items.append(x)
            dup_items.add(x)
    # Write the de-duplicated list; uniq_items preserves the original order,
    # whereas dup_items is an unordered set.
    write_to_file('Listing_pagination_links.tsv', 'w', uniq_items, newline=True, with_tab=True)

    csvfile = open("Listing_pagination_links.tsv")
    csvfilelist = csvfile.readlines()
    send_link4(csvfilelist)
# Function for data from Module 4 (paginated listing pages -> PRODUCT URLs):
def send_link4(links3):
    browser = webdriver.Chrome()
    for link3 in links3:
        link3 = link3.strip()  # readlines() keeps the trailing '\n'
        print(link3)
        browser.get(link3)
        t0 = time.time()
        response_delay = time.time() - t0  # NOTE: measured immediately, so this is ~0 seconds
        time.sleep(10 * response_delay)
        time.sleep(random.randint(2, 5))
        sub_category_page = browser.current_url
        print(sub_category_page)
        soup = BeautifulSoup(browser.page_source, "lxml")
        tree = html.fromstring(str(soup))

        # Added try and except in order to skip/pass attributes without value.
        try:
            product_url1 = browser.find_elements_by_xpath('//div[@class="inset-caption price-container"]/a[1][@href]')
            product_url1 = [i.get_attribute("href") for i in product_url1]
            print(len(product_url1))
        except NoSuchElementException:
            product_url1 = ''

        for index, data in enumerate(product_url1):
            with open('Final_Output_' + datestring + '.tsv', 'a', encoding='utf-8') as outfile:
                data = (sub_category_page + "\t" + product_url1[index] + "\n")
                outfile.write(data)
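# datestring used above is defined at module level below (before send_link() is
# called), so it is available here as a global by the time send_link4() runs.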
# PROGRAM STARTS EXECUTING FROM HERE...
# Added to attach the real date and time to the output filename.
datestring = datetime.strftime(datetime.now(), '%Y-%m-%d-%H-%M-%S')  # For the filename
# datestring2 = datetime.strftime(datetime.now(), '%H-%M-%S')  # For each record
send_link("http://www.medicalexpo.com/")