Вам нужно задать более сложные правила, чтобы извлекать (скрейпить) из строк только нужные части данных.
Сначала вы можете использовать `find_elements_by_class_name` (с буквой `s` в слове `elements`), чтобы получить все элементы с классом `sidearm-roster-player-name` и отдельно — с классами `sidearm-roster-player-position`, `sidearm-roster-player-class-hometown` и т. д.
# Collect every matching element for each field as three parallel lists.
# The second list must be named `all_positions`: that is the name the
# zip() loop that consumes these lists reads.
all_names = driver.find_elements_by_class_name('sidearm-roster-player-name')
all_positions = driver.find_elements_by_class_name('sidearm-roster-player-position')
all_hometowns = driver.find_elements_by_class_name('sidearm-roster-player-class-hometown')
Затем вы можете использовать `zip()`, чтобы объединить их в группы (name, position, hometown и т. д.):
# zip() walks the three lists in lockstep (and stops at the shortest one),
# yielding one (name, position, hometown) element triple per iteration.
for name, position, hometown in zip(all_names, all_positions, all_hometowns):
    print(name.text, "|", position.text, "|", hometown.text)
from selenium import webdriver
# Print "name | position | hometown" for every player on the roster page.
# NOTE(review): the find_elements_by_* helpers are deprecated in Selenium 4;
# on a newer Selenium use driver.find_elements(By.CLASS_NAME, ...) instead --
# confirm against the installed version.
url = 'https://rolltide.com/roster.aspx?roster=226&path=football'

driver = webdriver.Firefox()
try:
    driver.get(url)

    # Three parallel lists, one per field.
    all_names = driver.find_elements_by_class_name('sidearm-roster-player-name')
    all_positions = driver.find_elements_by_class_name('sidearm-roster-player-position')
    all_hometowns = driver.find_elements_by_class_name('sidearm-roster-player-class-hometown')

    # zip() pairs the lists by index and stops at the shortest one.
    for name, position, hometown in zip(all_names, all_positions, all_hometowns):
        print(name.text, "|", position.text, "|", hometown.text)
finally:
    # Close the browser even if a lookup fails, so no Firefox session is
    # left running after the script ends.
    driver.quit()
Для более детального извлечения данных вы можете использовать более сложные правила и XPath (`find_elements_by_xpath`).
Вы даже можете сначала получить все строки, а затем в цикле `for` извлекать элементы отдельно из каждой строки.
from selenium import webdriver
import csv
# Scrape the roster page row by row and write one CSV line per player.
# NOTE(review): the find_element(s)_by_* helpers are deprecated in Selenium 4;
# on a newer Selenium use find_element(By.XPATH, ...) /
# find_elements(By.CLASS_NAME, ...) instead -- confirm against the installed
# version.
url = 'https://rolltide.com/roster.aspx?roster=226&path=football'


def _joined_text(row, class_name):
    """Return the concatenated text of every element in *row* with *class_name*."""
    # Some classes seem to occur twice in a row (the first one probably
    # always empty), so join the text of all matches instead of taking one.
    return ''.join(
        element.text for element in row.find_elements_by_class_name(class_name)
    )


driver = webdriver.Firefox()
try:
    driver.get(url)

    # One <li> per player inside the roster list.
    all_rows = driver.find_elements_by_xpath('//ul[@class="sidearm-roster-players"]//li')

    # newline='' lets the csv module control line endings itself (otherwise
    # blank rows can appear on Windows); utf-8 keeps non-ASCII names intact.
    with open('output.csv', 'w', newline='', encoding='utf-8') as fh:
        csvwriter = csv.writer(fh)

        # write headers
        csvwriter.writerow(["Number", "Name", "Position", "Height", "Weight", "Hometown", "Highschool", "Academic year"])

        for row in all_rows:  # use all_rows[:10] for a quick trial run
            # Leading "." keeps each XPath relative to the current row.
            number = row.find_element_by_xpath('.//div[@class="sidearm-roster-player-name"]//span').text
            print('number:', number)
            name = row.find_element_by_xpath('.//div[@class="sidearm-roster-player-name"]//p').text
            position = row.find_element_by_xpath('.//div[@class="sidearm-roster-player-position"]/span').text
            height = row.find_element_by_class_name('sidearm-roster-player-height').text
            weight = row.find_element_by_class_name('sidearm-roster-player-weight').text

            hometown = _joined_text(row, 'sidearm-roster-player-hometown')
            highschool = _joined_text(row, 'sidearm-roster-player-highschool')
            academic_year = _joined_text(row, 'sidearm-roster-player-academic-year')

            csvwriter.writerow([number, name, position, height, weight, hometown, highschool, academic_year])
finally:
    # Close the browser even if a lookup or the file write fails.
    driver.quit()