So I wrote this code to grab the links to the profile pages, navigate to each profile page, and pull out the information I need.
For some of them it works fine, but then it fails with a list index out of range error, and I don't know why ...
I put a couple of print statements in there to track the index against the size of the list, and I can't see where it gets thrown off ...
My code is below:
from selenium import webdriver
import pyperclip
from bs4 import BeautifulSoup
import time
import pandas
from csv import writer
import sys
from datetime import datetime
from selenium.common.exceptions import NoSuchElementException
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from random import randint
from time import sleep
#Chrome webdriver filepath...Chromedriver version 74
driver = webdriver.Chrome(r'C:\Users\mfoytlin\Desktop\chromedriver.exe')
driver.get('https://www.zillow.com/lender-directory/?sort=Relevance&location=46038&language=English&page=1')
sleep(randint(3, 12))
#Get Page HTML Source
soup = BeautifulSoup(driver.page_source, 'html.parser')
href_links = soup.findAll('div', {'class': 'ld-lender-card ld-lender-card_bank'})
#Gets total Num of pages to go through as well as finds the 'next' button to click
pages = driver.find_element_by_class_name('zsg-pagination')
num_pages_list = pages.find_elements_by_tag_name('li')
next_button = driver.find_element_by_class_name('zsg-pagination-next')
pagecounter = 0
searchLocation = '46038'  # assumed: the ZIP from the search URL above; this was undefined in the snippet as posted
with open(searchLocation + '.csv', 'w', newline='') as csv_file:  # newline='' avoids blank rows in the csv on Windows
    csv_writer = writer(csv_file)
    headers = ['Lender Name', 'NMLS Number', 'Business Name', 'License', 'Cell #', 'Office #', 'Street Address', 'Zip code', 'State', 'City', 'Websites', 'Zillow URL']
    csv_writer.writerow(headers)
    # Number of pages to iterate through
    for page in range(int(num_pages_list[-2].text)):
        # Timer/page counter to see how fast we go through each page, for testing purposes
        startTime = datetime.now()
        pagecounter = pagecounter + 1
        print("Page number: ", pagecounter, ' ', 'out of: ', num_pages_list[-2].text)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # To know how many profiles to iterate through on the page
        profile_links = driver.find_elements_by_xpath("//div[@class='ld-lender-info-column']//h2//a")
        for profile in range(len(profile_links)):
            # Re-fetch profile_links at the top of each iteration so the references aren't stale
            time.sleep(2)
            profile_links = driver.find_elements_by_xpath("//div[@class='ld-lender-info-column']//h2//a")
            #print('index number: ', profile)
            time.sleep(3)
            print('Index: ', profile, 'Range: ', len(profile_links))
            driver.execute_script("arguments[0].click();", profile_links[profile])
            time.sleep(3)
            try:
                driver.find_element_by_class_name('zsg-wrapper-footer').click()
                time.sleep(2)
            except:
                # NOTE: this 'continue' skips the driver.back() below, so the next
                # iteration re-fetches profile_links while still on a profile page
                continue
            soup2 = BeautifulSoup(driver.page_source, 'html.parser')
            # Get license info; sometimes this code breaks: could it be the page not loading rather than a coding error?
            print('Index: ', profile, 'Range: ', len(profile_links))
            description_div = soup2.findAll("div", {"class": "zsg-wrapper"})
            find_license = description_div[0].text
            word = 'Nationally registered'
            if word in find_license:
                getlicense = 'Federal'
            else:
                getlicense = 'State'
            try:
                cellnumber = driver.find_element_by_xpath('//dt[contains(text(),"Cell")]/following-sibling::dd/div/span')
                cellNum = cellnumber.text
            except:
                cellNum = 'None'
            try:
                phoneNumber = driver.find_element_by_xpath('//dt[contains(text(),"Office")]/following-sibling::dd/div/span')
                officephoneNum = phoneNumber.text
            except:
                officephoneNum = 'None'
            try:
                lendername = soup2.find('h1', {'class': 'lender-name'})
                lender = lendername.text
            except:
                lender = 'None'
            try:
                nmls = soup2.find('span', {'class': 'nmls-link'})
                nmlsnum = nmls.a.text
            except:
                nmlsnum = 'None'
            try:
                hyperlinkBusName = soup2.find('h4', {'class': 'business-name-section'})
                hyperlinktext = hyperlinkBusName.text
            except:
                hyperlinktext = 'None'
            try:
                trial = soup2.findAll('dl', {'class': 'zsg-list_definition'})
                address = trial[0].dd.text
            except:
                address = 'None'
            try:
                websiteURL = soup2.find('dd', {'class': 'dont-break-out'})
                website = websiteURL.a.text
            except:
                website = 'None'
            zillowURL = driver.current_url
            allinfobox = driver.find_element_by_xpath('//*[@id="zm-profile"]/div/div[3]/div[3]/aside/div/div/dl/div[1]/dd')
            words = allinfobox.text
            nlines = words.count('\n')
            if nlines == 1:  # '==' for value comparison; 'is' only happens to work for small ints
                realwords = words.split('\n')
                bigguy = realwords[1].split()
                bigguy2 = realwords[1].split(',')
                city = bigguy2[0]
                for x in bigguy:
                    if len(x) == 2:
                        state = x
                for item in bigguy:
                    if item.isdigit():
                        if len(item) == 5:
                            result = item
                zipcode = result
                streetaddress = realwords[0]
                csv_writer.writerow([lender, nmlsnum, hyperlinktext, getlicense, cellNum, officephoneNum, streetaddress, zipcode, state, city, website, zillowURL])
                driver.back()
                sleep(randint(4, 6))
            else:
                realwords = words.split('\n')
                bigguy = realwords[2].split()
                bigguy2 = realwords[2].split(',')
                city = bigguy2[0]
                statestuff = realwords[2].split()
                for item in statestuff:
                    if item.isdigit():
                        if len(item) == 5:
                            result = item
                zipcode = result
                # Street address spans the first two lines here
                addy1 = realwords[0].split()
                addy2 = realwords[1].split()
                joinedlist = addy1 + addy2
                streetaddress = ' '.join(joinedlist)
                # Get the state abbreviation
                for x in statestuff:
                    if len(x) == 2:
                        state = x
                csv_writer.writerow([lender, nmlsnum, hyperlinktext, getlicense, cellNum, officephoneNum, streetaddress, zipcode, state, city, website, zillowURL])
                time.sleep(3)
                driver.back()
                sleep(randint(3, 5))
        print('Time taken on page:', pagecounter, datetime.now() - startTime)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        pages = driver.find_element_by_class_name('zsg-pagination')
        num_pages_list = pages.find_elements_by_tag_name('li')
        next_button = driver.find_element_by_class_name('zsg-pagination-next')
        next_button.click()
        sleep(randint(4, 10))
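
For reference, here is a minimal sketch of the alternative pattern: collect every profile href as a plain string up front and navigate with driver.get() instead of clicking and calling driver.back(), so the loop bound is fixed once and the index can never run past a re-fetched list. This uses the same Selenium 3 API as the script above, but the get_profile_urls helper and the explicit wait on 'zsg-wrapper' are my own assumptions, not part of the original script:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def get_profile_urls(driver):
    # Plain href strings can't go stale, unlike WebElement references,
    # and their count is fixed once collected
    links = driver.find_elements_by_xpath("//div[@class='ld-lender-info-column']//h2//a")
    return [link.get_attribute('href') for link in links]

driver = webdriver.Chrome(r'C:\Users\mfoytlin\Desktop\chromedriver.exe')
driver.get('https://www.zillow.com/lender-directory/?sort=Relevance&location=46038&language=English&page=1')

for url in get_profile_urls(driver):
    driver.get(url)  # go straight to the profile; no click + driver.back() round trip
    # Wait for the profile body instead of a fixed sleep
    # (assumes 'zsg-wrapper' is present on every profile page)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'zsg-wrapper')))
    # ...scrape the profile here exactly as in the script above...

Because the URLs are captured once per results page, there is no driver.back() and the list length can't change mid-loop.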