driver.execute_script("arguments[0].click();", profile_links[profile]) keeps saying index out of range

So I wrote this code to grab the links to the profile pages, navigate to each profile page, and pull out the information I need.

For a while it works fine, but then partway through a page it throws an index-out-of-range error, and I don't know why...

I put a couple of print statements in there to track the index against the size of the list, and I don't see how it gets out of bounds...
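In case it helps explain what I mean, this is the kind of guard I've been thinking about (just a sketch, not in the script below, using the same loop variables), assuming the re-found list really can shrink after driver.back():

# Hypothetical guard: bail out before the click instead of raising IndexError
profile_links = driver.find_elements_by_xpath("//div[@class='ld-lender-info-column']//h2//a")
if profile >= len(profile_links):
    print('Only', len(profile_links), 'links re-found; index', profile, 'is out of range')
    break  # give up on this page rather than crash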

My code is below:

from selenium import webdriver
import pyperclip
from bs4 import BeautifulSoup
import time
import pandas
from csv import writer
import sys
from datetime import datetime
from selenium.common.exceptions import NoSuchElementException
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from random import randint
from time import sleep



#Chrome webdriver filepath...Chromedriver version 74
driver = webdriver.Chrome(r'C:\Users\mfoytlin\Desktop\chromedriver.exe')
#ZIP code being searched; also used below as the output CSV filename
searchLocation = '46038'
driver.get('https://www.zillow.com/lender-directory/?sort=Relevance&location=46038&language=English&page=1')
sleep(randint(3, 12))

#Get Page HTML Source
soup = BeautifulSoup(driver.page_source, 'html.parser')

href_links = soup.findAll('div', {'class': 'ld-lender-card ld-lender-card_bank'})  # note: not used later in the script
#Gets total Num of pages to go through as well as finds the 'next' button to click
pages = driver.find_element_by_class_name('zsg-pagination')
num_pages_list = pages.find_elements_by_tag_name('li')
next_button = driver.find_element_by_class_name('zsg-pagination-next')
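# (Sketch) WebDriverWait is imported above but never used; an explicit wait like
# the commented-out lines below could replace some of the fixed sleeps, assuming
# the lender cards keep the 'ld-lender-card' class:
# WebDriverWait(driver, 15).until(
#     EC.presence_of_all_elements_located((By.CLASS_NAME, 'ld-lender-card')))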

pagecounter = 0

with open(searchLocation + '.csv', 'w', newline='') as csv_file:
    csv_writer = writer(csv_file)
    headers = ['Lender Name', 'NMLS Number', 'Business Name', 'License', 'Cell #', 'Office #', 'Street Address', 'Zip code', 'State', 'City', 'Websites', 'Zillow URL']
    csv_writer.writerow(headers)
    #Number of pages to iterate through
    for page in range(int(num_pages_list[-2].text)):
        #Timer/pageCounter to see how fast we are going through each page for testing purposes
        startTime = datetime.now()
        pagecounter = pagecounter + 1
        print("Page number: ", pagecounter, ' ', 'out of: ', num_pages_list[-2].text)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        #To know how many profiles to iterate through on the page
        profile_links = driver.find_elements_by_xpath("//div[@class='ld-lender-info-column']//h2//a")
        for profile in range(len(profile_links)):
            #Re-find profile_links each pass so the element references don't go stale (avoids a StaleElementReferenceException)
            time.sleep(2)
            profile_links = driver.find_elements_by_xpath("//div[@class='ld-lender-info-column']//h2//a")
            #print('index number: ', profile)
            time.sleep(3)
            print('Index: ', profile, 'Range: ', len(profile_links))
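            # If fewer cards render after driver.back(), len(profile_links) can
            # drop below `profile` here; the click on the next line is the one
            # that raises the IndexError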
            driver.execute_script("arguments[0].click();", profile_links[profile])
            time.sleep(3)

            try:
                driver.find_element_by_class_name('zsg-wrapper-footer').click()
                time.sleep(2)
            except:
                continue
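            # (Sketch) The license parse below sometimes breaks; an explicit wait
            # here, before re-reading page_source, might rule out a slow page load
            # (assumes a 'zsg-wrapper' div exists on every fully loaded profile):
            # WebDriverWait(driver, 10).until(
            #     EC.presence_of_element_located((By.CLASS_NAME, 'zsg-wrapper')))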
            soup2 = BeautifulSoup(driver.page_source, 'html.parser')

            #Get license info. Sometimes this code breaks: could it be the page not loading rather than a coding error?
            print('Index: ', profile, 'Range: ', len(profile_links))
            description_div = soup2.findAll("div", {"class": "zsg-wrapper"})
            find_license = description_div[0].text

            word = 'Nationally registered'
            if word in find_license:
                getlicense = 'Federal'
            else:
                getlicense = 'State'

            try:
                cellnumber = driver.find_element_by_xpath('//dt[contains(text(),"Cell")]/following-sibling::dd/div/span')
                cellNum = cellnumber.text
            except:
                cellNum = 'None'

            try:
                phoneNumber = driver.find_element_by_xpath('//dt[contains(text(),"Office")]/following-sibling::dd/div/span')
                officephoneNum = phoneNumber.text
            except:
                officephoneNum = 'None'

            try:
                lendername = soup2.find('h1', {'class': 'lender-name'})
                lender = lendername.text
            except:
                lender = 'None'

            try:
                nmls = soup2.find('span', {'class': 'nmls-link'})
                nmlsnum = nmls.a.text
            except:
                nmlsnum = 'None'

            try:
                hyperlinkBusName = soup2.find('h4', {'class': 'business-name-section'})
                hyperlinktext = hyperlinkBusName.text
            except:
                hyperlinktext = 'None'

            try:
                trial = soup2.findAll('dl', {'class': 'zsg-list_definition'})
                address = trial[0].dd.text
            except:
                address = 'None'

            try:
                websiteURL = soup2.find('dd', {'class': 'dont-break-out'})
                website = websiteURL.a.text
            except:
                website = 'None'

            zillowURL = driver.current_url
            allinfobox = driver.find_element_by_xpath('//*[@id="zm-profile"]/div/div[3]/div[3]/aside/div/div/dl/div[1]/dd')
            words = allinfobox.text


            nlines = words.count('\n')
            if nlines == 1:
                realwords = words.split('\n')
                bigguy = realwords[1].split()
                bigguy2 = realwords[1].split(',')
                city = bigguy2[0]

                for x in bigguy:
                    if len(x) == 2:
                        state = x

                for item in bigguy:
                    if item.isdigit():
                        if len(item) == 5:
                            result = item
                zipcode = result

                streetaddress = realwords[0]
                csv_writer.writerow([lender, nmlsnum, hyperlinktext, getlicense, cellNum, officephoneNum, streetaddress, zipcode, state, city, website, zillowURL])
                driver.back()
                sleep(randint(4, 6))
            else:
                realwords = words.split('\n')
                bigguy = realwords[2].split()

                bigguy2 = realwords[2].split(',')
                city = bigguy2[0]

                statestuff = realwords[2].split()
                for item in statestuff:
                    if item.isdigit():
                        if len(item) == 5:
                            result = item
                zipcode = result

                #Street Address
                addy1 = realwords[0].split()
                addy2 = realwords[1].split()
                joinedlist = addy1 + addy2

                streetaddress = ' '.join(joinedlist)

                # Get state Abbreviation
                for x in statestuff:
                    if len(x) == 2:
                        state = x

                csv_writer.writerow([lender, nmlsnum, hyperlinktext, getlicense, cellNum, officephoneNum, streetaddress, zipcode, state, city, website, zillowURL])
                time.sleep(3)
                driver.back()
                sleep(randint(3, 5))


        print('Time taken on page:', pagecounter, datetime.now() - startTime)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        pages = driver.find_element_by_class_name('zsg-pagination')
        num_pages_list = pages.find_elements_by_tag_name('li')
        next_button = driver.find_element_by_class_name('zsg-pagination-next')
        next_button.click()
        sleep(randint(4, 10))
...
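One alternative I've been considering, in case the click-then-back pattern is the culprit (just a sketch, assuming each link's href attribute resolves to a full profile URL):

# Sketch: collect the profile URLs up front, then navigate with driver.get()
# so the element list can never go stale or shrink mid-loop
profile_urls = [link.get_attribute('href')
                for link in driver.find_elements_by_xpath(
                    "//div[@class='ld-lender-info-column']//h2//a")]
for url in profile_urls:
    driver.get(url)
    sleep(randint(3, 5))
    # ...scrape the profile as above, no driver.back() needed...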