Я пытаюсь извлечь данные с веб-сайта (в скрипте прокомментирован URL-адрес) кода, который раньше работал нормально, но сейчас он очень и очень медленный. Я думаю, что это может быть из-за веб-драйвера, раньше я использовал фантом js, а теперь без головы Firefox, но я не уверен. Я попытался увеличить / уменьшить время ожидания, чтобы проверить, не вызывает ли это какую-то разницу, но это не так.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import signal
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
import re
import time
import sys
# url = "https://www.zaubacorp.com/company-list/nic-18/company-type-PTC/listed-Unlisted/status-Active/age-D-company.html"
input_string = input('Enter URL to scrape: ')
url = str(input_string[1:-1])
if url == "":
print("URL should not be empty, please run again")
exit()
starttime = time.strftime("%Y-%m-%d")
logFile = open("Log_File_" + starttime + ".txt", "w+")
logFile.write("\nScript Started Time: " + starttime)
logFile.write("\nScraping URL: " + url)
print("Script started on " + starttime)
print("Scraping URL: " + url)
options = Options()
options.headless = True
try:
driver = webdriver.Firefox(options=options)
driver.get(url)
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, "//*[@id=\"block-system-main\"]/div[1]/div[2]/span")))
recordsCntTxt = driver.find_element_by_xpath('//*[@id="block-system-main"]/div[1]/div[2]').text
recordsCnt = 0
recordsCntTxtt = re.search('(.*) Companies Found', recordsCntTxt)
if recordsCntTxtt != "":
recordsCnt = recordsCntTxtt.group(1)
print(str(recordsCnt) + ' Companies Found')
logFile.write("\n" + str(recordsCnt) + ' Companies Found')
totalPagesCnt = 0
totalPagesCntEle = re.search('Page\s*1\s*of (.*)', recordsCntTxt)
if totalPagesCntEle != "":
totalPagesCnt = totalPagesCntEle.group(1)
totalPagesCnt = int(totalPagesCnt)
print("Total Page :" + str(totalPagesCnt))
logFile.write("\nTotal Page :" + str(totalPagesCnt))
curPagCnt = 1
curCompCnt = 0
complinks = []
compnames = []
comploactions = []
tmpurl = re.sub('-company.html', '/p-pgeInd-company.html', url)
print("***************************************************************************************")
print("Getting all companies urls started..")
print("***************************************************************************************")
logFile.write("\n***************************************************************************************")
logFile.write("\nGetting all companies urls started..")
logFile.write("\n***************************************************************************************")
logFile.flush()
while (curPagCnt <= totalPagesCnt):
try:
url = re.sub('pgeInd', str(curPagCnt), tmpurl)
print("Page" + ": " + str(curPagCnt) + " " + url)
logFile.write("\nPage" + ": " + str(curPagCnt) + " " + url)
driver.get(url)
WebDriverWait(driver, 100).until(
EC.presence_of_element_located((By.XPATH, "//*[@id=\"block-system-main\"]/div[1]/div[2]/span")))
logFile.flush()
for i in range(1, len(driver.find_elements_by_xpath('//*[@id="table"]/tbody/tr')) + 1):
# for i in range(1,2):
try:
tCompanyName = ""
tLocation = ""
compurl = ""
try:
compEle = driver.find_element_by_xpath('//*[@id="table"]/tbody/tr[{}]/td[2]'.format(i))
tCompanyName = compEle.text
except:
tCompanyName = ""
try:
compEle = driver.find_element_by_xpath('//*[@id="table"]/tbody/tr[{}]/td[2]/a'.format(i))
compurl = compEle.get_attribute("href")
except:
compurl = ""
try:
tLocation = driver.find_element_by_xpath('//*[@id="table"]/tbody/tr[{}]/td[3]'.format(i)).text
except:
tLocation = ""
if tCompanyName != "" and compurl != "" and tLocation != "":
complinks.insert(curCompCnt, compurl)
compnames.insert(curCompCnt, tCompanyName)
comploactions.insert(curCompCnt, tLocation)
curCompCnt += 1
except Exception as e:
try:
driver.save_screenshot("Warning-1.png")
print("Warning Handler1: ")
logFile.write("\nWarning Handler1: ")
print(str(e))
logFile.write("\n" + str(e))
exc_type, exc_obj, exc_tb = sys.exc_info()
print("\nLine info :" + str(exc_tb.tb_lineno))
logFile.write("\nLine info :" + str(exc_tb.tb_lineno))
logFile.flush()
except:
print("")
start = 0
curPagCnt += 1
except Exception as e:
try:
driver.save_screenshot("Warning-2.png")
print("Warning Handler2: ")
logFile.write("\nWarning Handler2: ")
print(str(e))
logFile.write("\n" + str(e))
exc_type, exc_obj, exc_tb = sys.exc_info()
print("\nLine info :" + str(exc_tb.tb_lineno))
logFile.write("\nLine info :" + str(exc_tb.tb_lineno))
logFile.flush()
except:
print("")
if curCompCnt == 0:
driver = driver = webdriver.Firefox(options=options)
driver.close()
driver.service.process.send_signal(signal.SIGTERM) # kill the specific phantomjs child proc
driver.quit()
print("Total Companies URL(s) Found: " + str(len(complinks)))
print("***************************************************************************************")
print("Getting companies details"
" started..")
print("***************************************************************************************")
logFile.write("\nTotal Companies URL(s) Found: " + str(len(complinks)))
logFile.write("\n***************************************************************************************")
logFile.write("\nGetting companies details started..")
logFile.write("\n***************************************************************************************")
logFile.flush()
outputFle = open("zaubacorp-records.csv", "w+")
outputFle.write('Company Name,Location,Company Age,Email ID,Website,Address,Director(s)')
outputFle.flush()
for curCompCnt in range(0, len(complinks)):
try:
driver = webdriver.Firefox(options=options)
driver.get(complinks[curCompCnt])
fcompname = compnames[curCompCnt]
floca = comploactions[curCompCnt]
CompanyAge = ""
EmailID = ""
Website = ""
Address = ""
Directors = ""
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "block-system-main")))
# time.sleep(3)
pgsrc = driver.page_source
CompanyAgeReg = re.search('<td><p>Age\s*of\s*Company</p></td><td><p>([\w\W]*?)</p></td>', pgsrc)
EmailIDReg = re.search('<p><b>\s*Email\s*ID:\s*</b>([\w\W]*?)</p>', pgsrc)
WebsiteReg = re.search('<span\s*id="addwebsitespan>"><a\s*href="(.*)"', pgsrc)
AddressReg = re.search('<p><b>Address:\s*<\/b><\/p>([\w\W]*?)<\/div', pgsrc)
try:
if CompanyAgeReg != "":
CompanyAge = CompanyAgeReg.group(1)
except:
CompanyAge = ""
try:
if EmailIDReg != "":
EmailID = EmailIDReg.group(1)
except:
EmailID = ""
try:
if WebsiteReg != "":
Website = WebsiteReg.group(1)
except:
Website = ""
try:
if AddressReg != "":
Address = AddressReg.group(1)
except:
Address = ""
CompanyAge = re.sub("<br />", '', CompanyAge)
CompanyAge = re.sub("<br>", '', CompanyAge)
CompanyAge = re.sub("</p>", '', CompanyAge)
CompanyAge = re.sub("<p>", '', CompanyAge)
EmailID = re.sub("<br />", '', EmailID)
EmailID = re.sub("<br>", '', EmailID)
EmailID = re.sub("</p>", '', EmailID)
EmailID = re.sub("<p>", '', EmailID)
Address = re.sub("<br />", '', Address)
Address = re.sub("<br>", '', Address)
Address = re.sub("</p>", '', Address)
Address = re.sub("<p>", '', Address)
Website = re.sub("<br />", '', Website)
Website = re.sub("<br>", '', Website)
Website = re.sub("</p>", '', Website)
Website = re.sub("<p>", '', Website)
Website = re.sub("#", '', Website)
fcompname = re.sub("\"", '', fcompname)
floca = re.sub("\"", '', floca)
CompanyAge = re.sub("\"", '', CompanyAge)
EmailID = re.sub("\"", '', EmailID)
Website = re.sub("\"", '', Website)
Address = re.sub("\"", '', Address)
directorCnt = 0
try:
directorCnt = len(driver.find_elements_by_xpath(
'//*[@id="block-system-main"]/div[2]/div[1]/div[7]/table/tbody/tr')) - 1
except:
directorCnt = 0
print('Count ' + str(curCompCnt + 1) + ':\tCompany Name: ' + fcompname)
logFile.write("\nCount: " + str(curCompCnt + 1) + ':\tCompany Name: ' + fcompname)
for i in range(1, directorCnt):
director = ""
try:
directorEle = driver.find_element_by_xpath('//*[@id="package{}"]/td[2]/p/a'.format(i))
director = directorEle.text
director = re.sub("\"", '', director)
except:
director = ""
if director != "":
if i == 1:
outputFle.write(
'\n\"' + fcompname + '\",\"' + floca + '\",\"' + CompanyAge + '\",\"' + EmailID + '\",\"' + Website + '\",\"' + Address + '\",\"' + director + '\"')
print(
' > ' + floca + ' > ' + CompanyAge + ' > ' + EmailID + ' > ' + Website + ' > ' + director + "\n")
else:
outputFle.write('\n\"' + fcompname + '\",,,,,,\"' + director + '\"')
print(' > ' + director + "\n")
i += 1
if directorCnt <= 0:
outputFle.write(
'\n\"' + fcompname + '\",\"' + floca + '\",\"' + CompanyAge + '\",\"' + EmailID + '\",\"' + Website + '\",\"' + Address + '\",')
print(' > ' + floca + ' > ' + CompanyAge + ' > ' + EmailID + ' > ' + Website + "\n")
outputFle.flush()
logFile.flush()
if curCompCnt % 100 == 0:
driver.close()
driver.service.process.send_signal(signal.SIGTERM) # kill the specific phantomjs child proc
driver.quit()
driver = webdriver.Firefox(options=options)
except Exception as e:
try:
print("Warning Handler3: ")
logFile.write("\nWarning Handler3: ")
print(str(e))
logFile.write("\n" + str(e))
exc_type, exc_obj, exc_tb = sys.exc_info()
print("\nLine info :" + str(exc_tb.tb_lineno))
logFile.write("\nLine info :" + str(exc_tb.tb_lineno))
logFile.flush()
driver.close()
driver.service.process.send_signal(signal.SIGTERM) # kill the specific phantomjs child proc
driver.quit()
driver = webdriver.Firefox(options=options)
except:
print("")
outputFle.flush()
outputFle.close()
endtime = time.strftime("%Y-%m-%d")
print("\nScript Completed Time:" + endtime)
logFile.write("\nScript Completed Time:" + endtime)
logFile.flush()
logFile.close()
except Exception as e:
print("Warning Handler4: ")
logFile.write("\nWarning Handler4: ")
print(str(e))
logFile.write("\n" + str(e))
exc_type, exc_obj, exc_tb = sys.exc_info()
print("\nLine info :" + str(exc_tb.tb_lineno))
logFile.write("\nLine info :" + str(exc_tb.tb_lineno))
logFile.flush()