Скрипт для извлечения данных с сайта очень медленный - PullRequest
0 голосов
/ 19 марта 2020

Я пытаюсь извлечь данные с веб-сайта (в скрипте прокомментирован URL-адрес) кода, который раньше работал нормально, но сейчас он очень и очень медленный. Я думаю, что это может быть из-за веб-драйвера, раньше я использовал фантом js, а теперь без головы Firefox, но я не уверен. Я попытался увеличить / уменьшить время ожидания, чтобы проверить, не вызывает ли это какую-то разницу, но это не так.

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import signal
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
import re
import time
import sys

# url = "https://www.zaubacorp.com/company-list/nic-18/company-type-PTC/listed-Unlisted/status-Active/age-D-company.html"
input_string = input('Enter URL to scrape: ')
url = str(input_string[1:-1])
if url == "":
    print("URL should not be empty, please run again")
    exit()

starttime = time.strftime("%Y-%m-%d")
logFile = open("Log_File_" + starttime + ".txt", "w+")
logFile.write("\nScript Started Time: " + starttime)
logFile.write("\nScraping URL: " + url)
print("Script started on " + starttime)
print("Scraping URL: " + url)

options = Options()
options.headless = True

try:
    driver = webdriver.Firefox(options=options)
    driver.get(url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//*[@id=\"block-system-main\"]/div[1]/div[2]/span")))
    recordsCntTxt = driver.find_element_by_xpath('//*[@id="block-system-main"]/div[1]/div[2]').text
    recordsCnt = 0
    recordsCntTxtt = re.search('(.*) Companies Found', recordsCntTxt)
    if recordsCntTxtt != "":
        recordsCnt = recordsCntTxtt.group(1)
    print(str(recordsCnt) + ' Companies Found')
    logFile.write("\n" + str(recordsCnt) + ' Companies Found')
    totalPagesCnt = 0
    totalPagesCntEle = re.search('Page\s*1\s*of (.*)', recordsCntTxt)
    if totalPagesCntEle != "":
        totalPagesCnt = totalPagesCntEle.group(1)
    totalPagesCnt = int(totalPagesCnt)
    print("Total Page :" + str(totalPagesCnt))
    logFile.write("\nTotal Page :" + str(totalPagesCnt))
    curPagCnt = 1
    curCompCnt = 0
    complinks = []
    compnames = []
    comploactions = []
    tmpurl = re.sub('-company.html', '/p-pgeInd-company.html', url)
    print("***************************************************************************************")
    print("Getting all companies urls started..")
    print("***************************************************************************************")
    logFile.write("\n***************************************************************************************")
    logFile.write("\nGetting all companies urls started..")
    logFile.write("\n***************************************************************************************")
    logFile.flush()
    while (curPagCnt <= totalPagesCnt):
        try:
            url = re.sub('pgeInd', str(curPagCnt), tmpurl)
            print("Page" + ": " + str(curPagCnt) + " " + url)
            logFile.write("\nPage" + ": " + str(curPagCnt) + " " + url)
            driver.get(url)
            WebDriverWait(driver, 100).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id=\"block-system-main\"]/div[1]/div[2]/span")))
            logFile.flush()
            for i in range(1, len(driver.find_elements_by_xpath('//*[@id="table"]/tbody/tr')) + 1):
                # for i in range(1,2):
                try:
                    tCompanyName = ""
                    tLocation = ""
                    compurl = ""
                    try:
                        compEle = driver.find_element_by_xpath('//*[@id="table"]/tbody/tr[{}]/td[2]'.format(i))
                        tCompanyName = compEle.text
                    except:
                        tCompanyName = ""
                    try:
                        compEle = driver.find_element_by_xpath('//*[@id="table"]/tbody/tr[{}]/td[2]/a'.format(i))
                        compurl = compEle.get_attribute("href")
                    except:
                        compurl = ""
                    try:
                        tLocation = driver.find_element_by_xpath('//*[@id="table"]/tbody/tr[{}]/td[3]'.format(i)).text
                    except:
                        tLocation = ""
                    if tCompanyName != "" and compurl != "" and tLocation != "":
                        complinks.insert(curCompCnt, compurl)
                        compnames.insert(curCompCnt, tCompanyName)
                        comploactions.insert(curCompCnt, tLocation)
                        curCompCnt += 1
                except Exception as e:
                    try:
                        driver.save_screenshot("Warning-1.png")
                        print("Warning Handler1: ")
                        logFile.write("\nWarning Handler1: ")
                        print(str(e))
                        logFile.write("\n" + str(e))
                        exc_type, exc_obj, exc_tb = sys.exc_info()
                        print("\nLine info :" + str(exc_tb.tb_lineno))
                        logFile.write("\nLine info :" + str(exc_tb.tb_lineno))
                        logFile.flush()
                    except:
                        print("")
                start = 0
            curPagCnt += 1
        except Exception as e:
            try:
                driver.save_screenshot("Warning-2.png")
                print("Warning Handler2: ")
                logFile.write("\nWarning Handler2: ")
                print(str(e))
                logFile.write("\n" + str(e))
                exc_type, exc_obj, exc_tb = sys.exc_info()
                print("\nLine info :" + str(exc_tb.tb_lineno))
                logFile.write("\nLine info :" + str(exc_tb.tb_lineno))
                logFile.flush()
            except:
                print("")
    if curCompCnt == 0:
        driver = driver = webdriver.Firefox(options=options)
    driver.close()
    driver.service.process.send_signal(signal.SIGTERM)  # kill the specific phantomjs child proc
    driver.quit()

    print("Total Companies URL(s) Found: " + str(len(complinks)))
    print("***************************************************************************************")
    print("Getting companies details"
          " started..")
    print("***************************************************************************************")
    logFile.write("\nTotal Companies URL(s) Found: " + str(len(complinks)))
    logFile.write("\n***************************************************************************************")
    logFile.write("\nGetting companies details started..")
    logFile.write("\n***************************************************************************************")
    logFile.flush()
    outputFle = open("zaubacorp-records.csv", "w+")
    outputFle.write('Company Name,Location,Company Age,Email ID,Website,Address,Director(s)')
    outputFle.flush()

    for curCompCnt in range(0, len(complinks)):
        try:
            driver = webdriver.Firefox(options=options)
            driver.get(complinks[curCompCnt])
            fcompname = compnames[curCompCnt]
            floca = comploactions[curCompCnt]
            CompanyAge = ""
            EmailID = ""
            Website = ""
            Address = ""
            Directors = ""
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "block-system-main")))
            # time.sleep(3)
            pgsrc = driver.page_source
            CompanyAgeReg = re.search('<td><p>Age\s*of\s*Company</p></td><td><p>([\w\W]*?)</p></td>', pgsrc)
            EmailIDReg = re.search('<p><b>\s*Email\s*ID:\s*</b>([\w\W]*?)</p>', pgsrc)
            WebsiteReg = re.search('<span\s*id="addwebsitespan>"><a\s*href="(.*)"', pgsrc)
            AddressReg = re.search('<p><b>Address:\s*<\/b><\/p>([\w\W]*?)<\/div', pgsrc)
            try:
                if CompanyAgeReg != "":
                    CompanyAge = CompanyAgeReg.group(1)
            except:
                CompanyAge = ""
            try:
                if EmailIDReg != "":
                    EmailID = EmailIDReg.group(1)
            except:
                EmailID = ""
            try:
                if WebsiteReg != "":
                    Website = WebsiteReg.group(1)
            except:
                Website = ""
            try:
                if AddressReg != "":
                    Address = AddressReg.group(1)
            except:
                Address = ""
            CompanyAge = re.sub("<br />", '', CompanyAge)
            CompanyAge = re.sub("<br>", '', CompanyAge)
            CompanyAge = re.sub("</p>", '', CompanyAge)
            CompanyAge = re.sub("<p>", '', CompanyAge)
            EmailID = re.sub("<br />", '', EmailID)
            EmailID = re.sub("<br>", '', EmailID)
            EmailID = re.sub("</p>", '', EmailID)
            EmailID = re.sub("<p>", '', EmailID)
            Address = re.sub("<br />", '', Address)
            Address = re.sub("<br>", '', Address)
            Address = re.sub("</p>", '', Address)
            Address = re.sub("<p>", '', Address)
            Website = re.sub("<br />", '', Website)
            Website = re.sub("<br>", '', Website)
            Website = re.sub("</p>", '', Website)
            Website = re.sub("<p>", '', Website)
            Website = re.sub("#", '', Website)

            fcompname = re.sub("\"", '', fcompname)
            floca = re.sub("\"", '', floca)
            CompanyAge = re.sub("\"", '', CompanyAge)
            EmailID = re.sub("\"", '', EmailID)
            Website = re.sub("\"", '', Website)
            Address = re.sub("\"", '', Address)
            directorCnt = 0
            try:
                directorCnt = len(driver.find_elements_by_xpath(
                    '//*[@id="block-system-main"]/div[2]/div[1]/div[7]/table/tbody/tr')) - 1
            except:
                directorCnt = 0
            print('Count ' + str(curCompCnt + 1) + ':\tCompany Name: ' + fcompname)
            logFile.write("\nCount: " + str(curCompCnt + 1) + ':\tCompany Name: ' + fcompname)
            for i in range(1, directorCnt):
                director = ""
                try:
                    directorEle = driver.find_element_by_xpath('//*[@id="package{}"]/td[2]/p/a'.format(i))
                    director = directorEle.text
                    director = re.sub("\"", '', director)
                except:
                    director = ""
                if director != "":
                    if i == 1:
                        outputFle.write(
                            '\n\"' + fcompname + '\",\"' + floca + '\",\"' + CompanyAge + '\",\"' + EmailID + '\",\"' + Website + '\",\"' + Address + '\",\"' + director + '\"')
                        print(
                            ' > ' + floca + ' > ' + CompanyAge + ' > ' + EmailID + ' > ' + Website + ' > ' + director + "\n")
                    else:
                        outputFle.write('\n\"' + fcompname + '\",,,,,,\"' + director + '\"')
                        print(' > ' + director + "\n")
                i += 1
            if directorCnt <= 0:
                outputFle.write(
                    '\n\"' + fcompname + '\",\"' + floca + '\",\"' + CompanyAge + '\",\"' + EmailID + '\",\"' + Website + '\",\"' + Address + '\",')
                print(' > ' + floca + ' > ' + CompanyAge + ' > ' + EmailID + ' > ' + Website + "\n")
            outputFle.flush()
            logFile.flush()
            if curCompCnt % 100 == 0:
                driver.close()
                driver.service.process.send_signal(signal.SIGTERM)  # kill the specific phantomjs child proc
                driver.quit()
                driver = webdriver.Firefox(options=options)
        except Exception as e:
            try:
                print("Warning Handler3: ")
                logFile.write("\nWarning Handler3: ")
                print(str(e))
                logFile.write("\n" + str(e))
                exc_type, exc_obj, exc_tb = sys.exc_info()
                print("\nLine info :" + str(exc_tb.tb_lineno))
                logFile.write("\nLine info :" + str(exc_tb.tb_lineno))
                logFile.flush()
                driver.close()
                driver.service.process.send_signal(signal.SIGTERM)  # kill the specific phantomjs child proc
                driver.quit()
                driver = webdriver.Firefox(options=options)
            except:
                print("")
    outputFle.flush()
    outputFle.close()
    endtime = time.strftime("%Y-%m-%d")
    print("\nScript Completed Time:" + endtime)
    logFile.write("\nScript Completed Time:" + endtime)
    logFile.flush()
    logFile.close()
except Exception as e:
    print("Warning Handler4: ")
    logFile.write("\nWarning Handler4: ")
    print(str(e))
    logFile.write("\n" + str(e))
    exc_type, exc_obj, exc_tb = sys.exc_info()
    print("\nLine info :" + str(exc_tb.tb_lineno))
    logFile.write("\nLine info :" + str(exc_tb.tb_lineno))
    logFile.flush()
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...