I have a script that I use to scrape web pages for specific information. The script is below, and it works fine.
I pull the URLs to be searched from a Postgres DB. When a URL has been searched and returned no results, I move that URL into a table that is used as a queue. Once all URLs have been searched the program exits, and I need it to start searching the queued URLs again.
The problem I have run into is that after the elapsed time is printed by print("--- %s seconds ---" % (time.time() - start_time)) and start_again() is called, the script simply stops; it is not restarted.
Could someone please guide me on this, and let me know if there is anything else I should improve in this code?
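For clarity, the behaviour I am after is roughly the following (just a sketch to show the intent, not part of my script; run_queue is a hypothetical stand-in for one full scraping pass like the loop below):

import time

def run_queue():
    # hypothetical stand-in for one full pass over the URLs fetched from Postgres
    print('processing all queued URLs...')

while True:
    run_queue()
    print('pass finished, starting again...')
    time.sleep(60)  # pause before re-reading the queue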
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import psycopg2
import os
import glob
import datetime
import time
from time import sleep
start_time = time.time()
final_results = []
positions = []
searched_url = []
def start_again():
    # NOTE: this only prints a message; nothing is actually re-run here
    print('Execution started again.....')
    pass
#def db_connect():
try:
    # Database connection string
    DSN = "dbname='postgres' user='postgres' host='localhost' password='postgres' port='5433'"
    # DWH table to which data is ported
    TABLE_NAME = 'staging.search_url'
    # Connecting DB..
    conn = psycopg2.connect(DSN)
    print("Database connected...")
    #conn.set_client_encoding('utf-8')
    cur = conn.cursor()
    ins_cur = conn.cursor()
    cur.execute("SET datestyle='German'")
except (Exception, psycopg2.Error) as error:
    print('database connection failed:', error)
    quit()
def get_products(url):
    print('Passed URL : ' + url)
    browser.get(url)
    names = browser.find_elements_by_xpath("//span[@class='pymv4e']")
    upd_product_name_list = list(filter(None, names))
    product_name = [x.text for x in upd_product_name_list]
    # keep only entries with some real text in them
    product = [x for x in product_name if len(x.strip()) > 2]
    #print(product)
    if not product and "buy" not in url:
        # parameterized query instead of string concatenation (avoids quoting/injection problems)
        ins_cur.execute("INSERT INTO staging.no_resulting_urls(url) VALUES (%s)", (url,))
        conn.commit()
    return product
##################################
def get_all_urls():
    search_url_fetch = """WITH CTE AS (SELECT distinct id,url_to_be_searched,main_category FROM staging.search_url where
        url_to_be_searched not IN( SELECT distinct CASE WHEN searched_url ILIKE '%buy%' THEN REPLACE(searched_url,'+buy','')
        ELSE searched_url END FROM staging.pla_crawler_results where crawler_date::date=CURRENT_DATE AND url_to_be_searched<>''))
        SELECT url_to_be_searched FROM CTE WHERE url_to_be_searched NOT IN(SELECT url FROM staging.no_resulting_urls) order by id"""
    #search_url_fetch="""select url_to_be_searched from test.url where id >130 order by id"""
    #search_url_fetch="""select url_to_be_searched from staging.search_url order by id"""
    psql_cursor = conn.cursor()
    psql_cursor.execute(search_url_fetch)
    search_url_list_r = psql_cursor.fetchall()
    print('Fetched DB values')
    return search_url_list_r
##################################
search_url_list = get_all_urls()
print("Total urls to process : ", len(search_url_list))
total_urls = len(search_url_list)
for row in search_url_list:
    sleep(10)
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    #options.add_argument('--headless')
    options.add_argument("--incognito")
    #browser = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver/', chrome_options=options)
    browser = webdriver.Chrome(executable_path='/users/it/Downloads/chromedrive/chromedriver', chrome_options=options)
    browser.implicitly_wait(30)
    # fetchall() returns one-column tuples, so take the URL directly
    # instead of stringifying the tuple and stripping characters
    new_url = row[0]
    print("\n")
    if total_urls <= 10:
        cur.execute("""DELETE FROM staging.no_resulting_urls""")
        print('cleared the no result table successfully')
        conn.commit()
cur.execute("""WITH CTE AS
(
SELECT DISTINCT searched_url FROM staging.pla_crawler_results
WHERE crawler_date ::DATE=CURRENT_DATE
GROUP BY searched_url
having COUNT(searched_url) <4
)
DELETE FROM staging.pla_crawler_results
WHERE crawler_date ::DATE=CURRENT_DATE
AND searched_url IN(SELECT searched_url FROM CTE)""")
conn.commit()
    filtered = get_products(new_url)
    searched_url.clear()
    if not filtered:
        # retry the same search with '+buy' appended
        new_url = new_url + '+buy'
        filtered = get_products(new_url)
    if not filtered:
        # still no products: close this browser instance and skip to the next URL
        browser.close()
        browser.quit()
        continue
    if filtered:
        positions.clear()
        for x in range(1, len(filtered) + 1):
            positions.append(str(x))
        searched_url.append(new_url)
        global_position = len(positions)
        print('global position first: ' + str(global_position))
        print("\n")
        #print(searched_url)
        company_name_list = browser.find_elements_by_xpath("//div[@class='LbUacb']")
        # use a list comprehension to get the text values, not the selenium objects
        company = [x.text for x in company_name_list]
        #print('Company Name:')
        #print(company, '\n')
        # ... (similar extraction blocks for urls, price and provider omitted) ...
        print('Final Result: ')
        result = zip(positions, filtered, urls, company, price, searched_url, provider)
        final_results.clear()
        final_results.append(tuple(result))
        print(final_results)
        print("\n")
        print('global position end :' + str(global_position))
        total_urls = total_urls - 1
        i = 0
        try:
            for d in final_results:
                # d is the tuple of result rows built above; insert them one by one
                while i < global_position:
                    print(d[i])
                    cur.execute("""INSERT into staging.pla_crawler_results(position, product_name, url, company, price, searched_url, provider) VALUES (%s, %s, %s, %s, %s, %s, %s)""", d[i])
                    print('Inserted successfully')
                    conn.commit()
                    i = i + 1
        except (Exception, psycopg2.Error) as error:
            print(error)
        # close the browser for this URL whether or not the inserts succeeded
        browser.close()
        browser.quit()
print("--- %s seconds ---" % (time.time() - start_time))
start_again()
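In case it helps: start_again() is currently only a stub that prints a message. What I think I need is to wrap the scraping pass above in a function and call it again, roughly like this sketch (process_all_urls is hypothetical; its body would be the for-loop above), though I am not sure this is the right way:

def process_all_urls():
    for row in get_all_urls():
        pass  # ...the scraping and insert logic from the loop above...

while get_all_urls():  # keep re-reading the queue until it comes back empty
    process_all_urls()
    print("--- %s seconds ---" % (time.time() - start_time))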