Попробуйте это:
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from bs4.element import Tag
# Demo script: search careratings.com for a company, then walk every link in
# the results table, open the linked PDF and remember the link text.
#
# open_pdf() is the helper defined at the end of this answer; it must be
# defined before this script runs.
# NOTE(review): the positional chromedriver path and find_element_by_name are
# Selenium 3 style calls - confirm the installed Selenium version supports them.
driver = webdriver.Chrome("C:/Users/RoshanB/Desktop/sentiment1/chromedriver_win32/chromedriver")
companyArray = []
try:
    driver.get('http://www.careratings.com/brief-rationale.aspx')
    time.sleep(4)  # crude fixed wait for the search page to load
    try:
        search = driver.find_element_by_name('txtSearchCompany_brief')
        search.send_keys("Reliance Capital Limited")
        search.send_keys(Keys.RETURN)
        time.sleep(4)  # crude fixed wait for the search results to render

        soup = BeautifulSoup(driver.page_source, 'lxml')
        companies = soup.find("table", class_="table1")
        if companies is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise LookupError("results table with class 'table1' not found")

        # Only <a> tags that actually carry an href are of interest.
        for tag in companies.find_all("a", href=True):
            # hrefs are site-relative and may contain raw spaces.
            url_string = "http://www.careratings.com/" + tag['href'].replace(" ", "%20")
            # Open the PDF via open_pdf(), which launches its own Chrome instance.
            open_pdf(url_string)
            companyArray.append(tag.text)
    except Exception as e:
        # Best effort: report the problem and still print what was collected.
        print(e)
finally:
    # Always close the search browser, even on Ctrl-C or a failed page load.
    driver.quit()
print(companyArray)
Вывод (O/P):
Список компаний:
['Reliance Capital Limited', 'Dewan Housing Finance Corporation Limited Ratings of various Securitisation transactions', 'Gamut Infosystems Limited', 'Henraajh Feeds India Private Limited', 'Ramdhan Spintex', 'Tripurashwari Agro Product Private Limited', 'Kalyaneswari Polyfabs Private Limited', 'Rakesh Kumar Gupta Rice Mills Private Limited', 'Sri Satnam Jewells Private Limited', 'Pitambara Foods', 'Sujata Udit Builders Private Limited', 'Kavita Industries', 'Krishna Industries', 'Pallavi Motors Private Limited', 'Anjani Cotgin', 'Sarala Foods Private Limited', 'B.M. Enterprises', 'Bihani Agro Foods Private Limited', 'M V Agrotech Private Limited', 'J.S.R & Company', 'ARG Royal Ensign Developers Private Limited', 'Ranergy Solutions Private Limited', 'RSI Switchgear Private Limited', 'Jyoti Chandrashekhar Bawankule', 'Sadguru Engineers & Allied Services Private Limited', 'R B Rungta Steels & Food Products Private Limited', 'V. N. Marketing', 'Aussee Oats India Limited', 'Dewan Housing Finance Corporation Limited', 'Dewan Housing Finance Corporation Limited', 'Dewan Housing Finance Corporation Limited', 'Dewan Housing Finance Corporation Limited', 'Dewan Housing Finance Corporation Limited', 'Dewan Housing Finance Corporation Limited', 'Dewan Housing Finance Corporation Limited', 'Dewan Housing Finance Corporation Limited', 'Dewan Housing Finance Corporation Limited', 'Dewan Housing Finance Corporation Limited', 'Dewan Housing Finance Corporation Limited', 'Dewan Housing Finance Corporation Limited', 'Dewan Housing Finance Corporation Limited', 'Pacific Medical University', 'S. K. Pradhan Construction Company Private Limited', 'Stadmed Private Limited', 'Namra Finance Limited', 'S Kumars Associates', 'R. R. and Company Private Limited']
Если вы хотите извлечь (спарсить) названия компаний, необходимо установить пакет BeautifulSoup:
pip install beautifulsoup4==4.7.1
Где:
txtSearchCompany_brief — значение атрибута name поля ввода для поиска
table1 — класс таблицы с результатами поиска
Как скачать PDF-файл по ссылке (см. вопрос):
Chromedriver, Selenium - Автоматическая загрузка
Как прочитать содержимое PDF-файла (см. вопрос):
Как читать построчно в файле PDF с помощью PyPdf?
Открыть файл PDF в новой вкладке:
def open_pdf(url_string):
    """Load ``url_string`` in a newly started Chrome WebDriver session.

    A separate driver is created on every call; it is neither returned nor
    explicitly quit by this function.

    Args:
        url_string: URL to navigate to (the calling script passes a PDF link).
    """
    chromedriver_path = "C:/Users/RoshanB/Desktop/sentiment1/chromedriver_win32/chromedriver"
    pdf_browser = webdriver.Chrome(chromedriver_path)
    pdf_browser.get(url_string)