Сайт динамический (содержимое подгружается JavaScript-ом), поэтому придётся использовать инструмент управления браузером, например `selenium`:
import re
import time

from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.common.by import By
def get_directors(_html):
    """Extract the list of names that follows the executives section.

    Parses ``_html``, collects the text of every ``div`` whose class
    attribute is ``name ng-binding``, and returns the entries located after
    the ``'Compensation for all Key Executives'`` label, excluding the very
    last collected entry.

    Raises:
        ValueError: if the ``'Compensation for all Key Executives'`` label
            is not among the collected texts.
    """
    document = soup(_html, 'html.parser')
    labels = []
    for node in document.find_all('div', {'class': 'name ng-binding'}):
        labels.append(node.text)
    # Everything up to and including the marker label is skipped.
    first = labels.index('Compensation for all Key Executives') + 1
    return labels[first:-1]
# Scrape the board-member biographies for one company page and print them.
# Uses ``find_element(s)(By.<strategy>, value)``: the older
# ``find_element(s)_by_*`` helpers were removed in Selenium 4.3, while the
# ``By`` form works on both Selenium 3 and 4.
_board = {}
d = webdriver.Chrome('/path/to/chromedriver')
try:
    d.get('https://www.morningstar.com/stocks/xnys/mmm/quote.html')
    time.sleep(5)  # fixed pause so the client-side rendering can finish
    _exec = d.find_elements(By.CLASS_NAME, "mds-button")
    # NOTE(review): index 8 is tied to the page layout this was written
    # against - confirm it still points at the executives button.
    _exec[8].click()
    time.sleep(3)
    d.find_element(By.LINK_TEXT, "Board of Directors").click()
    time.sleep(3)
    # NOTE(review): the 19:31 slice is likewise layout-specific; presumably
    # it selects the director rows among all 'person-row' elements.
    full_directors = d.find_elements(By.CLASS_NAME, 'person-row')[19:31]
    for _name, _link in zip(get_directors(d.page_source), full_directors):
        _link.click()  # presumably expands the row
        time.sleep(3)
        d.find_element(By.LINK_TEXT, "Profile").click()
        time.sleep(3)
        # The last 'biography' div in the current page source is stored.
        _bios = soup(d.page_source, 'html.parser').find_all('div', {'class': 'biography'})
        _board[_name] = _bios[-1].text
        _link.click()  # presumably collapses the row again
        time.sleep(3)
finally:
    # Always shut the browser down, even if a lookup above fails; otherwise
    # the Chrome/chromedriver processes are left running.
    d.quit()
print(_board)
Вывод (сокращен для экономии места):
{'Inge G. Thulin': '\nBiography\n\n Mr. Thulin is the Chairman of the Board, President and Chief Executive Officer of 3M Company. Mr. Thulin served as President and Chief Executive Officer of 3M Company from ....', 'Sondra L. Barbour': '\nBiography\n\n Ms. Barbour is Executive Vice President, Information Systems and Global Solutions, Lockheed Martin Corporation, a high technology aerospace and defense company. Since joini....', 'Thomas K. Brown': '\nBiography\n\n Mr. Brown is the Retired Group Vice President, Global Purchasing, Ford Motor Company, a global automotive industry leader. Mr. Brown served in various leadership capacities....', 'David B. Dillon': '\nBiography\n\n —\n \n....', 'Michael L Eskew': '\nBiography\n\n Mr. Eskew is the Retired Chairman of the Board and Chief Executive Officer, United Parcel Service, Inc., a provider of specialized transportation and logistics services. Mr....', 'Herbert L. Henkel': '\nBiography\n\n Mr. Henkel is the Retired Chairman of the Board and Chief Executive Officer, Ingersoll-Rand plc, a manufacturer of industrial products and components. Mr. Henkel retired as....', 'Amy Hood': "\nBiography\n\n On August 13, 2017, the Board of Directors of 3M Company elected Amy E. Hood to the Company's Board of Directors, effective August 13, 2017. At Microsoft, Hood is responsib....", 'Muhtar Kent': "\nBiography\n\n Mr. Kent is the Chairman of the Board and Chief Executive Officer, The Coca-Cola Company, the world's largest beverage company. Mr. Kent has held the position of Chairman o....", 'Edward M. Liddy': '\nBiography\n\n Mr. Liddy is the Retired Chairman of the Board and Chief Executive Officer, The Allstate Corporation, and former Partner at Clayton, Dubilier & Rice, LLC, a private equity ....', 'Dambisa F. Moyo': "\nBiography\n\n On August 12, 2018, the Board of Directors of 3M Company elected Dambisa F. Moyo to the Company's Board of Directors, effective August 12, 2018. Dr. Moyo is the founder and....", 'Gregory R. 
Page': "\nBiography\n\n On February 1, 2016, the Board of Directors of 3M Company elected Gregory R. Page to the Company's Board of Directors, effective February 1, 2016. Page previously was Cargi....", 'Patricia A. Woertz': "\nBiography\n\n On February 1, 2016, the Board of Directors of 3M Company elected Patricia A. Woertz to the Company's Board of Directors, effective at the close of business on February 2, ...."}
Edit:
Запись результатов в CSV-файл:
import csv
# Dump the scraped biographies to CSV, one (name, biography) row per entry.
# newline='' hands line-ending control to the csv module (without it, extra
# blank rows appear on Windows); an explicit encoding keeps non-ASCII
# characters in the biographies from depending on the platform default.
with open('filename.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'biography'])
    writer.writerows(_board.items())
Чтобы получить более общее решение для обработки различных URL-адресов (например, взятых из списка), вынесите ту же логику в функцию:
def scrape_bios(_driver: webdriver.Remote, _url: str) -> dict:
    """Scrape board-member biographies from a single quote page.

    Loads ``_url`` in ``_driver``, clicks through to the "Board of Directors"
    tab and, for each selected row, opens its "Profile" link and stores the
    text of the last ``div`` with class ``biography`` found in the page.

    Args:
        _driver: An already started Selenium WebDriver instance; it is left
            open so the caller can reuse it for further URLs.
        _url: Address of the page to load.

    Returns:
        A dict mapping each name returned by ``get_directors`` to the raw
        biography text scraped for it.
    """
    # ``find_element(s)(By.<strategy>, value)`` is used because the older
    # ``find_element(s)_by_*`` helpers were removed in Selenium 4.3, while
    # this form works on both Selenium 3 and 4.
    _driver.get(_url)
    time.sleep(5)  # fixed pause so the client-side rendering can finish
    _exec = _driver.find_elements(By.CLASS_NAME, "mds-button")
    # NOTE(review): index 8 is tied to the page layout this was written
    # against - confirm it still points at the executives button.
    _exec[8].click()
    time.sleep(3)
    _board = {}
    _driver.find_element(By.LINK_TEXT, "Board of Directors").click()
    time.sleep(3)
    # NOTE(review): the 19:31 slice is likewise layout-specific; presumably
    # it selects the director rows among all 'person-row' elements.
    full_directors = _driver.find_elements(By.CLASS_NAME, 'person-row')[19:31]
    for _name, _link in zip(get_directors(_driver.page_source), full_directors):
        _link.click()  # presumably expands the row
        time.sleep(3)
        _driver.find_element(By.LINK_TEXT, "Profile").click()
        time.sleep(3)
        # The last 'biography' div in the current page source is stored.
        _bios = soup(_driver.page_source, 'html.parser').find_all('div', {'class': 'biography'})
        _board[_name] = _bios[-1].text
        _link.click()  # presumably collapses the row again
        time.sleep(3)
    return _board
Теперь можно обойти список URL-адресов в цикле:
# Run the scraper over every address in ``urls`` (an iterable of page URLs
# that must be defined beforehand) using a single shared browser session.
d = webdriver.Chrome('/path/to/chromedriver')
all_results = {}
try:
    for url in urls:
        _results = scrape_bios(d, url)
        # Keep each page's result; ``_results`` alone would be overwritten
        # on the next iteration and only the last page would survive.
        all_results[url] = _results
finally:
    # Close the browser even if one of the pages fails to scrape.
    d.quit()