Скрейпинг таблицы с веб-сайта с помощью Python - PullRequest
0 голосов

Я пытаюсь получить всю таблицу (все университеты, их более 1000) с этого сайта - https://www.timeshighereducation.com/world-university-rankings/2018/world-ranking#!/page/0/length/25/sort_by/rank/sort_order/asc/cols/scores.

Для этой цели я использую следующие библиотеки - requests и BeautifulSoup, и вот мой код:

import requests
from bs4 import BeautifulSoup

# Download the rankings page.
html_content = requests.get('https://www.timeshighereducation.com/world-university-rankings/2018/world-ranking#!/page/0/length/25/sort_by/rank/sort_order/asc/cols/stats')
# BeautifulSoup expects the markup itself, so pass the response body text
# rather than the Response object.
soup = BeautifulSoup(html_content.text, 'lxml')

Затем я ищу таблицу:

# Take the first <table> element found in the parsed document
# (indexing with [0] raises IndexError when no table is present).
table = soup.find_all('table')[0]

Но в результате я не вижу содержимого самой таблицы: ни <tbody>, ни строк <tr>, ни ячеек <td>.

HTML код:

Пожалуйста, помогите мне получить всю информацию с этого сайта и построить из неё DataFrame.

Ответы [ 2 ]

0 голосов
/ 04 мая 2018

Попробуйте следующий подход. Нужный URL можно найти, если посмотреть сетевую активность в разделе XHR на вкладке Network в инструментах разработчика (DevTools). Чтобы получить данные из этого JSON-ответа, ваш скрипт должен выглядеть так:

import requests

# JSON endpoint that holds the complete rankings data set.
URL = "https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json"

# Use a timeout so the script cannot hang indefinitely, and stop on an HTTP
# error instead of trying to decode an error page as JSON.
res = requests.get(URL, timeout=30)
res.raise_for_status()

# Each entry under 'data' describes one university; print a few of its fields.
for items in res.json()['data']:
    rank = items['rank']
    name = items['name']
    intstudents = items['stats_pc_intl_students']
    ratio = items['stats_female_male_ratio']
    print(rank,name,intstudents,ratio)

Вывод:

1 University of Oxford 38% 46 : 54
2 University of Cambridge 35% 45 : 55
=3 California Institute of Technology 27% 31 : 69
=3 Stanford University 22% 42 : 58
5 Massachusetts Institute of Technology 34% 37 : 63
6 Harvard University 26% None
0 голосов
/ 04 мая 2018

Похоже, что сайт динамический: беглый просмотр исходного кода страницы показывает, что сама таблица в нём отсутствует. Следовательно, вам необходимо использовать инструмент автоматизации браузера, например selenium:

import re
from collections import namedtuple

from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
# Start a Chrome session driven by the chromedriver binary at this local path.
d = webdriver.Chrome('/Users/jamespetullo/Downloads/chromedriver')
# Open the 2018 world rankings page in that browser session.
d.get('https://www.timeshighereducation.com/world-university-rankings/2018/world-ranking#!/page/0/length/25/sort_by/rank/sort_order/asc/cols/scores')
def page_results(html):
    """Extract the ranking rows from the HTML of one results page.

    Args:
        html: Markup of the rankings page as a string.

    Returns:
        A list of ``school`` namedtuples with the fields ``ranking``,
        ``name``, ``location`` and ``scores``. ``scores`` is a dict that maps
        the six score labels to the text of the matching table cells. The
        columns are combined with ``zip``, so the result is only as long as
        the shortest column that was found.
    """
    school = namedtuple('school', ['ranking', 'name', 'location', 'scores'])
    score_keys = ['overall', 'teaching', 'research', 'citations', 'income', 'outlook']

    # Parse the markup once and reuse the tree for every lookup.
    document = soup(html, 'lxml')

    rankings = [cell.text for cell in document.find_all('td', {'class': 'rank sorting_1 sorting_2'})]
    names = [anchor.text for anchor in document.find_all('a', {'class': 'ranking-institution-title'})]
    locations = [block.text for block in document.find_all('div', {'class': 'location'})]

    # Raw string: the backslashes belong to the regex, not to the str literal.
    score_class = re.compile(r'scores\s+[\w_]+\-score')
    full_scores = [cell.text for cell in document.find_all('td', {'class': score_class})]

    # The score cells come back as one flat list; regroup them row by row.
    width = len(score_keys)
    final_scores = [
        dict(zip(score_keys, full_scores[start:start + width]))
        for start in range(0, len(full_scores), width)
    ]
    return [school(*row) for row in zip(rankings, names, locations, final_scores)]

# Scrape the page that is already loaded, then walk the numbered pagination
# links and scrape each page they lead to.
pages = [page_results(d.page_source)]
# find_elements(By.TAG_NAME, ...) replaces find_elements_by_tag_name, which
# was removed in Selenium 4.
links = d.find_elements(By.TAG_NAME, 'a')
for link in links:
    # Only anchors whose visible text is a number are treated as page links.
    if not link.text.isdigit():
        continue
    try:
        link.click()
    except WebDriverException:
        # Best effort: skip a link that cannot be clicked, but do not hide
        # unrelated errors or a KeyboardInterrupt as a bare ``except`` would.
        continue
    pages.append(page_results(d.page_source))

Пример вывода:

[[school(ranking=u'1', name=u'University of Oxford', location=u'United Kingdom', scores={'outlook': u'95.0', 'overall': u'94.3', 'research': u'99.5', 'citations': u'99.1', 'income': u'63.7', 'teaching': u'86.7'}), school(ranking=u'2', name=u'University of Cambridge', location=u'United Kingdom', scores={'outlook': u'93.0', 'overall': u'93.2', 'research': u'97.8', 'citations': u'97.5', 'income': u'51.5', 'teaching': u'87.8'}), school(ranking=u'=3', name=u'California Institute of Technology', location=u'United States', scores={'outlook': u'59.7', 'overall': u'93.0', 'research': u'97.5', 'citations': u'99.5', 'income': u'92.6', 'teaching': u'90.3'}), school(ranking=u'=3', name=u'Stanford University', location=u'United States', scores={'outlook': u'77.6', 'overall': u'93.0', 'research': u'96.7', 'citations': u'99.9', 'income': u'60.5', 'teaching': u'89.1'}), school(ranking=u'5', name=u'Massachusetts Institute of Technology', location=u'United States', scores={'outlook': u'87.6', 'overall': u'92.5', 'research': u'91.9', 'citations': u'100.0', 'income': u'88.4', 'teaching': u'87.3'}), school(ranking=u'6', name=u'Harvard University', location=u'United States', scores={'outlook': u'79.7', 'overall': u'91.8', 'research': u'98.4', 'citations': u'99.7', 'income': u'46.4', 'teaching': u'84.2'}), school(ranking=u'7', name=u'Princeton University', location=u'United States', scores={'outlook': u'78.7', 'overall': u'91.1', 'research': u'93.9', 'citations': u'99.6', 'income': u'58.0', 'teaching': u'85.7'}), school(ranking=u'8', name=u'Imperial College London', location=u'United Kingdom', scores={'outlook': u'96.6', 'overall': u'89.2', 'research': u'88.7', 'citations': u'96.7', 'income': u'71.6', 'teaching': u'81.7'}), school(ranking=u'9', name=u'University of Chicago', location=u'United States', scores={'outlook': u'69.6', 'overall': u'88.6', 'research': u'90.1', 'citations': u'99.4', 'income': u'39.8', 'teaching': u'85.3'}), school(ranking=u'=10', name=u'ETH Zurich \u2013 Swiss 
Federal Institute of Technology Zurich', location=u'Switzerland', scores={'outlook': u'98.1', 'overall': u'87.7', 'research': u'92.0', 'citations': u'94.3', 'income': u'60.3', 'teaching': u'76.4'}), school(ranking=u'=10', name=u'University of Pennsylvania', location=u'United States', scores={'outlook': u'61.3', 'overall': u'87.7', 'research': u'90.1', 'citations': u'98.5', 'income': u'56.9', 'teaching': u'83.7'}), school(ranking=u'12', name=u'Yale University', location=u'United States', scores={'outlook': u'64.6', 'overall': u'87.6', 'research': u'87.0', 'citations': u'98.4', 'income': u'45.1', 'teaching': u'86.7'}), school(ranking=u'13', name=u'Johns Hopkins University', location=u'United States', scores={'outlook': u'70.6', 'overall': u'86.5', 'research': u'88.1', 'citations': u'98.4', 'income': u'95.8', 'teaching': u'76.1'}), school(ranking=u'14', name=u'Columbia University', location=u'United States', scores={'outlook': u'76.6', 'overall': u'86.0', 'research': u'83.3', 'citations': u'98.8', 'income': u'41.3', 'teaching': u'82.2'}), school(ranking=u'15', name=u'University of California, Los Angeles', location=u'United States', scores={'outlook': u'59.5', 'overall': u'85.7', 'research': u'88.1', 'citations': u'97.9', 'income': u'48.6', 'teaching': u'80.7'}), school(ranking=u'16', name=u'UCL', location=u'United Kingdom', scores={'outlook': u'94.6', 'overall': u'85.3', 'research': u'88.2', 'citations': u'94.6', 'income': u'41.2', 'teaching': u'74.4'}), school(ranking=u'17', name=u'Duke University', location=u'United States', scores={'outlook': u'62.5', 'overall': u'85.1', 'research': u'80.6', 'citations': u'98.3', 'income': u'100.0', 'teaching': u'80.7'}), school(ranking=u'18', name=u'University of California, Berkeley', location=u'United States', scores={'outlook': u'64.5', 'overall': u'84.3', 'research': u'84.5', 'citations': u'99.8', 'income': u'37.5', 'teaching': u'77.4'}), school(ranking=u'19', name=u'Cornell University', location=u'United States', 
scores={'outlook': u'69.2', 'overall': u'84.2', 'research': u'86.6', 'citations': u'97.6', 'income': u'34.6', 'teaching': u'76.2'}), school(ranking=u'20', name=u'Northwestern University', location=u'United States', scores={'outlook': u'59.2', 'overall': u'83.3', 'research': u'86.7', 'citations': u'96.9', 'income': u'78.2', 'teaching': u'72.6'}), school(ranking=u'21', name=u'University of Michigan', location=u'United States', scores={'outlook': u'55.8', 'overall': u'83.1', 'research': u'86.3', 'citations': u'95.7', 'income': u'46.2', 'teaching': u'77.2'}), school(ranking=u'=22', name=u'National University of Singapore', location=u'Singapore', scores={'outlook': u'95.8', 'overall': u'82.8', 'research': u'88.2', 'citations': u'81.3', 'income': u'61.9', 'teaching': u'77.4'}), school(ranking=u'=22', name=u'University of Toronto', location=u'Canada', scores={'outlook': u'80.1', 'overall': u'82.8', 'research': u'84.8', 'citations': u'92.6', 'income': u'46.5', 'teaching': u'74.6'}), school(ranking=u'24', name=u'Carnegie Mellon University', location=u'United States', scores={'outlook': u'79.1', 'overall': u'81.9', 'research': u'83.7', 'citations': u'99.7', 'income': u'50.4', 'teaching': u'65.8'}), school(ranking=u'=25', name=u'London School of Economics and Political Science', location=u'United Kingdom', scores={'outlook': u'92.2', 'overall': u'79.4', 'research': u'72.0', 'citations': u'94.9', 'income': u'33.7', 'teaching': u'71.8'})]]
...