Для тех, кто хочет использовать R (пакет rvest), вот код, который собирает данные о составе команды в таблицу данных (data frame):
library(tidyverse)
library(magrittr)
library(rvest)
# Scrape the football roster page into `final`, a tibble with one row per
# player and the columns position, height, weight, number, name, yr,
# hometown and high.school (all character).
url <- "https://rolltide.com/roster.aspx?roster=226&path=football"
page <- read_html(url)

# Each player is one <li> of the roster list. Selecting the <li> nodes
# directly avoids guessing an upper bound on the number of players.
player_xpath <- paste0(
  '//*[@id="main-content"]/article/div[4]/div/div[1]/div[2]/div[1]',
  "/section/ul/li"
)
players <- html_nodes(page, xpath = player_xpath)

# XPath of every field, relative to the player's own <li> node.
field_xpaths <- c(
  position = "./div[1]/div[1]/div[2]/div[1]/span[1]/text()",
  height = "./div[1]/div[1]/div[2]/div[1]/span[2]",
  weight = "./div[1]/div[1]/div[2]/div[1]/span[3]/text()",
  number = "./div[1]/div[1]/div[2]/div[2]/span/span",
  name = "./div[1]/div[1]/div[2]/div[2]/p/a",
  yr = "./div[1]/div[2]/div[1]/span[1]",
  hometown = "./div[1]/div[2]/div[1]/span[2]/text()",
  high.school = "./div[1]/div[2]/div[1]/span[3]/text()"
)

# Return exactly one string per node: the text matched by `rel_xpath`, or NA
# when the node has no match. Keeping one value per player is what keeps the
# columns aligned when a field is missing for some player.
extract_field <- function(nodes, rel_xpath) {
  vapply(
    seq_along(nodes),
    function(i) {
      # html_text() is exported by rvest itself, so this does not rely on
      # xml2 being attached (rvest >= 1.0 no longer attaches it).
      txt <- html_text(html_nodes(nodes[[i]], xpath = rel_xpath))
      if (length(txt) == 0) {
        NA_character_
      } else {
        # Several matches for one player are joined into a single value
        # instead of spilling into extra rows.
        paste0(txt, collapse = "")
      }
    },
    character(1)
  )
}

final <- as_tibble(
  lapply(field_xpaths, function(xp) extract_field(players, xp))
)

# Only the position text is whitespace-trimmed, as before.
final$position <- str_trim(final$position)

# Drop list items that matched none of the fields (i.e. are not players).
final <- final[rowSums(!is.na(final)) > 0, ]
Хитрость заключается в том, чтобы использовать XPath вместо CSS-селектора и передавать его через аргумент xpath= в вызове html_nodes().
Это, конечно, немного уродливее, но не требует Selenium или других тяжелых настроек.
EDIT: посмотрите ответ QHarr выше — там приведён более простой код.