Я подготовил решение на основе tidyverse с использованием пакета rvest. Ключевая идея в том, что нужную HTML-таблицу можно выбрать по её XPath, как описано здесь (см. ссылку в комментарии к коду ниже). Возможно, вы захотите переименовать столбцы с помощью функции rename из пакета dplyr. Кроме того, как видно из предупреждений, некоторые столбцы можно оставить символьными (строковыми), вручную исправить проблемные значения и только затем выполнить преобразование типов.
# One-time setup (uncomment to install the packages):
# install.packages("rvest")
# install.packages("tidyverse")
library(rvest)
#> Loading required package: xml2
library(tidyverse)

url <- "http://www.epa.gov/wqc/national-recommended-water-quality-criteria-human-health-criteria-table"

# XPath that selects the table node on the page; technique described at
# https://www.r-bloggers.com/using-rvest-to-scrape-an-html-table/
table_xpath <- '//*[@id="main-content"]/div[2]/div[1]/div/div/table'

# html_table() yields a list with one data frame per matched node.
scraped_tables <- html_table(html_nodes(read_html(url), xpath = table_xpath))

# Take the first data frame and parse its columns using a compact spec:
# c = character, i = integer, n = number. Values that do not parse
# produce warnings, so they can be reviewed and fixed by hand.
df <- readr::type_convert(
  purrr::pluck(scraped_tables, 1),
  col_types = "cinnic"
)
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [70, 2]: expected an integer, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [77, 2]: expected an integer, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [80, 2]: expected an integer, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [85, 2]: expected an integer, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [17, 3]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [26, 3]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [34, 3]: expected a number, but got 'Total'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [35, 3]: expected a number, but got 'Total'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [63, 3]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [77, 3]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [9, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [10, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [17, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [26, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [34, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [35, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [37, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [68, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [77, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [80, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [85, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [17, 5]: expected an integer, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [26, 5]: expected an integer, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [34, 5]: expected an integer, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [35, 5]: expected an integer, but got '—'
# Print a compact overview of the result: one line per column with its type.
dplyr::glimpse(df)
#> Observations: 122
#> Variables: 6
#> $ Pollutant <chr> …
#> $ `CAS Number` <int> …
#> $ `Human Health for the consumption of Water + Organism\n\t\t\t\t(µg/L)` <dbl> …
#> $ `Human Health for the consumption of Organism Only\n\t\t\t\t(µg/L)` <dbl> …
#> $ `Publication Year` <int> …
#> $ Notes <chr> …
Создано 2019-10-18 с помощью пакета reprex (v0.3.0)