Пакет htmltab можно использовать для извлечения (скрейпинга) этих вики-таблиц.
library(htmltab)
#data cleaning steps
# Cell-cleaning callback handed to htmltab() through its bodyFun argument.
#
# node: an XML node whose text content is read with XML::xmlValue().
# Returns the node text re-encoded from UTF-8 to Windows-1252 (bytes that
# cannot be converted are replaced by their "<xx>" hex form), with the tail
# of the string removed starting at a whitespace character that is
# immediately followed by "<", "†" or "‡".
bFun <- function(node) {
  cell_text <- XML::xmlValue(node)
  recoded <- iconv(
    cell_text,
    from = "UTF-8",
    to = "Windows-1252",
    sub = "byte"
  )
  gsub(pattern = "\\s[<†‡].*$", replacement = "", x = recoded)
}
# Scrape the table selected by `which = 4` from the Australian Open men's
# singles champions page. rm_superscript = FALSE asks htmltab to keep
# superscript text (footnote markers) instead of stripping it.
df1 <- htmltab(
  doc = "https://en.wikipedia.org/wiki/List_of_Australian_Open_men%27s_singles_champions",
  which = 4,
  rm_superscript = FALSE,
  # bodyFun is not required if you are executing the code from a Mac.
  bodyFun = bFun
)
head(df1)
Это дает:
# Year[f] Country Champion Country Runner-up Score in the final[4][14]
#2 1969 AUS Rod Laver[b] ESP Andrés Gimeno 6–3, 6–4, 7–5
#3 1970 USA Arthur Ashe AUS Dick Crealy 6–4, 9–7, 6–2
#4 1971 AUS Ken Rosewall USA Arthur Ashe 6–1, 7–5, 6–3
#5 1972 AUS Ken Rosewall AUS Malcolm Anderson 7–6(7–2), 6–3, 7–5
#6 1973 AUS John Newcombe NZL Onny Parun 6–3, 6–7, 7–5, 6–1
#7 1974 USA Jimmy Connors AUS Phil Dent 7–6(9–7), 6–4, 4–6, 6–3
и
# Scrape the table selected by `which = 3` from the Wimbledon gentlemen's
# singles champions page. rm_superscript = FALSE asks htmltab to keep
# superscript text (footnote markers) instead of stripping it.
df2 <- htmltab(
  doc = "https://en.wikipedia.org/wiki/List_of_Wimbledon_gentlemen%27s_singles_champions",
  which = 3,
  rm_superscript = FALSE,
  # bodyFun is not required if you are executing the code from a Mac.
  bodyFun = bFun
)
head(df2)
дает
# Year[d] Country Champion Country Runner-up Score in the final[4]
#2 1877 BRI[e] Spencer Gore BRI William Marshall 6–1, 6–2, 6–4
#3 1878 BRI Frank Hadow BRI Spencer Gore 7–5, 6–1, 9–7
#4 1879 BRI John Hartley BRI Vere St. Leger Goold 6–2, 6–4, 6–2
#5 1880 BRI John Hartley BRI Herbert Lawford 6–3, 6–2, 2–6, 6–3
#6 1881 BRI William Renshaw BRI John Hartley 6–0, 6–1, 6–1
#7 1882 BRI William Renshaw BRI Ernest Renshaw 6–1, 2–6, 4–6, 6–2, 6–2