Я пытаюсь программно извлекать и хранить данные в (именованных) фреймах данных с соответствующими именами столбцов и т. Д. Мой цикл for не работает для извлечения данных и их сохранения в соответствующим образом именованном фрейме данных (см. Pg $ Df_names),Приветствую за вашу помощь.Фиксированный код доступен для ознакомления в iv) - Пожалуйста, предложите любые улучшения, которые у вас могут быть.
necessary_packages <- c("readr", "readxl", "xlsx","stringr", "dplyr", "tidyr", "rvest", "xml2","SnowballC", "httr", "Rcurl", "rvest")
new_packages <- necessary_packages[!(necessary_packages %in%
installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
lapply(necessary_packages, require, character.only = TRUE)
i) Создайте фрейм данных, содержащий все URL-адреса данных RBA и имена фреймов данных:
Base_URL <- "http://www.rba.gov.au/statistics/tables/"
ii) Прочитайте html в R, проанализируйте текст, для любых файлов csv или excel:
pg <- as.data.frame((html_attr(html_nodes(read_html(Base_URL), "a"), "href")), stringsAsFactors = FALSE)
file_types <- c(".xls", ".csv", ".xlsx")
pg <- as.data.frame(paste0(Base_URL, gsub("/statistics/tables/",
"",
unique(grep(paste(file_types, collapse = "|"),
pg[,1], value = TRUE)))), stringsAsFactors = FALSE)
colnames(pg) <- "urls"
iii) Сохраните вектор символов имен имен данных:
pg$Df_names <- gsub("\\-|[[:punct:]]", "_",
gsub("\\..*",
"",
sapply(strsplit(pg$urls, split= "\\/"), function(x){x[length(x)]})))
iv) Сохранять каждый файл Excel как отдельный фрейм данных, именуемый каждым элементом символьного вектора:
df_names_list <- NULL
files_to_remove <- NULL
df_list <- list()
h = 1
i = 1
j = 1
for (i in 1:nrow(pg)) {
if (length(grep(".xls", pg$urls[i])) > 0) {
tryit <- try(GET(pg$urls[i], write_disk(tf <-
tempfile(fileext = ".xls"), overwrite = TRUE)))
if (inherits(tryit, "try-error")) {
i <- i + 1
} else{
GET(pg$urls[i], write_disk(tf <-
tempfile(fileext = ".xls"), overwrite = TRUE))
shiet <- excel_sheets(tf)[which(
excel_sheets(tf) != "Summary" &
excel_sheets(tf) != "Notes" &
excel_sheets(tf) != "Series breaks"
)]
}
for (j in 1:length(shiet)) {
tryit <- try(tmp1 <-
as.data.frame(read_xls(tf, sheet = shiet[j], col_names = FALSE)))
if (inherits(tryit, "try-error")) {
j <- j + 1
} else{
tmp1 <-
as.data.frame(read_xls(tf, sheet = shiet[j], col_names = FALSE))
}
if (ncol(tmp1) > 2) {
tryit <- try(X <- as.data.frame(read_xls(
gsub('[\\]', '\\/', tf),
skip = 11,
col_names = FALSE,
sheet = shiet[j]
)))
if (inherits(tryit, "try-error")) {
j <- j + 1
} else{
X <- as.data.frame(read_xls(
gsub('[\\]', '\\/', tf),
skip = 11,
col_names = FALSE,
sheet = shiet[j]
))
}
colnames(X) <- c("Date",
c(trimws(
gsub(
"for series breaks.*|[[:punct:]]| |Seenot",
"",
tools::toTitleCase(wordStem(tmp1[2, 2:ncol(tmp1)],
language = "porter"))
),
which = c("both", "left", "right")
)))
df_list[[h]] <- X
df_names_list <-
c(df_names_list, gsub(" ", "_", trimws(
paste0(pg$Df_names[i], "_", shiet[j]),
which = c("both", "left", "right")
)))
h <- h + 1
} else{
rm(tmp1)
j <- j + 1
}
}
} else if (length(grep(".xslx", pg$urls[i])) > 0) {
tryit <- try(GET(pg$urls[i], write_disk(tf <-
tempfile(fileext = ".xlsx"), overwrite = TRUE)))
if (inherits(tryit, "try-error")) {
i <- i + 1
} else{
GET(pg$urls[i], write_disk(tf <-
tempfile(fileext = ".xlsx"), overwrite = TRUE))
shiet <- excel_sheets(tf)[which(
excel_sheets(tf) != "Summary" &
excel_sheets(tf) != "Notes" &
excel_sheets(tf) != "Series breaks"
)]
}
for (j in 1:length(shiet)) {
tryit <- try(tmp1 <-
as.data.frame(read_xlsx(tf, sheet = shiet[j], col_names = FALSE)))
if (inherits(tryit, "try-error")) {
j <- j + 1
} else{
tmp1 <-
as.data.frame(read_xlsx(tf, sheet = shiet[j], col_names = FALSE))
}
if (ncol(tmp1) > 2) {
tryit <- try(X <- as.data.frame(read_xlsx(
gsub('[\\]', '\\/', tf),
skip = 11,
col_names = FALSE,
sheet = shiet[j]
)))
if (inherits(tryit, "try-error")) {
j <- j + 1
} else{
X <- as.data.frame(read_xlsx(
gsub('[\\]', '\\/', tf),
skip = 11,
col_names = FALSE,
sheet = shiet[j]
))
}
colnames(X) <- c("Date",
c(trimws(
gsub(
"for series breaks.*|[[:punct:]]| |Seenot",
"",
tools::toTitleCase(wordStem(tmp1[2, 2:ncol(tmp1)],
language = "porter"))
),
which = c("both", "left", "right")
)))
df_list[[h]] <- X
df_names_list <-
c(df_names_list, gsub(" ", "_", trimws(
paste0(pg$Df_names[i], "_", shiet[j]),
which = c("both", "left", "right")
)))
h <- h + 1
} else{
rm(tmp1)
j <- j + 1
}
}
} else if (length(grep(".csv", pg$urls[i])) > 0) {
tryit <- try(X <- as.data.frame(read.csv(pg$urls[i],
skip = 11,
header = FALSE)))
if (inherits(tryit, "try-error")) {
i <- i + 1
} else {
X <- as.data.frame(read.csv(pg$urls[i],
skip = 11,
header = FALSE))
}
tmp1 <- as.data.frame(read.csv(pg$urls[i]))
if (ncol(tmp1) >= 2) {
colnames(X) <- c("Date",
c(trimws(
gsub(
"for series breaks.*|[[:punct:]]| |Seenot",
"",
tools::toTitleCase(wordStem(tmp1[2, 2:ncol(tmp1)],
language = "porter"))
),
which = c("both", "left", "right")
)))
df_list[[h]] <- X
df_names_list <-
c(df_names_list, gsub(" ", "_", trimws(
pg$Df_names[i],
which = c("both", "left", "right")
)))
h <- h + 1
} else{
rm(tmp1)
j <- j + 1
}
}
}