# Install packages if they are not already installed:
necessary_packages <- c("rvest", "tidyverse")
# Create a vector containing the names of any packages needing installation:
new_packages <- setdiff(necessary_packages, installed.packages()[, "Package"])
# If the vector has more than 0 values, install the new packages
# (and their associated dependencies):
if (length(new_packages) > 0) {
  install.packages(new_packages, dependencies = TRUE)
}
# Attach the packages to the session. library() is used rather than require()
# so that a package which still cannot be loaded stops the script here with a
# clear error instead of returning FALSE and failing later on:
invisible(lapply(necessary_packages, library, character.only = TRUE))
# Directory in which the downloaded csvs are kept: character scalar.
# Replace the placeholder below with your own path before running !
covid_19_csv_dir_path <- "C:/Users/.../Documents/covid_19_csvs"
# Address of the GitHub page that lists the daily report csvs:
# URL => character scalar
URL <-
  "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"
# Make sure the storage directory is present before anything is written to it:
if (!dir.exists(covid_19_csv_dir_path)) {
  dir.create(covid_19_csv_dir_path)
}
# Collect the href of every anchor on the listing page that points at a csv
# file (duplicates removed): csv_hrefs => character vector
csv_hrefs <-
  read_html(URL) %>%
  html_nodes("a") %>%
  html_attr("href") %>%
  grep("\\.csv$", ., value = TRUE) %>%
  unique()
# Turn each href into a raw download URL: csvlinks => character vector.
# The empty case is handled explicitly because paste0() treats a zero-length
# argument as "", which would otherwise produce one bogus URL made of the
# host name alone:
if (length(csv_hrefs) > 0) {
  # Only the first "/blob" (the GitHub route segment) is dropped, so the rest
  # of the path is left untouched:
  csvlinks <- sub("/blob", "",
                  paste0("https://raw.githubusercontent.com", csv_hrefs),
                  fixed = TRUE)
} else {
  warning("No csv links were found at ", URL, call. = FALSE)
  csvlinks <- character(0)
}
# Store a vector of csv file names: csv_names => character vector
csv_names <- basename(csvlinks)
# List the csvs already present in the directory; the pattern keeps other
# files and sub-directories out of the list: csvs_stored_locally => vector
csvs_stored_locally <- list.files(covid_19_csv_dir_path, pattern = "\\.csv$")
# Subset the csvlinks vector to contain those csvs that
# require downloading: csvs_to_be_stored => vector
csvs_to_be_stored <- csvlinks[!(csv_names %in% csvs_stored_locally)]
# Download, tidy and persist every daily report that is not yet stored
# locally. Skipped entirely when there is nothing new to fetch:
if (length(csvs_to_be_stored) > 0) {
  # File names of only the reports being downloaded, in the same order as
  # csvs_to_be_stored: new_csv_names => character vector
  new_csv_names <- csv_names[!(csv_names %in% csvs_stored_locally)]
  # Parse the date of each version from its file name: version_dates => Date
  version_dates <- as.Date(gsub("\\.csv", "", new_csv_names), "%m-%d-%Y")
  # Create a vector of names for each dataframe in the list: df_names => vector
  df_names <- paste0("x_", gsub("[[:punct:]]", "_", version_dates))
  # Read each remote csv into a dataframe: df_list => list
  df_list <- lapply(csvs_to_be_stored, read.csv)
  # Clean the column names of each dataframe (punctuation, whitespace and
  # everything up to the last dot are stripped): df_list => list
  df_list <- lapply(df_list, function(x) {
    names(x) <- gsub("[[:punct:]]|\\s+|.*\\.", "", trimws(names(x), "both"))
    x
  })
  # Prepend the version date as the first column of each dataframe and name
  # the list elements. Indexing version_dates with `[` keeps the Date class:
  # df_list => named list
  df_list <- setNames(
    lapply(seq_along(df_list), function(i) {
      cbind(version_date = version_dates[i], df_list[[i]])
    }),
    df_names
  )
  # Write every new dataframe to the directory under its own file name.
  # new_csv_names (not csv_names) is used so that the i-th dataframe is paired
  # with the i-th downloaded file even when some csvs were already stored:
  invisible(Map(
    function(new_df, new_csv_name) {
      write.csv(new_df,
                file.path(covid_19_csv_dir_path, new_csv_name),
                row.names = FALSE)
    },
    df_list,
    new_csv_names
  ))
}
# If there were files stored in the directory before this execution, read
# them in as a named list of dataframes: ls_csvs => list
if (length(csvs_stored_locally) > 0) {
  ls_csvs <- setNames(
    lapply(csvs_stored_locally, function(csv_file) {
      stored_df <- read.csv(file.path(covid_19_csv_dir_path, csv_file))
      # The version date comes back from disk as text; parse it to Date again
      # so that stored and freshly downloaded dataframes share one column
      # type when they are stacked later on:
      if ("version_date" %in% names(stored_df)) {
        stored_df$version_date <-
          as.Date(as.character(stored_df$version_date), "%Y-%m-%d")
      }
      stored_df
    }),
    # Element names: "x_" followed by the file name without its extension,
    # with punctuation replaced by underscores:
    paste0("x_", gsub("[[:punct:]]", "_",
                      gsub("\\.csv", "", csvs_stored_locally)))
  )
}
# Combine the dataframes read from disk with those downloaded in this
# execution: combined_df_list => list
# The decision is taken from the vectors that drove the two steps above rather
# than from exists(), so a df_list or ls_csvs left over from an earlier run in
# the same session can never be picked up by mistake.
has_local_csvs <- length(csvs_stored_locally) > 0
has_new_csvs <- length(csvs_to_be_stored) > 0
if (has_local_csvs && has_new_csvs) {
  # Both sources are available: stored dataframes first, then the new ones:
  combined_df_list <- c(ls_csvs, df_list)
} else if (has_local_csvs) {
  # Nothing was downloaded: use the stored dataframes only:
  combined_df_list <- ls_csvs
} else if (has_new_csvs) {
  # Nothing was stored beforehand: use the downloaded dataframes only:
  combined_df_list <- df_list
} else {
  # Neither source produced any data, so there is nothing to analyse:
  stop("No csvs were downloaded and none are stored in ",
       covid_19_csv_dir_path, call. = FALSE)
}
# Remove whichever of the intermediate lists exist in this environment:
rm(list = intersect(c("df_list", "ls_csvs"), ls()))
# Trigger a garbage collection now that the intermediate lists are gone:
gc()
# Row-bind two dataframes whose columns may differ:
# rbind_all_columns => function
#   x, y: dataframes to stack.
#   Returns rbind(x, y) after each dataframe has been given an all-NA column
#   for every column name that only the other one has.
rbind_all_columns <- function(x, y) {
  # The padding is built with rep(NA, nrow(.)) so that a dataframe with zero
  # rows receives zero-length columns instead of a length-one NA:
  for (missing_col in setdiff(colnames(y), colnames(x))) {
    x[[missing_col]] <- rep(NA, nrow(x))
  }
  for (missing_col in setdiff(colnames(x), colnames(y))) {
    y[[missing_col]] <- rep(NA, nrow(y))
  }
  rbind(x, y)
}
# Fold the whole list into a single dataframe by stacking its elements
# pairwise, left to right, with rbind_all_columns: df => data.frame
df <- Reduce(rbind_all_columns, combined_df_list)
# Keep only the rows whose State is Washington or New York:
# washington_vs_ny => data.frame
washington_vs_ny <- df[which(df$State %in% c("Washington", "New York")), ]
# Sort the rows by version date, replace missing counts with 0 and keep only
# the columns needed for charting: washington_vs_nyordered => data.frame
washington_vs_nyordered <- transform(
  washington_vs_ny[order(washington_vs_ny$version_date), ],
  Confirmed = ifelse(is.na(Confirmed), 0, Confirmed),
  Deaths = ifelse(is.na(Deaths), 0, Deaths),
  Recovered = ifelse(is.na(Recovered), 0, Recovered)
)[, c("version_date", "State", "Confirmed", "Deaths", "Recovered")]
# Names of the count columns that are stacked into long format:
# measure_cols => character vector
measure_cols <- c("Confirmed", "Deaths", "Recovered")
# Reshape the data for charting: chart_data => data.frame
# After the reshape there is one row per version date, state and measure:
# "vals" holds the name of the measure and "vars" holds its count.
chart_data <- within(
  reshape(
    washington_vs_nyordered,
    direction = "long",
    varying = measure_cols,
    v.names = "vars",
    idvar = c("version_date", "State"),
    timevar = "vals",
    times = measure_cols,
    # seq_len() is used instead of 1:n so that an empty input gives an empty
    # set of row names rather than c(1, 0):
    new.row.names = seq_len(length(measure_cols) *
                              nrow(washington_vs_nyordered))
  ),
  {
    # Make sure the version date is a Date whatever type it arrived as:
    version_date <- as.Date(as.character(version_date), "%Y-%m-%d")
  }
)
# Draw one line per measure over time, with a separate panel for each state:
ggplot(
  data = chart_data,
  mapping = aes(x = version_date, y = vars, colour = vals)
) +
  geom_line() +
  facet_wrap(. ~ State)