Почему я получаю сообщение об ошибке, указывающее на значения Inf, когда пытаюсь построить график числа случаев по времени в R? - PullRequest
1 голос
/ 31 марта 2020

Я использую код из этого ответа, чтобы построить этот график:

library(rvest)

# Local folder that holds one cached copy of every daily report.
cachedir <- "cache"
if (!dir.exists(cachedir)) {
  dir.create(cachedir)
}

URL <- "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"

# Collect the href of every anchor found inside a "td span" node of the page.
html <- read_html(URL)
anchors <- html_nodes(html_nodes(html, "td span"), "a")
hrefs <- html_attr(anchors, "href")

# Keep the hrefs ending in "csv" and turn them into raw-content URLs.
csv_hrefs <- grep("csv$", hrefs, value = TRUE)
csvlinks <- gsub(
  "/blob", "",
  paste0("https://raw.githubusercontent.com", csv_hrefs)
)

# Only files that are not in the cache yet are downloaded.
csvfiles <- file.path(cachedir, basename(csvlinks))
missing_locally <- !file.exists(csvfiles)
csvlinks <- csvlinks[missing_locally]
csvfiles <- csvfiles[missing_locally]

# Download each link to its cache path; the result of each call is kept in
# `ign`.
ign <- Map(download.file, csvlinks, csvfiles, MoreArgs = list(quiet = TRUE))

# Every cached csv, including the ones downloaded in earlier runs.
csvfiles2 <- list.files(path = cachedir, pattern = "csv$", full.names = TRUE)

# Lookup from the shortened column name (left-hand side) to the name it is
# replaced with (right-hand side).
renamer <- c(
  State = "Province_State",
  Region = "Country_Region",
  Update = "Last_Update",
  Latitude = "Lat",
  Longitude = "Long_"
)

# Read one csv and normalise its header: drop everything up to the last dot
# in each column name, then apply `renamer` to the names it knows.
read_report <- function(path) {
  frame <- read.csv(path, stringsAsFactors = FALSE)
  short <- gsub(".*\\.", "", colnames(frame))
  known <- short %in% names(renamer)
  short[known] <- renamer[short[known]]
  colnames(frame) <- short
  frame
}

# Stack all reports; columns absent from a report are filled in.
alldata <- data.table::rbindlist(lapply(csvfiles2, read_report), fill = TRUE)

# Candidate layouts of Last_Update, tried in this order.
fmts <- c("%m/%d/%y %H:%M", "%m/%d/%Y %H:%M", "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S")

# Start with an all-NA POSIXct vector, one entry per row of alldata.
parsed_times <- .POSIXct(rep(NA_real_, nrow(alldata)))

# Each format is applied only to the entries that are still NA, so an entry
# keeps the value from the first format that parses it.
for (fmt in fmts) {
  unparsed <- is.na(parsed_times)
  if (any(unparsed)) {
    parsed_times[unparsed] <-
      as.POSIXct(alldata$Last_Update[unparsed], format = fmt)
  }
}

alldata$Last_Update <- parsed_times


# Reproduction of the failing plot. The lines starting with "#" directly
# after a statement are console output pasted by the author.

# Keep the rows of alldata whose Admin2 value is "Atlantic".
Atlantic <- alldata[alldata$Admin2=="Atlantic",]
# Print the confirmed counts of that subset.
Atlantic[,Atlantic$Confirmed]
#[1]  5  6  6 12 10 14 17 24 29
# Print the matching update times.
Atlantic[,Atlantic$Last_Update]
#[1] "2020-03-22 23:45:00 EDT" "2020-03-23 23:19:34 EDT"
#[3] "2020-03-24 23:37:31 EDT" "2020-03-25 23:33:19 EDT"
#[5] "2020-03-26 23:48:35 EDT" "2020-03-27 22:14:55 EDT"
#[7] "2020-03-28 23:05:37 EDT" "2020-03-29 23:08:25 EDT"
#[9] "2020-03-30 22:52:45 EDT"
# This is the call that fails: the first two arguments are the literal
# strings "Confirmed" and "Last_update", not columns of Atlantic. The
# warnings below show both being coerced to NA, which leaves no finite axis
# range. The second string is also spelled differently from the Last_Update
# column assigned earlier.
plot("Confirmed", "Last_update", Atlantic, xaxt='n')
#Error in plot.window(...) : need finite 'xlim' values
#In addition: Warning messages:
#1: In xy.coords(x, y, xlabel, ylabel, log) : NAs introduced by coercion
#2: In xy.coords(x, y, xlabel, ylabel, log) : NAs introduced by coercion
#3: In min(x) : no non-missing arguments to min; returning Inf
#4: In max(x) : no non-missing arguments to max; returning -Inf
#5: In min(x) : no non-missing arguments to min; returning Inf
#6: In max(x) : no non-missing arguments to max; returning -Inf
# NOTE(review): the tick positions and labels are taken from alldata rather
# than from the Atlantic subset, the label format "%y-m-%d" has no "%" before
# "m", and axis.Date is applied to values produced by as.POSIXct -- confirm
# what was intended here.
axis.Date(1,at=alldata$Last_Update,labels=format(alldata$Last_Update,"%y-m-%d"),las=2)

Я попытался изменить структуру формата времени, но безрезультатно.

Ответы [ 2 ]

1 голос
/ 01 апреля 2020
# Install packages if they are not already installed:
necessary_packages <- c("rvest", "tidyverse")

# Find the packages that are missing. requireNamespace() tests one package at
# a time and returns FALSE (instead of erroring) when it is not available:
is_installed <- vapply(necessary_packages, requireNamespace,
                       logical(1), quietly = TRUE)
new_packages <- necessary_packages[!is_installed]

# If any packages are missing, install them and their dependencies:
if (length(new_packages) > 0) {
  install.packages(new_packages, dependencies = TRUE)
}

# Attach the packages to the session. library() stops with an error when a
# package cannot be loaded, whereas require() only returns FALSE and would
# let the script run on until the first missing function:
invisible(lapply(necessary_packages, library, character.only = TRUE))

# Address of the GitHub page that lists the daily report files:
URL <-
  "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"

# Directory in which the csv files are kept between runs.
# Replace this placeholder with your own path!
covid_19_csv_dir_path <- "C:/Users/.../Documents/covid_19_csvs"

# Create the directory on first use:
if (!(dir.exists(covid_19_csv_dir_path))) {
  dir.create(covid_19_csv_dir_path)
}

# href attribute of every anchor on the listing page:
page_hrefs <- html_attr(html_nodes(read_html(URL), "a"), "href")

# Keep the hrefs that end in "csv":
csv_hrefs <- grep("csv$", page_hrefs, value = TRUE)

# Prefix the raw-content host and drop the "/blob" path segment:
# csvlinks => character vector
csvlinks <- gsub(
  "/blob", "",
  paste0("https://raw.githubusercontent.com", csv_hrefs)
)

# File name of every remote csv: csv_names => character vector
csv_names <- basename(csvlinks)

# csv files already present in the local directory. Other file types are
# ignored so that they are never parsed as data:
# csvs_stored_locally => character vector
csvs_stored_locally <- list.files(covid_19_csv_dir_path, pattern = "\\.csv$")

# Flag the remote files that still need downloading and keep their links and
# file names aligned, so that the i-th link is saved under the i-th name:
needs_download <- !(csv_names %in% csvs_stored_locally)
csvs_to_be_stored <- csvlinks[needs_download]
new_csv_names <- csv_names[needs_download]

# Start from empty lists so that the combination step below never depends on
# objects left over from an earlier run in the same session:
df_list <- list()
ls_csvs <- list()

# Download, clean and store the files that are not available locally:
if (length(csvs_to_be_stored) > 0) {
  # Date of each version, parsed from the file name: version_dates => Date
  version_dates <- as.Date(sub("\\.csv$", "", new_csv_names), "%m-%d-%Y")

  # Name for each dataframe in the list: df_names => character vector
  df_names <- paste0("x_", gsub("[[:punct:]]", "_", version_dates))

  # Read each remote csv, clean its column names and prepend the version date.
  # NOTE(review): the pattern keeps only the part after the last "." of a
  # dotted header ("Province.State" -> "State") but merely drops other
  # punctuation ("Province_State" -> "ProvinceState"); confirm that later
  # filters on `State` see the columns they are meant to.
  df_list <- lapply(seq_along(csvs_to_be_stored), function(i) {
    downloaded <- read.csv(csvs_to_be_stored[i], sep = ",",
                           stringsAsFactors = FALSE)
    names(downloaded) <- gsub("[[:punct:]]|\\s+|.*\\.", "",
                              trimws(names(downloaded), "both"))
    # rep() keeps the date column the same length as the data, even when the
    # downloaded file has no rows:
    cbind(version_date = rep(version_dates[i], nrow(downloaded)), downloaded)
  })
  names(df_list) <- df_names

  # Store each cleaned dataframe under the name of the file it came from:
  for (i in seq_along(df_list)) {
    write.csv(df_list[[i]],
              file.path(covid_19_csv_dir_path, new_csv_names[i]),
              row.names = FALSE)
  }
}

# Read in the files that were already stored before this run:
if (length(csvs_stored_locally) > 0) {
  ls_csvs <- lapply(csvs_stored_locally, function(csv_name) {
    stored <- read.csv(file.path(covid_19_csv_dir_path, csv_name),
                       stringsAsFactors = FALSE)
    # write.csv stored the date as text; convert it back so that it has the
    # same type as in the freshly downloaded dataframes:
    if ("version_date" %in% names(stored)) {
      stored$version_date <- as.Date(stored$version_date, "%Y-%m-%d")
    }
    stored
  })
  names(ls_csvs) <- paste0("x_", gsub("[[:punct:]]", "_",
                                      sub("\\.csv$", "", csvs_stored_locally)))
}

# Combine the stored and the downloaded dataframes: combined_df_list => list
combined_df_list <- c(ls_csvs, df_list)

if (length(combined_df_list) == 0) {
  stop("No csv files were downloaded or found in ", covid_19_csv_dir_path,
       call. = FALSE)
}

# Remove the intermediate lists from the global environment:
rm(df_list, ls_csvs)

# Trigger a garbage collection now that the intermediate lists are gone:
gc()

# Row-bind two data frames whose column sets may differ.
#
# x, y: data frames. Every column present in only one of them is added to
#       the other, filled with NA, before the two are bound.
# Returns: a data frame with the rows of x followed by the rows of y; the
#          columns of x come first, then the columns found only in y.
rbind_all_columns <- function(x, y) {
  # Columns are added one at a time with a vector of exactly nrow() NAs, so
  # the function also works when x or y has zero rows.
  for (missing_col in setdiff(colnames(y), colnames(x))) {
    x[[missing_col]] <- rep(NA, nrow(x))
  }
  for (missing_col in setdiff(colnames(x), colnames(y))) {
    y[[missing_col]] <- rep(NA, nrow(y))
  }
  rbind(x, y)
}

# Fold the list into one data frame by binding its elements pairwise, left
# to right: df => data.frame
df <- Reduce(rbind_all_columns, combined_df_list)

# Keep the rows for New York and Washington: washington_vs_ny => data.frame
washington_vs_ny <- df[df$State %in% c("Washington", "New York"), ]

# Sort by version date: washington_vs_nyordered => data.frame
washington_vs_nyordered <-
  washington_vs_ny[order(washington_vs_ny$version_date), ]

# Replace missing counts with 0 in each of the three count columns:
for (metric in c("Confirmed", "Deaths", "Recovered")) {
  counts <- washington_vs_nyordered[[metric]]
  washington_vs_nyordered[[metric]] <- ifelse(is.na(counts), 0, counts)
}

# Keep only the columns used for charting:
washington_vs_nyordered <- washington_vs_nyordered[
  , c("version_date", "State", "Confirmed", "Deaths", "Recovered")
]

# Count columns that are stacked into long format:
metric_columns <- c("Confirmed", "Deaths", "Recovered")

# Reshape to one row per date, state and metric. The counts end up in `vars`
# and the metric name in `vals`: chart_data => data.frame
chart_data <- reshape(
  washington_vs_nyordered,
  direction = "long",
  varying = metric_columns,
  v.names = c("vars"),
  idvar = c("version_date", "State"),
  timevar = "vals",
  times = metric_columns,
  new.row.names = 1:(length(metric_columns) * nrow(washington_vs_nyordered))
)

# Make sure the date column is of class Date:
chart_data$version_date <-
  as.Date(as.character(chart_data$version_date), "%Y-%m-%d")

# Chart the data: one panel per state, one line per metric.
ggplot(chart_data, aes(x = version_date, y = vars, colour = vals)) +
  geom_line() +
  facet_wrap(. ~ State)
1 голос
/ 31 марта 2020

В этой строке вы вызываете функцию plot из базовой графики R:

# Call quoted from the question: the first two arguments are character
# strings, not columns of Atlantic.
plot("Confirmed", "Last_update", Atlantic, xaxt='n')

При этом вы строите график одной символьной строки относительно другой, а это работать не будет. Скорее всего, вам нужно что-то вроде этого:

# Plot the confirmed counts against the update time, with the default x axis
# suppressed so that a custom one can be drawn.
atlantic_df <- as.data.frame(Atlantic)
with(atlantic_df, plot(Last_Update, Confirmed, xaxt = "n"))

# Draw the x axis with one tick per observation, labelled as yy-mm-dd and
# rotated perpendicular to the axis.
tick_times <- Atlantic$Last_Update
axis.POSIXct(
  1,
  at = tick_times,
  labels = format(tick_times, "%y-%m-%d"),
  las = 2
)

enter image description here

...