Группировка даты и времени - PullRequest
0 голосов
/ 31 января 2020

Существует ли пакет R, который обеспечивает группировку дат и времени для всех типичных единиц времени (секунд, минут и т. Д. c.)?

Или, другими словами, как мне усечь умножает определенную единицу времени?

Фон

Мне часто приходится группировать ответы на запросы HTTP по 15-секундным интервалам.

Функция lubridate::floor_date() и lubridate::ceiling_date() заставили меня начать , но

Я придумал что-то, основанное на модуле (%%), но мне кажется, что я заново изобретаю колесо для работы с атомными c компонентами времени.

Функция определение

library(magrittr)

group_date <- function(
  x,
  interval = 15,
  unit = c(
    "seconds",
    "minutes",
    "hours",
    "days",
    "weeks",
    "months",
    "bimonths",
    "quarters",
    "seasons",
    "halfyears",
    "years"
  )
) {
  # Validate units:
  unit <- match.arg(unit)

  # Possibly base units on "basic units" as {lubridate} does:
  # parsed_unit <- lubridate:::parse_period_unit(unit)
  # n <- parsed_unit$n
  # basic_unit <- lubridate:::standardise_period_names(parsed_unit$unit)

  if (unit %in% c("bimonths", "halfyears", "season")) {
    stop(stringr::str_glue("Unit '{unit}' not supported yet"))
  }
  # No clue how these would need to be handled yet

  # Extract unit value:
  unit_value <- dplyr::case_when(
    unit == "seconds" ~ as.numeric(lubridate::second(x)),
    unit == "minutes" ~ as.numeric(lubridate::minute(x)),
    unit == "hours" ~ as.numeric(lubridate::hour(x)),
    unit == "days" ~ as.numeric(lubridate::day(x)),
    unit == "weeks" ~ as.numeric(lubridate::isoweek(x)),
    unit == "months" ~ as.numeric(lubridate::month(x)),
    unit == "quarters" ~ as.numeric(lubridate::quarter(x)),
    unit == "year" ~ as.numeric(lubridate::year(x))
  )

  offset_factor <- dplyr::case_when(
    unit == "seconds" ~ 1,
    unit == "minutes" ~ 60,
    unit == "hours" ~ 60 * 60,
    unit == "days" ~ 60 * 60 * 24,
    unit == "weeks" ~ NA_real_, # Seconds per week -> no clue how to do that,
    unit == "months" ~ NA_real_, # Seconds per month -> no clue how to do that
    unit == "quarters" ~ NA_real_, # Seconds per quarter -> no clue how to do that
    unit == "year" ~ NA_real_ # Seconds per year -> no clue how to do that
  )

  # Calculate time offset to lower group boundary:
  time_offset <- unit_value %% interval

  # Apply offset:
  x - (time_offset * offset_factor)
}

секунды группировки

x <- c(
  "2020-01-31 13:01:14",
  "2020-01-31 13:01:15",
  "2020-01-31 13:01:16",
  "2020-01-31 13:01:29",
  "2020-01-31 13:01:30",
  "2020-01-31 13:01:31",
  "2020-01-31 13:01:44",
  "2020-01-31 13:01:45",
  "2020-01-31 13:01:46",
  "2020-01-31 13:01:59",
  "2020-01-31 13:02:00",
  "2020-01-31 13:02:01"
) %>%
  lubridate::ymd_hms()

x %>% group_date()
#>  [1] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:15 UTC"
#>  [3] "2020-01-31 13:01:15 UTC" "2020-01-31 13:01:15 UTC"
#>  [5] "2020-01-31 13:01:30 UTC" "2020-01-31 13:01:30 UTC"
#>  [7] "2020-01-31 13:01:30 UTC" "2020-01-31 13:01:45 UTC"
#>  [9] "2020-01-31 13:01:45 UTC" "2020-01-31 13:01:45 UTC"
#> [11] "2020-01-31 13:02:00 UTC" "2020-01-31 13:02:00 UTC"

x %>% group_date(30)
#>  [1] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#>  [3] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#>  [5] "2020-01-31 13:01:30 UTC" "2020-01-31 13:01:30 UTC"
#>  [7] "2020-01-31 13:01:30 UTC" "2020-01-31 13:01:30 UTC"
#>  [9] "2020-01-31 13:01:30 UTC" "2020-01-31 13:01:30 UTC"
#> [11] "2020-01-31 13:02:00 UTC" "2020-01-31 13:02:00 UTC"

x %>% group_date(45)
#>  [1] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#>  [3] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#>  [5] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#>  [7] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:45 UTC"
#>  [9] "2020-01-31 13:01:45 UTC" "2020-01-31 13:01:45 UTC"
#> [11] "2020-01-31 13:02:00 UTC" "2020-01-31 13:02:00 UTC"

x %>% group_date(60)
#>  [1] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#>  [3] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#>  [5] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#>  [7] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#>  [9] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#> [11] "2020-01-31 13:02:00 UTC" "2020-01-31 13:02:00 UTC"

минуты группировки

x <- c(
  "2020-01-31 13:04:00",
  "2020-01-31 13:05:00",
  "2020-01-31 13:06:00",
  "2020-01-31 13:29:00",
  "2020-01-31 13:30:00",
  "2020-01-31 13:31:00",
  "2020-01-31 13:44:00",
  "2020-01-31 13:45:00",
  "2020-01-31 13:46:00"
) %>%
  lubridate::ymd_hms()

x %>% group_date(15, "minutes")
#> [1] "2020-01-31 13:00:00 UTC" "2020-01-31 13:00:00 UTC"
#> [3] "2020-01-31 13:00:00 UTC" "2020-01-31 13:15:00 UTC"
#> [5] "2020-01-31 13:30:00 UTC" "2020-01-31 13:30:00 UTC"
#> [7] "2020-01-31 13:30:00 UTC" "2020-01-31 13:45:00 UTC"
#> [9] "2020-01-31 13:45:00 UTC"

x %>% group_date(30, "minutes")
#> [1] "2020-01-31 13:00:00 UTC" "2020-01-31 13:00:00 UTC"
#> [3] "2020-01-31 13:00:00 UTC" "2020-01-31 13:00:00 UTC"
#> [5] "2020-01-31 13:30:00 UTC" "2020-01-31 13:30:00 UTC"
#> [7] "2020-01-31 13:30:00 UTC" "2020-01-31 13:30:00 UTC"
#> [9] "2020-01-31 13:30:00 UTC"

x %>% group_date(45, "minutes")
#> [1] "2020-01-31 13:00:00 UTC" "2020-01-31 13:00:00 UTC"
#> [3] "2020-01-31 13:00:00 UTC" "2020-01-31 13:00:00 UTC"
#> [5] "2020-01-31 13:00:00 UTC" "2020-01-31 13:00:00 UTC"
#> [7] "2020-01-31 13:00:00 UTC" "2020-01-31 13:45:00 UTC"
#> [9] "2020-01-31 13:45:00 UTC"

РЕДАКТИРОВАТЬ

Только что наткнулся hms::trunc_hms().

Кажется, дает мне то, что мне нужно для secs = 15 и secs = 30, так что это решит мою непосредственную проблему. Но я не вижу, как это будет работать для единиц времени, отличных от секунд:

library(magrittr)

x <- c(
  "2020-01-31 13:01:14",
  "2020-01-31 13:01:15",
  "2020-01-31 13:01:16",
  "2020-01-31 13:01:29",
  "2020-01-31 13:01:30",
  "2020-01-31 13:01:31",
  "2020-01-31 13:01:44",
  "2020-01-31 13:01:45",
  "2020-01-31 13:01:46",
  "2020-01-31 13:01:59",
  "2020-01-31 13:02:00",
  "2020-01-31 13:02:01"
) %>%
  lubridate::ymd_hms()

x %>% hms::trunc_hms(15)
#>  [1] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:15 UTC"
#>  [3] "2020-01-31 13:01:15 UTC" "2020-01-31 13:01:15 UTC"
#>  [5] "2020-01-31 13:01:30 UTC" "2020-01-31 13:01:30 UTC"
#>  [7] "2020-01-31 13:01:30 UTC" "2020-01-31 13:01:45 UTC"
#>  [9] "2020-01-31 13:01:45 UTC" "2020-01-31 13:01:45 UTC"
#> [11] "2020-01-31 13:02:00 UTC" "2020-01-31 13:02:00 UTC"

x %>% hms::trunc_hms(30)
#>  [1] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#>  [3] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#>  [5] "2020-01-31 13:01:30 UTC" "2020-01-31 13:01:30 UTC"
#>  [7] "2020-01-31 13:01:30 UTC" "2020-01-31 13:01:30 UTC"
#>  [9] "2020-01-31 13:01:30 UTC" "2020-01-31 13:01:30 UTC"
#> [11] "2020-01-31 13:02:00 UTC" "2020-01-31 13:02:00 UTC"

x %>% hms::trunc_hms(45)
#>  [1] "2020-01-31 13:00:45 UTC" "2020-01-31 13:00:45 UTC"
#>  [3] "2020-01-31 13:00:45 UTC" "2020-01-31 13:00:45 UTC"
#>  [5] "2020-01-31 13:01:30 UTC" "2020-01-31 13:01:30 UTC"
#>  [7] "2020-01-31 13:01:30 UTC" "2020-01-31 13:01:30 UTC"
#>  [9] "2020-01-31 13:01:30 UTC" "2020-01-31 13:01:30 UTC"
#> [11] "2020-01-31 13:01:30 UTC" "2020-01-31 13:01:30 UTC"

x %>% hms::trunc_hms(60)
#>  [1] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#>  [3] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#>  [5] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#>  [7] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#>  [9] "2020-01-31 13:01:00 UTC" "2020-01-31 13:01:00 UTC"
#> [11] "2020-01-31 13:02:00 UTC" "2020-01-31 13:02:00 UTC"

РЕДАКТИРОВАТЬ 2

Рассматривая предложение использовать cut()

x %>%
  tibble::enframe() %>%
  dplyr::mutate(
    grp = value %>% group_date(15)
  ) %>% 
  dplyr::group_by(
    grp_2 = cut(as.POSIXct(value, '%Y-%m-%d %H:%M:%S'), '15 secs')
  )
# # A tibble: 12 x 4
# # Groups:   grp [4]
# name value                   grp_2                   grp                
# <int> <dttm>                  <dttm>                  <fct>              
#   1     1 2020-01-31 13:01:14.000 2020-01-31 13:01:00.000 2020-01-31 13:01:14
# 2     2 2020-01-31 13:01:15.000 2020-01-31 13:01:15.000 2020-01-31 13:01:14
# 3     3 2020-01-31 13:01:16.000 2020-01-31 13:01:15.000 2020-01-31 13:01:14
# 4     4 2020-01-31 13:01:29.000 2020-01-31 13:01:15.000 2020-01-31 13:01:29
# 5     5 2020-01-31 13:01:30.000 2020-01-31 13:01:30.000 2020-01-31 13:01:29
# 6     6 2020-01-31 13:01:31.000 2020-01-31 13:01:30.000 2020-01-31 13:01:29
# 7     7 2020-01-31 13:01:44.000 2020-01-31 13:01:30.000 2020-01-31 13:01:44
# 8     8 2020-01-31 13:01:45.000 2020-01-31 13:01:45.000 2020-01-31 13:01:44
# 9     9 2020-01-31 13:01:46.000 2020-01-31 13:01:45.000 2020-01-31 13:01:44
# 10    10 2020-01-31 13:01:59.000 2020-01-31 13:01:45.000 2020-01-31 13:01:59
# 11    11 2020-01-31 13:02:00.000 2020-01-31 13:02:00.000 2020-01-31 13:01:59
# 12    12 2020-01-31 13:02:01.000 2020-01-31 13:02:00.000 2020-01-31 13:01:59

Создано в 2020-01-31 пакетом Представ (v0.3.0)

1 Ответ

2 голосов
/ 31 января 2020

Как насчет этой простой функции для округления до любого заданного числа секунд, минут, часов, дней или недель? Если вы хотите запускать свои чанки с указанной c даты и времени, существует необязательное время начала по умолчанию. Аргументы «единиц» совпадают, поэтому вы можете сокращать их. По умолчанию это секунды

time_group <- function(times, intervals, since = as.POSIXct("2000-01-01"), 
                       units = c("secs", "mins", "hours", "days", "weeks"))
{
  all_units <- c("secs", "mins", "hours", "days", "weeks")
  units     <- match.arg(units, all_units)
  intervals <- intervals * c(1, 60, 3600, 86400, 604800)[match(units, all_units)]
  cuts      <- intervals * floor(as.numeric(difftime(times, since, units = "secs"))/intervals)

  return(as.POSIXct(cuts, origin = since))
}

Это позволяет вам сделать это:

# Units default to seconds so this groups by 15 seconds at a time
time_group(x, 15)
#>  [1] "2020-01-31 13:01:00 GMT" "2020-01-31 13:01:15 GMT" "2020-01-31 13:01:15 GMT"
#>  [4] "2020-01-31 13:01:15 GMT" "2020-01-31 13:01:30 GMT" "2020-01-31 13:01:30 GMT"
#>  [7] "2020-01-31 13:01:30 GMT" "2020-01-31 13:01:45 GMT" "2020-01-31 13:01:45 GMT"
#> [10] "2020-01-31 13:01:45 GMT" "2020-01-31 13:02:00 GMT" "2020-01-31 13:02:00 GMT"

# We have used argument matching so we can abbreviate minutes to "m"
time_group(x, 1, units = "m")
#>  [1] "2020-01-31 13:01:00 GMT" "2020-01-31 13:01:00 GMT" "2020-01-31 13:01:00 GMT"
#>  [4] "2020-01-31 13:01:00 GMT" "2020-01-31 13:01:00 GMT" "2020-01-31 13:01:00 GMT"
#>  [7] "2020-01-31 13:01:00 GMT" "2020-01-31 13:01:00 GMT" "2020-01-31 13:01:00 GMT"
#> [10] "2020-01-31 13:01:00 GMT" "2020-01-31 13:02:00 GMT" "2020-01-31 13:02:00 GMT"

...