Tibble не работает, чтобы получить дату для обнаружения аномалии (воспроизводимый пример) - PullRequest
0 голосов
/ 15 декабря 2018

У меня есть набор данных ниже:

dput(stack)

structure(list(Day = structure(c(17836, 17837, 17838, 17840, 
17840, 17841, 17842, 17843, 17843, 17843, 17844, 17844, 17844, 
17845, 17845, 17845, 17846, 17846, 17847, 17847, 17847, 17848, 
17848, 17848, 17849, 17849, 17850, 17850, 17850, 17850, 17851, 
17851, 17851, 17851, 17852, 17852, 17852, 17852, 17853, 17853, 
17853, 17853, 17854, 17854, 17854, 17854, 17855, 17855, 17855, 
17855, 17856, 17856, 17856, 17856, 17857, 17857, 17857, 17857, 
17858, 17858, 17858, 17858, 17859, 17859, 17859, 17859, 17860, 
17860, 17860, 17860, 17860, 17861, 17861, 17861, 17861, 17862, 
17862, 17862, 17862, 17863, 17863, 17863, 17863, 17864, 17864, 
17864, 17864, 17865, 17865, 17865, 17865, 17866, 17866, 17866, 
17866, 17867, 17867, 17867, 17867, 17868, 17868, 17868, 17868, 
17869, 17869, 17869, 17869, 17869, 17870, 17870, 17870, 17870, 
17871, 17871, 17871, 17871, 17871, 17871, 17871, 17872, 17872, 
17872, 17872, 17872, 17872, 17872, 17873, 17873, 17873, 17873, 
17873, 17873, 17874, 17874, 17874, 17874, 17874, 17874, 17875, 
17875, 17875, 17875, 17875, 17875, 17876, 17876, 17876, 17876, 
17876, 17876, 17877, 17877, 17877, 17877, 17877, 17877, 17878, 
17878, 17878, 17878, 17878, 17878), class = "Date"), Channel = c("D", 
"A", "D", "A", "D", "D", "D", "A", "B", "D", "A", "B", "D", "A", 
"B", "D", "A", "D", "A", "B", "D", "A", "D", "G", "A", "D", "A", 
"D", "F", "G", "A", "D", "F", "G", "A", "D", "F", "G", "A", "D", 
"F", "G", "A", "D", "F", "G", "A", "D", "F", "G", "A", "D", "F", 
"G", "A", "D", "F", "G", "A", "D", "F", "G", "A", "D", "F", "G", 
"A", "B", "D", "F", "G", "A", "D", "F", "G", "A", "D", "F", "G", 
"A", "D", "F", "G", "A", "D", "F", "G", "A", "D", "F", "G", "A", 
"D", "F", "G", "A", "D", "F", "G", "A", "D", "F", "G", "A", "D", 
"E", "F", "G", "A", "D", "F", "G", "A", "B", "C", "D", "E", "F", 
"G", "A", "B", "C", "D", "E", "F", "G", "A", "C", "D", "E", "F", 
"G", "A", "C", "D", "E", "F", "G", "A", "C", "D", "E", "F", "G", 
"A", "C", "D", "E", "F", "G", "A", "C", "D", "E", "F", "G", "A", 
"C", "D", "E", "F", "G"), Orders = c(0, 0, 0, 2, 0, 0, 0, 0, 
0, 0, 9, 0, 0, 36, 0, 1, 46, 1, 11, 0, 0, 5, 0, 0, 112, 0, 66, 
1, 765, 2, 133, 0, 2290, 9, 524, 2, 6417, 8, 33, 0, 1008, 7, 
70, 1, 1860, 12, 62, 3, 1141, 12, 67, 5, 1059, 13, 14, 2, 481, 
13, 84, 6, 1693, 23, 370, 16, 6028, 51, 29, 0, 3, 640, 11, 5, 
0, 569, 12, 31, 2, 730, 16, 45, 4, 961, 27, 71, 2, 594, 55, 155, 
10, 607, 48, 224, 30, 4298, 66, 50, 0, 587, 92, 27, 1, 728, 80, 
195, 3, 0, 1310, 62, 24, 6, 754, 46, 18, 0, 0, 2, 0, 439, 50, 
107, 0, 0, 7, 0, 981, 55, 130, 0, 11, 0, 2784, 83, 61, 0, 4, 
0, 843, 77, 110, 0, 0, 0, 885, 63, 70, 0, 3, 0, 481, 77, 81, 
0, 0, 0, 833, 83, 38, 0, 1, 1, 712, 88)), row.names = c(NA, -162L
), spec = structure(list(cols = list(Day = structure(list(), class = 
c("collector_character", 
"collector")), Channel = structure(list(), class = c("collector_character", 
"collector")), Orders = structure(list(), class = c("collector_double", 
"collector"))), default = structure(list(), class = c("collector_guess", 
"collector"))), class = "col_spec"), class = c("tbl_df", "tbl", 
"data.frame"))

Ниже приведены первые несколько строк:

head(stack)
 # A tibble: 6 x 3
   Day        Channel Orders
   <date>     <chr>    <dbl>
 1 2018-11-01 D            0
 2 2018-11-02 A            0
 3 2018-11-03 D            0
 4 2018-11-05 A            2
 5 2018-11-05 D            0
 6 2018-11-06 D            0

Я хочу использовать пакет anomalize для просмотра аномалий по группам.

По примеру из документации по ссылке ниже я хочу построить график показателей для каждой группы: https://www.rdocumentation.org/packages/anomalize/versions/0.1.1

Это моя попытка с предоставленным набором данных.

stack %>%
  # Data Manipulation / Anomaly Detection
  # Decompose the Orders column with the "stl" method
  time_decompose(Orders, method = "stl") %>%
  # Flag anomalies in the remainder column with the "iqr" method
  anomalize(remainder, method = "iqr") %>%
  time_recompose() %>%
  # Anomaly Visualization
  plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.25) +
  labs(title = "Tidyverse Anomalies", subtitle = "STL + IQR Methods")

Вот моя ошибка:

Converting from tbl_df to tbl_time.
Auto-index message: index = Day
Error in mutate_impl(.data, dots) : 
  Evaluation error: Only year, quarter, month, week, and day periods are 
allowed for an index of class Date.

Ожидаемый результат должен быть примерно таким, исходя из ссылки на сайт:

enter image description here

1 Ответ

0 голосов
/ 15 декабря 2018

Проблема в том, что ваша переменная даты не уникальна.

# TRUE when at least one date occurs more than once in stack$Day
anyDuplicated(stack$Day) > 0

Декомпозиция временного ряда завершается ошибкой, поскольку данные не образуют временной ряд с равными интервалами:

Evaluation error: Only year, quarter, month, week, and day periods are 
allowed for an index of class Date.

Мы знаем, что ошибка относится к time_decompose, потому что она впервые возникает после этой строки кода:

stack %>%
    # Data Manipulation / Anomaly Detection
    # Running only the decomposition step already raises the error shown below
    time_decompose(Orders, method = "stl") 

Converting from tbl_df to tbl_time.
Auto-index message: index = Day
Error in mutate_impl(.data, dots) : 
  Evaluation error: Only year, quarter, month, week, and day periods are allowed for an index of class Date.

Когда мы группируем данные по дням (group_by) и суммируем значения в переменной Orders, мы получаем уникальные даты. Код выполняется без ошибок:

# Collapse to one row per Day by summing Orders, so every date is unique
daily_orders <- stack %>%
  group_by(Day) %>%
  summarise(Orders = sum(Orders))

# Decompose, flag anomalies, recompose and plot the daily totals
daily_orders %>%
  time_decompose(Orders, method = "stl") %>%
  anomalize(remainder, method = "iqr") %>%
  time_recompose() %>%
  plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.25)

Я подозреваю, что вам нужен отдельный временной ряд для каждого значения Channel. Если это так, вы столкнётесь с некоторыми проблемами, потому что не для каждого канала достаточно данных для выделения тренда. Каналы B, C и E не могут быть разложены. См. код ниже:

library(tidyverse)
library(dplyr)
# anomalize supplies time_decompose(), anomalize(), time_recompose()
# and plot_anomalies(); without it the pipeline below cannot run.
library(anomalize)

# Anomaly plot for channel A only
a <- stack %>%
  filter(Channel == "A") %>%
  group_by(Day) %>%
  summarise(Orders = sum(Orders)) %>% # in case of repeat dates
  time_decompose(Orders, method = "stl") %>%
  anomalize(remainder, method = "iqr") %>%
  time_recompose() %>%
  plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.25) +
  labs(title = "Tidyverse Anomalies", subtitle = "STL + IQR Methods")

С каналом A проблем нет. Теперь давайте проверим b:

# Channel B: aggregate to one row per Day first ...
b_daily <- stack %>%
  filter(Channel == "B") %>%
  group_by(Day) %>%
  summarise(Orders = sum(Orders))

# ... then run the same decomposition / anomaly / plot steps
b <- b_daily %>%
  time_decompose(Orders, method = "stl") %>%
  anomalize(remainder, method = "iqr") %>%
  time_recompose() %>%
  plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.25) +
  labs(title = "Tidyverse Anomalies", subtitle = "STL + IQR Methods")

Converting from tbl_df to tbl_time.
Auto-index message: index = Day
frequency = 1.5 weeks
trend = 7 weeks
Error in stats::stl(., s.window = "periodic", t.window = trnd, robust = TRUE) : 
  series is not periodic or has less than two periods

Сообщение об ошибке всё объясняет. Посмотрим, сколько наблюдений есть для канала B:

# The pipeline for channel B failed, so `b` was never assigned;
# inspect the raw rows for channel B instead.
stack %>% filter(Channel == "B")
# A tibble: 7 x 3
  Day        Channel Orders
  <date>     <chr>    <dbl>
1 2018-11-08 B            0
2 2018-11-09 B            0
3 2018-11-10 B            0
4 2018-11-12 B            0
5 2018-11-25 B            0
6 2018-12-06 B            0
7 2018-12-07 B            0

Это всего лишь 7 наблюдений. Этого недостаточно для выделения тренда.

# Channel C: one row per Day with the summed Orders
c_daily <- stack %>%
  filter(Channel == "C") %>%
  group_by(Day) %>%
  summarise(Orders = sum(Orders))

# Decompose, flag anomalies, recompose and build the plot
c <- c_daily %>%
  time_decompose(Orders, method = "stl") %>%
  anomalize(remainder, method = "iqr") %>%
  time_recompose() %>%
  plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.25) +
  labs(title = "Tidyverse Anomalies", subtitle = "STL + IQR Methods")

То же самое для c. Теперь давайте посмотрим d, e, f и g.

# Build the anomaly plot for a single channel.
#
# data:       data frame with Day, Channel and Orders columns
# channel_id: the Channel value to keep (e.g. "D")
#
# Returns the plot object produced by plot_anomalies() + labs().
plot_channel_anomalies <- function(data, channel_id) {
  data %>%
    filter(Channel == channel_id) %>%
    group_by(Day) %>%
    summarise(Orders = sum(Orders)) %>% # one row per date
    time_decompose(Orders, method = "stl") %>%
    anomalize(remainder, method = "iqr") %>%
    time_recompose() %>%
    plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.25) +
    labs(title = "Tidyverse Anomalies", subtitle = "STL + IQR Methods")
}

d <- plot_channel_anomalies(stack, "D")
# NOTE(review): the surrounding text says channel E cannot be decomposed,
# so this call is expected to fail the same way as B and C.
e <- plot_channel_anomalies(stack, "E")
f <- plot_channel_anomalies(stack, "F")
g <- plot_channel_anomalies(stack, "G")

Только каналы a, d, f, g имеют достаточно наблюдений для разложения тренда.

library(gridExtra)
# Lay out the four channel plots on a grid with two rows
grid.arrange(grobs = list(a, d, f, g), nrow = 2)

enter image description here

Вы также можете выполнить всё вышеуказанное одним блоком кода (я просто хотел объяснить, почему возникали ошибки).

# Channels kept for the grouped decomposition
channels_to_plot <- c("A", "D", "F", "G")

# One grouped pipeline: each Channel is decomposed and plotted in its own panel
stack %>%
  filter(Channel %in% channels_to_plot) %>%
  group_by(Channel) %>%
  time_decompose(Orders, method = "stl") %>%
  anomalize(remainder, method = "iqr") %>%
  time_recompose() %>%
  plot_anomalies(time_recomposed = TRUE, ncol = 2, alpha_dots = 0.25) +
  labs(title = "Tidyverse Anomalies", subtitle = "STL + IQR Methods")

enter image description here

...