Превратить конвейер (pipeline) dplyr в функцию - PullRequest
1 голос
/ 17 марта 2019

Приведённый ниже код работает на моих примерах данных для одного порогового значения 50, но мне не удаётся параметризовать результат по порогу для переменной click.amount, а именно по диапазону 20:200. Определения переменных:

  • дата = метка времени события

  • ID = однозначно определяет пользователя

  • Группа = группирующая переменная

  • click = клики за событие (всегда = 1)

  • click.amount = сумма, потраченная за событие

  • Correct.Outcome = что должен вернуть мой код

  • Причина = причина, по которой результат будет "N"

Критерии, при которых событие получает результат "Y":

  1. click.amount <= N
  2. совокупное число кликов по ID <= 5
  3. совокупная сумма click.amount по ID <= N
  4. Группа = A

Для примера кода N = 50. В конечном счёте я хотел бы получить фрейм данных, в котором каждая строка содержит долю событий с Outcome = "Y" и общую сумму click.amount для Outcome = "Y" для каждого порога N из диапазона 20:200. Мои фактические данные содержат от 500 тыс. до 1 млн строк. Если существует более чистое решение без dplyr, оно тоже подойдёт; код ниже показывает, как мне удалось получить частичное решение. Заранее спасибо.

данные:

    # Example data: 20 rows with the columns date, ID, Group, clicks,
    # click.amount, Correct.Outcome and Reason, classed as a tibble (tbl_df).
    # Looks like dput() output; the expression is not assigned here, so it
    # presumably has to be bound to `testdat` before the pipeline that reads
    # `testdat` is run.
    structure(list(date = c(1550245200, 1550245211, 1550245411, 1552566300, 
1552566350, 1552567300, 1552567400, 1552567500, 1552568600, 1552568600, 
1552568601, 1552568691, 1552568691, 1552568891, 1552568899, 1552568601, 
1552568691, 1552568691, 1552568891, 1552568899), ID = c(1, 1, 
1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 6, 7, 8), Group = c("A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "B", "B", "B", "B", "B"), clicks = c(1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), click.amount = c(60, 
4.89, 12.84, 70, 52, 10, 33, 80, 54, 11.89, 15.24, 25, 2, 1, 
1, 15.24, 25, 2, 1, 1), Correct.Outcome = c("N", "Y", "Y", "N", 
"N", "Y", "Y", "N", "N", "Y", "Y", "N", "N", "N", "N", "N", "N", 
"N", "N", "N"), Reason = c("over 50", NA, NA, "over 50", "over 50", 
NA, NA, "cumulative amount over 50", "over 50", NA, NA, "cumulative amount over 50", 
"cumulative amount over 50", "cumulative amount over 50", "cumulative clicks over 5 and cumulative amount over 50", 
"Group B", "Group B", "Group B", "Group B", "Group B")), row.names = c(NA, 
-20L), class = c("tbl_df", "tbl", "data.frame"))

код:

 # Flag each event as "Y"/"N" against the fixed limit of 50, compare the flag
 # with the expected answer, and reduce everything to a single summary row.
 results <- testdat %>%
   group_by(ID) %>%
   arrange(date) %>%
   mutate(
     # Running totals per ID only count Group "A" events of at most 50.
     cum.clicks = cumsum(ifelse(Group == "A" & click.amount <= 50, clicks, 0)),
     cum.clickamount = cumsum(
       ifelse(Group == "A" & click.amount <= 50, click.amount, 0)
     ),
     # An event passes only if it and both running totals stay within limits.
     Outcome = ifelse(
       Group == "A" & click.amount <= 50 &
         cum.clicks <= 5 & cum.clickamount <= 50,
       "Y",
       "N"
     )
   ) %>%
   select(Outcome, Correct.Outcome, Reason, everything()) %>%
   arrange(ID, date) %>%
   # 1 marks a row whose computed outcome disagrees with the expected one.
   mutate(check = as.numeric(Outcome != Correct.Outcome)) %>%
   ungroup() %>%
   summarize(
     prop50 = mean(Outcome == "Y"),
     amount50 = sum(click.amount[Outcome == "Y"])
   )

1 Ответ

2 голосов
/ 17 марта 2019

Я не уверен, что правильно понимаю вашу задачу, но превратить конвейер dplyr в функцию и применить её к каждому значению порога можно с помощью sapply.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
# Example events used by the threshold sweep: 20 rows, one per click event.
testdata <- data.frame(
  # Event timestamps (numeric).
  date = c(
    1550245200, 1550245211, 1550245411, 1552566300, 1552566350,
    1552567300, 1552567400, 1552567500, 1552568600, 1552568600,
    1552568601, 1552568691, 1552568691, 1552568891, 1552568899,
    1552568601, 1552568691, 1552568691, 1552568891, 1552568899
  ),
  # User identifiers kept as doubles; users 1..8 have 3, 2, 4, 6, 2, 1, 1, 1
  # events respectively.
  ID = rep(c(1, 2, 3, 4, 5, 6, 7, 8), times = c(3, 2, 4, 6, 2, 1, 1, 1)),
  # The first 15 events belong to group "A", the last 5 to group "B".
  Group = rep(c("A", "B"), times = c(15, 5)),
  # Every event carries exactly one click.
  clicks = rep(1, times = 20),
  click.amount = c(
    60, 4.89, 12.84, 70, 52, 10, 33, 80, 54, 11.89,
    15.24, 25, 2, 1, 1, 15.24, 25, 2, 1, 1
  ),
  # Expected outcome per event, used to check the computed one.
  Correct.Outcome = c(
    "N", "Y", "Y", "N", "N", "Y", "Y", "N", "N", "Y", "Y",
    rep("N", times = 9)
  ),
  # Explanation for every expected "N"; NA where the expected outcome is "Y".
  Reason = c(
    "over 50", NA, NA, "over 50", "over 50", NA, NA,
    "cumulative amount over 50", "over 50", NA, NA,
    rep("cumulative amount over 50", times = 3),
    "cumulative clicks over 5 and cumulative amount over 50",
    rep("Group B", times = 5)
  ),
  stringsAsFactors = FALSE
)
# Sort once, outside the threshold sweep: the row order does not depend on the
# threshold. Ordering by ID and then date keeps every user's events in
# chronological order, which the running totals below rely on, and fixes the
# order in which the final amounts are summed.
ordered_events <- testdata %>%
  arrange(ID, date)

# Summarise the event outcomes for a single threshold.
#
# threshold  : upper limit applied both to one event's click.amount and to the
#              user's running click.amount total.
# data       : events with the columns ID, Group, clicks and click.amount,
#              already sorted chronologically within each ID.
# max_clicks : upper limit on the user's running count of qualifying clicks.
#
# Returns a one-row data frame with `proportion` (share of events whose
# Outcome is "Y") and `amount` (total click.amount of those events).
summarize_threshold <- function(threshold, data, max_clicks = 5) {
  data %>%
    group_by(ID) %>%
    mutate(
      # Only Group "A" events within the per-event limit feed the totals.
      eligible = (click.amount <= threshold) & (Group == "A"),
      cum.clicks = cumsum(ifelse(test = eligible, yes = clicks, no = 0)),
      cum.clickamount = cumsum(ifelse(test = eligible,
                                      yes = click.amount,
                                      no = 0)),
      Outcome = ifelse(test = eligible &
                         (cum.clicks <= max_clicks) &
                         (cum.clickamount <= threshold),
                       yes = "Y",
                       no = "N")
    ) %>%
    ungroup() %>%
    summarize(proportion = mean(Outcome == "Y"),
              amount = sum(click.amount[Outcome == "Y"]))
}

# One column per threshold in 20:200; sapply() simplifies the one-row
# summaries into a matrix whose rows are `proportion` and `amount`.
results <- sapply(X = 20:200,
                  FUN = summarize_threshold,
                  data = ordered_events)
results
#>            [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9]  [,10]
#> proportion 0.2   0.2   0.2   0.2   0.2   0.2   0.2   0.2   0.25  0.25 
#> amount     39.62 39.62 39.62 39.62 39.62 39.62 39.62 39.62 54.86 54.86
#>            [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
#> proportion 0.25  0.25  0.25  0.25  0.25  0.25  0.25  0.25  0.25  0.25 
#> amount     54.86 54.86 54.86 54.86 54.86 54.86 54.86 54.86 54.86 54.86
#>            [,21] [,22] [,23] [,24] [,25] [,26] [,27] [,28] [,29] [,30]
#> proportion 0.25  0.25  0.25  0.3   0.3   0.3   0.3   0.3   0.3   0.3  
#> amount     54.86 54.86 54.86 87.86 87.86 87.86 87.86 87.86 87.86 87.86
#>            [,31] [,32] [,33]  [,34]  [,35]  [,36]  [,37]  [,38]  [,39] 
#> proportion 0.3   0.3   0.35   0.4    0.4    0.45   0.5    0.5    0.5   
#> amount     87.86 87.86 139.86 164.86 164.86 166.86 167.86 167.86 167.86
#>            [,40]  [,41]  [,42]  [,43]  [,44]  [,45]  [,46]  [,47]  [,48] 
#> proportion 0.5    0.45   0.45   0.45   0.45   0.45   0.5    0.5    0.5   
#> amount     167.86 210.13 210.13 210.13 210.13 210.13 215.02 215.02 215.02
#>            [,49]  [,50]  [,51]  [,52]  [,53]  [,54]  [,55]  [,56]  [,57] 
#> proportion 0.5    0.5    0.5    0.5    0.5    0.5    0.5    0.5    0.5   
#> amount     215.02 215.02 233.02 233.02 233.02 233.02 233.02 233.02 233.02
#>            [,58]  [,59]  [,60]  [,61]  [,62]  [,63]  [,64]  [,65]  [,66] 
#> proportion 0.5    0.55   0.55   0.55   0.55   0.55   0.55   0.55   0.55  
#> amount     233.02 245.86 245.86 245.86 245.86 245.86 245.86 245.86 245.86
#>            [,67]  [,68]  [,69]  [,70]  [,71]  [,72]  [,73]  [,74]  [,75] 
#> proportion 0.55   0.55   0.55   0.55   0.55   0.55   0.55   0.55   0.55  
#> amount     245.86 245.86 245.86 245.86 245.86 245.86 245.86 245.86 245.86
#>            [,76]  [,77]  [,78]  [,79]  [,80]  [,81]  [,82]  [,83]  [,84] 
#> proportion 0.55   0.55   0.55   0.55   0.55   0.55   0.55   0.55   0.55  
#> amount     245.86 245.86 245.86 245.86 245.86 245.86 245.86 245.86 245.86
#>            [,85]  [,86]  [,87]  [,88]  [,89]  [,90]  [,91]  [,92]  [,93] 
#> proportion 0.55   0.55   0.55   0.55   0.55   0.55   0.55   0.55   0.55  
#> amount     245.86 245.86 245.86 245.86 245.86 245.86 245.86 245.86 245.86
#>            [,94]  [,95]  [,96]  [,97]  [,98]  [,99]  [,100] [,101] [,102]
#> proportion 0.55   0.55   0.55   0.55   0.55   0.55   0.55   0.55   0.55  
#> amount     245.86 245.86 245.86 245.86 245.86 245.86 245.86 245.86 245.86
#>            [,103] [,104] [,105] [,106] [,107] [,108] [,109] [,110] [,111]
#> proportion 0.6    0.65   0.65   0.65   0.65   0.65   0.65   0.65   0.65  
#> amount     297.86 377.86 377.86 377.86 377.86 377.86 377.86 377.86 377.86
#>            [,112] [,113] [,114] [,115] [,116] [,117] [,118] [,119] [,120]
#> proportion 0.65   0.65   0.65   0.65   0.65   0.65   0.65   0.65   0.65  
#> amount     377.86 377.86 377.86 377.86 377.86 377.86 377.86 377.86 377.86
#>            [,121] [,122] [,123] [,124] [,125] [,126] [,127] [,128] [,129]
#> proportion 0.65   0.65   0.65   0.65   0.65   0.65   0.65   0.65   0.65  
#> amount     377.86 377.86 377.86 377.86 377.86 377.86 377.86 377.86 377.86
#>            [,130] [,131] [,132] [,133] [,134] [,135] [,136] [,137] [,138]
#> proportion 0.65   0.65   0.65   0.65   0.65   0.65   0.65   0.65   0.65  
#> amount     377.86 377.86 377.86 377.86 377.86 377.86 377.86 377.86 377.86
#>            [,139] [,140] [,141] [,142] [,143] [,144] [,145] [,146] [,147]
#> proportion 0.65   0.65   0.65   0.65   0.65   0.65   0.65   0.65   0.65  
#> amount     377.86 377.86 377.86 377.86 377.86 377.86 377.86 377.86 377.86
#>            [,148] [,149] [,150] [,151] [,152] [,153] [,154] [,155] [,156]
#> proportion 0.65   0.65   0.65   0.65   0.65   0.65   0.65   0.65   0.65  
#> amount     377.86 377.86 377.86 377.86 377.86 377.86 377.86 377.86 377.86
#>            [,157] [,158] [,159] [,160] [,161] [,162] [,163] [,164] [,165]
#> proportion 0.65   0.7    0.7    0.7    0.7    0.7    0.7    0.7    0.7   
#> amount     377.86 431.86 431.86 431.86 431.86 431.86 431.86 431.86 431.86
#>            [,166] [,167] [,168] [,169] [,170] [,171] [,172] [,173] [,174]
#> proportion 0.7    0.7    0.7    0.7    0.7    0.7    0.7    0.7    0.7   
#> amount     431.86 431.86 431.86 431.86 431.86 431.86 431.86 431.86 431.86
#>            [,175] [,176] [,177] [,178] [,179] [,180] [,181]
#> proportion 0.7    0.7    0.7    0.7    0.7    0.7    0.7   
#> amount     431.86 431.86 431.86 431.86 431.86 431.86 431.86

Создано 2019-03-17 с помощью пакета reprex (v0.2.1)

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...