В R: для каждого идентификатора группы подсчитать дни с момента события, допуская более одного события в день - PullRequest
1 голос
/ 17 февраля 2020
# Toy data set of chess games, one row per game:
#   p_id   - player id
#   dayo   - day number counted from some starting date
#   elo    - Elo rating
#   fin1_0 - game result (win or loss)

p_id <- c(
  2232, 1877, 2210, 2073, 2232, 2232, 1877, 2210, 2073, 2232,
  1877, 2210, 2073, 1877, 2232, 2210, 2073, 2210, 1877
)
dayo <- c(
  750, 750, 750, 750, 751, 750, 750, 750, 750, 751,
  751, 751, 751, 751, 751, 751, 762, 762, 762
)
elo <- c(
  1741, 1841, 1651, 1561, 1751, 1741, 1841, 1651, 1561, 1751,
  1851, 1551, 1951, 1820, 1660, 2230, 1762, 1762, 1762
)
fin1_0 <- c(0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0)

# Column order is p_id, elo, dayo, fin1_0 (elo comes before dayo).
my.df <- data.frame(p_id = p_id, elo = elo, dayo = dayo, fin1_0 = fin1_0)
my.df

# Goal, per player: the number of games played, the number of wins achieved,
# the day of the last win, how many days since they won a game, how many days
# since they played a game, and how many games played since they won.

# what I tried
library(dplyr)

tbl.df <- my.df %>%
  # Sort each player's games chronologically; lag(), first() and the running
  # counters below all depend on this order.
  arrange(p_id, dayo) %>%
  group_by(p_id) %>%
  mutate(
    games = row_number(),        # running count of games played
    wins = cumsum(fin1_0 == 1)   # running count of wins
  ) %>%
  # Each (p_id, wins) group is the run of games starting at one win and
  # lasting until the next; wins == 0 covers the games before the first win.
  group_by(p_id, wins) %>%
  mutate(
    last_win_day = min(dayo),
    days_since_win = dayo - first(dayo)
  ) %>%
  group_by(p_id) %>%
  # A player's first row has no earlier game to measure from, so both "days
  # since" columns get a random value between 1 and 10 there (one draw per
  # player for each column).
  mutate(
    days_since_win = replace(
      days_since_win, row_number() == 1, sample(1:10, 1)
    ),
    days_since_game = dayo - lag(dayo),
    # Base replace() + is.na() is used instead of replace_na(), which belongs
    # to tidyr and is not available when only dplyr is attached.
    days_since_game = replace(
      days_since_game, is.na(days_since_game), sample(1:10, 1)
    )
  ) %>%
  group_by(p_id, wins) %>%
  mutate(games_since_win = games - first(games)) %>%
  # Drop the grouping so tbl.df does not carry it into later operations.
  ungroup()

# convert back to a data.frame
my.df <- data.frame(tbl.df)
# Keep the sorted result: the original indexed expression was never assigned.
my.df <- my.df[order(my.df$p_id, my.df$dayo), ]
my.df

# truncated output
   p_id  elo dayo fin1_0 games wins last_win_day days_since_win days_since_game games_since_win
1  1877 1841  750      1     1    1          750             10              10               0
2  1877 1841  750      0     2    1          750              0               0               1
3  1877 1851  751      0     3    1          750              1               1               2
4  1877 1820  751      1     4    2          751              0               0               0
5  1877 1762  762      0     5    2          751             11              11               1
6  2073 1561  750      0     1    0          750              3               7               0
7  2073 1561  750      1     2    1          750              0               0               0
8  2073 1951  751      0     3    1          750              1               1               1
9  2073 1762  762      1     4    2          762              0              11               0

# wanted output
   p_id  elo dayo fin1_0 games wins last_win_day days_since_win days_since_game games_since_win
1  1877 1841  750      1     1    1          750             10              10               0
2  1877 1841  750      0     2    1          750              0               0               0
3  1877 1851  751      0     3    1          750              1               1               1
4  1877 1820  751      1     4    2          751              0               1               0
5  1877 1762  762      0     5    2          751             11              11               11
6  2073 1561  750      0     1    0          750              3               7               0
7  2073 1561  750      1     2    1          750              0               0               0
8  2073 1951  751      0     3    1          750              1               1               1
9  2073 1762  762      1     4    2          762              0              11               0

1 Ответ

0 голосов
/ 17 февраля 2020

Вот подход, использующий case_when для отслеживания дня последней победы каждого игрока. Похоже, вы хотите показывать ноль дней с момента победы в первый день каждого игрока, даже если он ещё не выигрывал.

# Track the day of each player's most recent win, with games sorted by day.
# A win row, or the player's first row, contributes its own day; every other
# row contributes -Inf. The running maximum then carries the latest win day
# forward across any number of non-winning games, rather than echoing the
# previous game's day as a lag(dayo) fallback would.
my.df %>%
  arrange(p_id, dayo) %>%
  group_by(p_id) %>%
  mutate(
    games = row_number(),
    wins = cumsum(fin1_0 == 1)
  ) %>%
  mutate(
    day_of_last_win = cummax(case_when(
      row_number() == 1 ~ dayo,   # seed: first game counts as day zero
      fin1_0 == 1       ~ dayo,   # a win resets the reference day
      TRUE              ~ -Inf    # non-win rows never raise the maximum
    )),
    days_since_last_win = dayo - day_of_last_win
  )

.

# A tibble: 19 x 8
# Groups:   p_id [4]
    p_id   elo  dayo fin1_0 games  wins day_of_last_win days_since_last_win
   <dbl> <dbl> <dbl>  <dbl> <int> <int>           <dbl>               <dbl>
 1  1877  1841   750      1     1     1             750                   0
 2  1877  1841   750      0     2     1             750                   0
 3  1877  1851   751      0     3     1             750                   1
 4  1877  1820   751      1     4     2             751                   0
 5  1877  1762   762      0     5     2             751                  11
 6  2073  1561   750      0     1     0             750                   0
 7  2073  1561   750      1     2     1             750                   0
 8  2073  1951   751      0     3     1             750                   1
 9  2073  1762   762      1     4     2             762                   0
10  2210  1651   750      0     1     0             750                   0
11  2210  1651   750      0     2     0             750                   0
12  2210  1551   751      0     3     0             750                   1
13  2210  2230   751      0     4     0             750                   1
14  2210  1762   762      0     5     0             750                  12
15  2232  1741   750      0     1     0             750                   0
16  2232  1741   750      0     2     0             750                   0
17  2232  1751   751      1     3     1             751                   0
18  2232  1751   751      1     4     2             751                   0
19  2232  1660   751      0     5     2             751                   0
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...