R: группировка и подсчёт количества дней с момента события - PullRequest
0 голосов
/ 07 февраля 2020
# Small data set for chess players with player id (p_id), days from some starting date (dayo)
# an Elo rating (elo) and a result win or loss (fin1_0)

# I am building a predictive model that includes a feature that is days since the player last won 
# (days_since_win)

# I would like to set for each player a default value, say 9, if the player had no previous games 
# recorded

# The output for player 1877 is shown but my code does not reset after a win and fails to account for 
# a player having more than one game on a particular day
# I have reviewed similar group and count questions and answers for several days but am not making 
# progress.
# Any assistance appreciated!

library(dplyr) # any alternative package is fine.

# Example data: one row per game, 45 games in total.
# Player id of each game.
p_id <- c(
  2232, 1877, 2210, 2073, 2232, 1877, 2210, 2073, 3332, 2961,
  2283, 1877, 2232, 3332, 2210, 2073, 2210, 2283, 1877, 2232,
  2073, 2210, 2283, 1877, 2232, 2961, 2232, 1877, 2210, 2073,
  3332, 2961, 2283, 2232, 1877, 2210, 2073, 3332, 1877, 2210,
  2073, 3332, 2232, 1877, 2210
)
# Day offset of each game; the games are listed in day order, so the column
# is a run of repeated day values.
dayo <- rep(
  c(750, 751, 760, 762, 763, 766, 767, 769),
  times = c(4, 7, 5, 4, 6, 7, 5, 7)
)
# Elo rating recorded for each game.
elo <- c(
  1741, 1841, 1651, 1561, 1751, 1851, 1651, 1551, 2251, 2051,
  1951, 1760, 1820, 1660, 1569, 2230, 1762, 1762, 1762, 1762,
  1763, 1763, 1763, 1763, 1763, 1763, 1755, 1856, 1657, 1558,
  2250, 2054, 1959, 1757, 1758, 2150, 1954, 1950, 1759, 2159,
  1959, 1957, 1741, 1841, 1651
)
# Result of each game: 1 = win, 0 = loss.
fin1_0 <- c(
  0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
  0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
  0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
  1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
  1, 0, 0, 0, 1
)
my.df <- data.frame(p_id = p_id, elo = elo, dayo = dayo, fin1_0 = fin1_0)
print(my.df)

# Attempt: group by the player's id and a win/loss indicator, then number the
# rows inside each group.
# NOTE: this is the non-working attempt the question is about. As the `grp`
# column in the output below shows, `grp` only separates wins (0) from
# losses (1), so all of a player's losses share one group and the counter
# never resets after a win. seq(n()) also counts rows (games), not elapsed
# days.
tbl_df <- my.df %>%
    group_by(p_id,grp = cumsum(c(1,diff(fin1_0 !=1))) ) %>%
    mutate(days_since_win = seq(n()) )
# Convert the result with data.frame() and sort by player, then by day.
my.df <- data.frame(tbl_df)
my.df <- my.df[order(my.df$p_id,my.df$dayo),]
my.df

# truncated output
#   p_id  elo dayo fin1_0 grp days_since_win
#2  1877 1841  750      0   1              1
#6  1877 1851  751      0   1              2
#12 1877 1760  760      0   1              3
#19 1877 1762  762      0   1              4
#24 1877 1763  763      1   0              1
#28 1877 1856  766      0   1              5
#35 1877 1758  767      1   0              2
#39 1877 1759  769      0   1              6
#44 1877 1841  769      0   1              7

# wanted output
#   p_id  elo dayo fin1_0 grp days_since_win
#2  1877 1841  750      0   1              9  # default value for a player's first game
#6  1877 1851  751      0   1              1   # played 1 day ago and no win
#12 1877 1760  760      0   1              10  # no win for 10 days
#19 1877 1762  762      0   1              12  # no win for 12 days
#24 1877 1763  763      1   0              0   # player won this day
#28 1877 1856  766      0   1              3   # no win for three days
#35 1877 1758  767      1   0              0   # player won this day
#39 1877 1759  769      0   1              2   # no win for two days
#44 1877 1841  769      0   1              2   # no win for two days

Ответы [ 2 ]

1 голос
/ 07 февраля 2020

Сначала отсортируем данные по p_id и dayo (arrange), затем сгруппируем их по p_id и по каждому выигрышу (group_by) и вычтем из текущего дня первый день в группе. После этого заменим (replace) первое значение для каждого игрока на 9.

library(dplyr)

# Days since each player's most recent win, with a default for the first game.
#
# Rows are ordered by player and day; `grp` increases by one at every win, so
# each (p_id, grp) group starts at a win (or, before a player's first win, at
# that player's first recorded game). Within a group, the offset of `dayo`
# from the group's first row is therefore the number of days since that win.
my.df %>%
  arrange(p_id, dayo) %>%
  group_by(p_id, grp = cumsum(fin1_0 == 1)) %>%
  mutate(days_since_win = dayo - first(dayo)) %>%
  group_by(p_id) %>%
  # A player's first recorded game has no history: use the default value 9.
  mutate(days_since_win = replace(days_since_win, row_number() == 1, 9)) %>%
  # Drop the grouping so it does not leak into whatever uses the result next.
  ungroup() %>%
  select(-grp)

#    p_id   elo  dayo fin1_0 days_since_win
#   <dbl> <dbl> <dbl>  <dbl>          <dbl>
# 1  1877  1841   750      0              9
# 2  1877  1851   751      0              1
# 3  1877  1760   760      0             10
# 4  1877  1762   762      0             12
# 5  1877  1763   763      1              0
# 6  1877  1856   766      0              3
# 7  1877  1758   767      1              0
# 8  1877  1759   769      0              2
# 9  1877  1841   769      0              2
#10  2073  1561   750      1              9
# … with 35 more rows
0 голосов
/ 07 февраля 2020

Ответ Ронака великолепен. Вот аналогичный, хотя и более подробный, ответ dplyr.

# Step-by-step variant that keeps explicit helper columns:
#   rownum       - running game number per player
#   wincount     - running number of wins per player
#   last_win_day - earliest day within the current (p_id, wincount) stretch
my.df %>%
  arrange(p_id, dayo) %>%
  group_by(p_id) %>%
  mutate(
    rownum = row_number(),
    wincount = cumsum(fin1_0)
  ) %>%
  group_by(p_id, wincount) %>%
  mutate(last_win_day = min(dayo)) %>%
  ungroup() %>%
  # The first game of every player gets the default value 9.
  mutate(days_since_win = ifelse(rownum == 1, 9, dayo - last_win_day)) %>%
  select(p_id, elo, dayo, fin1_0, days_since_win)

    p_id   elo  dayo fin1_0 days_since_win
   <dbl> <dbl> <dbl>  <dbl>          <dbl>
 1  1877  1841   750      0              9
 2  1877  1851   751      0              1
 3  1877  1760   760      0             10
 4  1877  1762   762      0             12
 5  1877  1763   763      1              0
 6  1877  1856   766      0              3
 7  1877  1758   767      1              0
 8  1877  1759   769      0              2
 9  1877  1841   769      0              2
10  2073  1561   750      1              9
# … with 35 more rows

Возможно, будет полезно изменить последний вызов mutate следующим образом:

# Drop-in replacement for the final mutate() of the pipeline above.
# NOTE(review): for per-player values, lag() must be evaluated with the data
# still grouped by p_id and ordered by dayo - confirm where this call is placed.
# NOTE(review): rownum is the player's running game number, so
# games_since_last_win as written counts all games, not games since a win.
mutate(days_since_win = ifelse(rownum==1,9,dayo-last_win_day),
       days_since_last_game = dayo - lag(dayo, 1),  # column is `dayo`, not `day`
       games_since_last_win = rownum)

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...