Вот ещё один вариант с использованием tidyverse. Данные об «эпизодах» (spells) — это распространённое преобразование панельных данных; в подходе tidyverse основной приём, на мой взгляд, состоит в том, чтобы создать переменную эпизода (spell), аналогичную переменной «run» в исходном коде автора вопроса.
# libraries
library(tidyverse)
library(zoo)
library(lubridate)
# Example panel: 10 individuals (ID), each observed in three consecutive months.
# Rows are laid out month by month, i.e. IDs 1..10 for September, then October,
# then November.
ID <- rep(1:10, times = 3)
date <- rep(c("2006-09-01", "2006-10-01", "2006-11-01"), each = 10)
# Activity per row; missing observations are encoded as the literal string "NA"
# at this point (they are converted to real NA values later in the script).
act <- c(
  # 2006-09
  "Unemployed", "Employment", "Education", "Education", "Education",
  "Education", "Education", "Education", "Education", "Unemployed",
  # 2006-10
  "Education", "Unemployed", "Unemployed", "Unemployed", "Education",
  "Education", "Employment", "Education", "Education", "NA",
  # 2006-11
  "Unemployed", "Unemployed", "NA", "Unemployed", "Education",
  "Employment", "Employment", "NA", "Education", "Unemployed"
)
df <- data.frame(ID, date, act)
# Display the panel sorted by individual (the result is not assigned).
df[order(ID), ]
# Normalise the column types in a single pipeline:
#   act     -> character (in case it was read in as a factor)
#   date    -> Date, parsed as year-month-day
#   yearmon -> zoo year-month value, since each date really denotes a month
# Finally, the literal string "NA" in act is turned into a real missing value.
df <- df %>%
  mutate(
    act = as.character(act),
    date = lubridate::ymd(date),
    yearmon = zoo::as.yearmon(date)
  ) %>%
  mutate(act = ifelse(act == "NA", NA, act))
# Build "act2": a copy of act in which a missing month is filled in only when
# the months immediately before and after it (within the same ID) carry the
# SAME activity.
#   Unemployed NA Unemployed -> Unemployed Unemployed Unemployed
#   Education  NA Unemployed -> unchanged (the NA stays)
# Rows are sorted by ID and date first so that lag()/lead() refer to the
# previous/next month of the same individual.
df <- df %>%
  arrange(ID, date) %>%
  group_by(ID) %>%
  mutate(
    act2 = coalesce(
      # keep the observed activity whenever there is one
      act,
      # otherwise use the neighbour's activity, but only if both neighbours
      # agree; a missing neighbour makes the comparison NA, which stays NA
      if_else(lag(act) == lead(act), lead(act), NA_character_)
    )
  )
# Create the "spell" variable (the analogue of "run" in the example code):
# within each ID it numbers the spell the current row belongs to.
# --- this is the most important part of the code ---
#
# A new spell starts on a row when
#   * act2 is missing (every unfilled NA row is its own spell), or
#   * act2 differs from the previous row's act2, or
#   * the previous row's act2 is missing and this is not the first row of the
#     ID. Without this last rule, `act2 != lag(act2)` is NA for a row that
#     follows an NA and would be treated as "no change": a sequence such as
#     Education, NA, Unemployed would put Unemployed into the NA spell and the
#     Unemployed spell would never be counted.
# On the first row of an ID lag() is NA only because there is no previous row,
# so that row does not open an additional spell.
df <- df %>%
  group_by(ID) %>%
  mutate(
    spell = cumsum(
      is.na(act2) | coalesce(act2 != lag(act2), row_number() > 1)
    ) + 1
  )
# End-of-observation marker: the observed year-month plus one month, used for
# the duration arithmetic below.
# (This relies on the data being monthly. Otherwise lead(date) within ID could
# be used instead, but then the end of the panel, where lead(date) is NA,
# would need special handling.)
df$yearmonplusmonth <- df$yearmon + 1 / 12
# One row per ID-spell combination: the spell's activity and its length in
# months (the year-month difference is in years, hence the factor 12).
spells <- summarize(
  group_by(df, ID, spell),
  spelltype = first(act2),
  duration = 12 * (max(yearmonplusmonth) - min(yearmon))
)
# Collapse the spells to one row per ID and spell type, keeping the number of
# spells of that type and their total duration.
spellsummary <- summarize(
  group_by(spells, ID, spelltype),
  spell_count = n(),
  duration = sum(duration)
)
# A spell type that never occurs for an ID has no row in spellsummary, so
# build every ID x spelltype combination and join it in; the combinations
# that were missing get zero spells and zero duration.
# stringsAsFactors = FALSE keeps spelltype a character column, so the join key
# has the same type on both sides (expand.grid() would otherwise create a
# factor).
temp <- expand.grid(
  ID = unique(spellsummary$ID),
  spelltype = unique(spellsummary$spelltype),
  stringsAsFactors = FALSE
)
spellsummary <- full_join(spellsummary, temp, by = c("ID", "spelltype"))
# Replace the NAs introduced by the join with 0 and round to whole numbers
# (durations come out of floating-point year-month arithmetic).
# A formula lambda is used instead of the deprecated funs().
spellsummary <- spellsummary %>%
  mutate_at(
    vars(spell_count, duration),
    ~ round(coalesce(as.numeric(.), 0), 0)
  )
spellsummary <- spellsummary %>% arrange(ID, spelltype)
# Show only the "Unemployed" rows: one line per ID with its number of
# unemployment spells and their total duration.
filter(spellsummary, spelltype == "Unemployed")
# A tibble: 10 x 4
# Groups: ID [10]
# ID spelltype spell_count duration
# <int> <chr> <dbl> <dbl>
# 1 1 Unemployed 2 2
# 2 2 Unemployed 1 2
# 3 3 Unemployed 1 1
# 4 4 Unemployed 1 2
# 5 5 Unemployed 0 0
# 6 6 Unemployed 0 0
# 7 7 Unemployed 0 0
# 8 8 Unemployed 0 0
# 9 9 Unemployed 0 0
# 10 10 Unemployed 1 3
Примечание: в последней строке у меня получается длительность 3, а не 2, как в желаемом выводе автора вопроса. Причина в том, что я считаю последовательность Unemp NA Unemp равной Unemp Unemp Unemp как при подсчёте spell_count, так и при вычислении длительности. Автору вопроса нужно, чтобы это выполнялось для spell_count, но не для длительности. Добиться этого можно, например, используя переменную «act» для вычисления длительности, а переменную «act2» — для вычисления spell_count; оставляю это читателю в качестве упражнения.