Сначала сгенерируем случайный набор данных для примера:
# Build a reproducible toy data set: ten labels shaped like
# "<years> YEAR, <months> MONTHS", stored as a factor column.
# NOTE(review): the values drawn by sample() for a given seed depend on the
# R version's sampling algorithm (the default changed in R 3.6.0), so the
# listing below may not reproduce exactly on a newer R -- confirm.
set.seed(2018)
x <- factor(paste(
  sample(0:10, 10, replace = TRUE), "YEAR,",
  sample(0:11, 10, replace = TRUE), "MONTHS"
))
df <- data.frame(years_of_experience = x)
# years_of_experience
#1 3 YEAR, 4 MONTHS
#2 5 YEAR, 7 MONTHS
#3 0 YEAR, 11 MONTHS
#4 2 YEAR, 8 MONTHS
#5 5 YEAR, 9 MONTHS
#6 3 YEAR, 7 MONTHS
#7 6 YEAR, 3 MONTHS
#8 1 YEAR, 6 MONTHS
#9 10 YEAR, 8 MONTHS
#10 6 YEAR, 9 MONTHS
Затем мы можем использовать case_when,
чтобы разбить years_of_experience на группы (бины)
по числу полных лет:
# Derive the whole-year count from each experience label and bin it into
# "<1", the exact year (1-4), or ">=5".
# Requires dplyr to be attached (`%>%`, mutate(), case_when()).
df.new <- df %>%
  mutate(
    # Read the label from the data frame's own column rather than from a
    # global vector, so the result stays correct for any `df` (filtered,
    # reordered, or built elsewhere). Everything from " YEAR" onward is
    # stripped, leaving only the leading number.
    yr = as.numeric(gsub(" YEAR.*$", "", years_of_experience)),
    # case_when() tests the conditions in order; the first match wins and
    # TRUE acts as the fallback branch.
    bucket = case_when(
      yr < 1 ~ "<1",
      yr >= 5 ~ ">=5",
      TRUE ~ as.character(yr)
    )
  )
df.new
# years_of_experience yr bucket
#1 3 YEAR, 4 MONTHS 3 3
#2 5 YEAR, 7 MONTHS 5 >=5
#3 0 YEAR, 11 MONTHS 0 <1
#4 2 YEAR, 8 MONTHS 2 2
#5 5 YEAR, 9 MONTHS 5 >=5
#6 3 YEAR, 7 MONTHS 3 3
#7 6 YEAR, 3 MONTHS 6 >=5
#8 1 YEAR, 6 MONTHS 1 1
#9 10 YEAR, 8 MONTHS 10 >=5
#10 6 YEAR, 9 MONTHS 6 >=5
Мы можем преобразовать df.new$bucket
в factor,
который имеет 5 уровней:
# Take the character bucket labels out of df.new and return them as a factor
# vector (auto-printed at top level).
as.factor(df.new[["bucket"]])
# [1] 3 >=5 <1 2 >=5 3 >=5 1 >=5 >=5
#Levels: <1 >=5 1 2 3