Не уверен, что это именно то, что вам нужно, но с функцией cut() код получается гораздо менее громоздким.
# Bin `number` into labelled intervals. cut() closes intervals on the right by
# default, so the bins are (-Inf, 0], (0, 7], (7, 14], ... and negative values
# also land in "0". The result is a factor; NA inputs stay NA.
data[["category"]] <- cut(
  data[["number"]],
  # Interval edges -- you decide the cuts.
  breaks = c(-Inf, 0, 7, 14, 31, 62, Inf),
  # One label per interval, in the same order as the intervals.
  labels = c("0", "[1..7]", "[8..14]", "[15 ... 31]", "[32 ... 62]", "63++")
)
К сожалению, для преобразования NA в "Unknown" нужны ещё эти две строки:
# A factor only accepts values that are among its levels, so "Unknown" has to
# be registered as an extra (last) level before it can be assigned.
data$category <- factor(
  data$category,
  levels = c(levels(data$category), "Unknown")
)
# Rows whose number is missing get the "Unknown" category.
data$category <- replace(data$category, is.na(data$number), "Unknown")
# Auto-print the data frame to inspect the result.
data
# Output:
#    raw number category
# 1    0      0        0
# 2   13     13  [8..14]
# 3 NULL     NA  Unknown
Данные:
# Sample input: numeric values stored as text, with the literal string "NULL"
# marking a missing value.
raw <- c("0", "13", "NULL")
data <- data.frame(raw)
# Map the "NULL" marker to NA before converting, so as.numeric() does not emit
# an "NAs introduced by coercion" warning for this expected case; any other
# non-numeric text still warns. as.character() keeps this working when `raw`
# is a factor column (the data.frame() default before R 4.0), and %in% never
# yields NA in the replacement index.
data$number <- as.numeric(
  replace(as.character(data$raw), data$raw %in% "NULL", NA_character_)
)
Сравнение производительности (бенчмарк):
# Time four ways of binning `data$number`; each named argument is one
# candidate expression.
microbenchmark::microbenchmark(
  # cut(): factor with explicit breaks and labels, then NA -> "Unknown".
  cut = {
    data$category <- cut(
      data$number,
      breaks = c(-Inf, 0, 7, 14, 31, 62, Inf),
      labels = c("0", "[1..7]", "[8..14]", "[15 ... 31]", "[32 ... 62]", "63++")
    )
    levels(data$category) <- c(levels(data$category), "Unknown")
    data$category[is.na(data$number)] <- "Unknown"
  },
  # findInterval(): derive the labels from the break vector and index into
  # them. The result is kept in `res`, not written back to `data`.
  findInt = {
    vec <- c(0, 7, 14, 31, 62)
    levels <- c(
      vec[1],
      sprintf("[%d ... %d]", (vec + 1)[-length(vec)], vec[-1]),
      paste0(vec[length(vec)] + 1, "++")
    )
    res <- levels[findInterval(data$number, vec, left.open = TRUE) + 1]
    res[is.na(res)] <- "unknown"
  },
  # lapply(): element-wise if/else chain. lapply() returns a list, so
  # `category` becomes a list column in this variant.
  lapply = {
    data$category <- lapply(data$number, function(x) {
      if (is.na(x) || is.null(x)) "unknown"
      else if (x == 0) "0"
      else if (x > 0 & x <= 7) "[1 ... 7]"
      else if (x > 7 & x <= 14) "[8 ... 14]"
      else if (x > 14 & x <= 31) "[15 ... 31]"
      else if (x > 31 & x <= 62) "[32 ... 62]"
      else if (x > 62) "63++"
      else "unknown"
    })
  },
  # ifelse(): vectorised nested conditionals; "???" is the fall-through value.
  ifelse = {
    data$category <-
      ifelse(is.na(data$number), "unknown",
        ifelse(data$number == 0, "0",
          ifelse(data$number > 0 & data$number <= 7, "[1 ... 7]",
            ifelse(data$number > 7 & data$number <= 14, "[8 ... 14]",
              ifelse(data$number > 14 & data$number <= 31, "[15 ... 31]",
                ifelse(data$number > 31 & data$number <= 62, "[32 ... 62]",
                  ifelse(data$number > 62, "63++", "???")))))))
  }
)
Дает:
# Unit: microseconds
# expr min lq mean median uq max neval
# cut 132.207 139.4185 154.78149 144.9770 154.5925 283.043 100
# findInt 18.329 21.7850 26.58004 26.2915 28.8460 60.996 100
# lapply 14.122 15.6250 4269.73574 17.2770 18.7800 425198.055 100
# ifelse 81.728 84.8835 96.09675 88.9400 96.3010 193.503 100