library(truncnorm)
library(microbenchmark)
num_obs=10000
Observation=seq(1,num_obs)
Obs_Type=sample(1:4, num_obs, replace=T)
Upper_bound = runif(num_obs,0,1)
Lower_bound=runif(num_obs,2,4)
mean = runif(num_obs,10,15)
df1= data.frame(Observation,Obs_Type,Upper_bound,Lower_bound,mean)
df1$draw_value = 0
###########################
# Your example
###########################
Trial_func=function(df1, seed=NULL){
if (!is.null(seed)) set.seed(seed)
for (i in 1:nrow(df1)){
if (df1[i,"Obs_Type"] ==1){
#If Type == 1; then a=-Inf, b = Upper_Bound
df1[i,"draw_value"] = rtruncnorm(1,a=-Inf,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
} else if (df1[i,"Obs_Type"] ==2){
#If Type == 2; then a=-10, b = Upper_Bound
df1[i,"draw_value"] = rtruncnorm(1,a=-10,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
} else if(df1[i,"Obs_Type"] ==3){
#If Type == 3; then a=Lower_bound, b = Inf
df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=Inf,mean= df1[i,"mean"],sd=1)
} else {
#If Type == 3; then a=Lower_bound, b = 10
df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=10,mean= df1[i,"mean"],sd=1)
}
}
return(df1)
}
#############################
# Vectorized version
#############################
# for each row-elements define a function
truncated_normal <- function(obs_type, lower_bound, upper_bound, mean, sd) {
if (obs_type == 1) {
rtruncnorm(1, a=-Inf, b=upper_bound, mean=mean, sd=sd)
} else if (obs_type == 2){
rtruncnorm(1, a=-10, b=upper_bound, mean=mean, sd=sd)
} else if(obs_type == 3){
rtruncnorm(1, a=lower_bound, b=Inf, mean=mean, sd=sd)
} else {
rtruncnorm(1, a=lower_bound, b=10, mean=mean, sd=sd)
}
}
# vectorize it
truncated_normal <- Vectorize(truncated_normal)
Trial_func_vec <- function(df, res_col="draw_value", seed=NULL) {
if (!is.null(seed)) set.seed(seed)
df[, res_col] <- truncated_normal(
obs_type=df[, "Obs_Type"],
lower_bound=df[, "Lower_bound"],
upper_bound=df[, "Upper_bound"],
mean=df[,"mean"],
sd=1)
df
}
#Benchmarking
set.seed(1)
mbm=microbenchmark(Trial_func(df=df1),times = 10)
summary(mbm)
set.seed(1)
mbm_vec=microbenchmark(Trial_func_vec(df=df1),times = 10)
summary(mbm_vec)
## vectorization roughly 3x faster!
#For obtaining the new data
set.seed(1) # important so that randomization is reproducible
new_data=Trial_func(df=df1)
set.seed(1) # important so that randomization is reproducible
vec_data=Trial_func_vec(df=df1)
# since in both cases random number generator is provoked
# exactly once per row in the order of the rows,
# resulting df should be absolutely identical.
all(new_data == vec_data) ## TRUE! They are absolutely identical.
# proving that your code does - in principle - exactly the same
# like my vectorized code
Результаты сравнительного анализа
# @Rui Barradas' function
Trial_func2 <- function(df1){
i1 <- df1[["Obs_Type"]] == 1
i2 <- df1[["Obs_Type"]] == 2
i3 <- df1[["Obs_Type"]] == 3
i4 <- df1[["Obs_Type"]] == 4
#If Type == 1; then a=-Inf, b = Upper_Bound
df1[i1, "draw_value"] <- rtruncnorm(sum(i1), a =-Inf,
b = df1[i1, "Upper_bound"],
mean = df1[i1, "mean"], sd = 1)
#If Type == 2; then a=-10, b = Upper_Bound
df1[i2, "draw_value"] <- rtruncnorm(sum(i2), a = -10,
b = df1[i2 , "Upper_bound"],
mean = df1[i2, "mean"], sd = 1)
#If Type == 3; then a=Lower_bound, b = Inf
df1[i3,"draw_value"] <- rtruncnorm(sum(i3),
a = df1[i3, "Lower_bound"],
b = Inf, mean = df1[i3, "mean"],
sd = 1)
#If Type == 3; then a=Lower_bound, b = 10
df1[i4, "draw_value"] <- rtruncnorm(sum(i4),
a = df1[i4, "Lower_bound"],
b = 10,
mean = df1[i4,"mean"],
sd = 1)
df1
}
# @Dave2e's function
library(dplyr)
Trial_func_dplyr <- function(df1) {
df1[,"draw_value"] <- case_when(
df1$Obs_Type == 1 ~ rtruncnorm(1,a=-Inf,b=df1[,"Upper_bound"],mean= df1[,"mean"], sd=1),
df1$Obs_Type == 2 ~ rtruncnorm(1,a=-10,b=df1[,"Upper_bound"],mean= df1[,"mean"],sd=1),
df1$Obs_Type == 3 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=Inf,mean= df1[,"mean"],sd=1),
df1$Obs_Type == 4 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=10,mean= df1[,"mean"],sd=1)
)
df1
}
#Benchmarking
set.seed(1)
mbm <- microbenchmark(
loop = Trial_func(df1=df1),
ruy_vect = Trial_func2(df1=df1),
my_vect = Trial_func_vec(df=df1),
cwhen = Trial_func_dplyr(df1=df1),
times=10)
print(mbm, order = "median")
# > print(mbm, order = "median")
# Unit: milliseconds
# expr min lq mean median uq max
# ruy_vect 7.583821 7.879766 11.59954 8.815835 10.33289 36.60468
# cwhen 22.563190 23.103670 25.13804 23.965722 26.49628 30.63777
# my_vect 1326.771297 1373.415302 1413.75328 1410.995177 1484.28449 1506.11063
# loop 4149.424632 4269.475169 4486.41376 4423.527566 4742.96651 4911.31992
# neval cld
# 10 a
# 10 a
# 10 b
# 10 c
# @Rui's vectorize version wins by 3 magnitudes or order!!