Вот подход на основе tidyverse, который удалит примерно 20% ваших данных в каждом указанном вами столбце:
set.seed(1)

# Example data: N rows with an id and three simulated numeric measurements.
# rnorm() takes one mean per draw, so with N draws only the first N values
# of each mean range are used (e.g. 18:37 for age when N is 20).
N <- 20
data <- data.frame(
  id = seq_len(N),
  age = rnorm(N, 18:90),
  bmi = rnorm(N, 15:40),
  chol = rnorm(N, 50:350)
)

library(tidyverse)

# Columns that should receive missing values, and the probability that any
# single cell in those columns is blanked out.
c_names <- c("age", "bmi")
prc_missing <- 0.20

# Every cell of the selected columns gets its own uniform draw on [0, 1] and
# becomes NA when the draw is at or below the threshold, so each column loses
# roughly (not exactly) prc_missing of its values.
# The columns are edited in place instead of reshaping to long format and
# back: columns not listed in c_names are never touched, so they keep their
# values and their types even when the data frame mixes column types.
data %>%
  mutate(across(
    all_of(c_names),
    function(col) replace(col, runif(length(col)) <= prc_missing, NA)
  ))
# id age bmi chol
# 1 1 17.37355 15.91898 49.83548
# 2 2 19.18364 16.78214 50.74664
# 3 3 19.16437 17.07456 52.69696
# 4 4 NA 16.01065 53.55666
# 5 5 22.32951 19.61983 53.31124
# 6 6 22.17953 19.94387 54.29250
# 7 7 24.48743 NA 56.36458
# 8 8 25.73832 20.52925 57.76853
# 9 9 26.57578 NA 57.88765
# 10 10 26.69461 24.41794 59.88111
# 11 11 29.51178 26.35868 60.39811
# 12 12 NA 25.89721 60.38797
# 13 13 NA 27.38767 62.34112
# 14 14 28.78530 27.94619 61.87064
# 15 15 33.12493 27.62294 65.43302
# 16 16 32.95507 NA 66.98040
# 17 17 33.98381 30.60571 65.63278
# 18 18 35.94384 NA 65.95587
# 19 19 36.82122 34.10003 68.56972
# 20 20 37.59390 34.76318 68.86495
А вот альтернатива, которая удалит ровно 20% данных в столбцах, которые вы укажете:
set.seed(1)

# Build a small demo data set: an id plus three simulated measurements.
N <- 20
data <- data.frame(
  id = 1:N,
  age = rnorm(n = N, mean = 18:90),
  bmi = rnorm(n = N, mean = 15:40),
  chol = rnorm(n = N, mean = 50:350)
)

library(tidyverse)

# Which columns get missing values, and what share of each column to blank.
c_names <- c("age", "bmi")
prc_missing <- 0.20
n_remove <- prc_missing * nrow(data) # number of cells to blank per column

data %>%
  gather(key = "var", value = "value", -id) %>% # long format: one row per cell
  sample_frac(size = 1) %>% # put all rows in random order
  group_by(var) %>% # number the rows separately within each variable
  mutate(
    # After the shuffle, the first n_remove rows of a targeted variable are a
    # random pick of its cells; exactly those are set to NA.
    value = replace(value, var %in% c_names & row_number() <= n_remove, NA)
  ) %>%
  spread(key = var, value = value) # back to one column per variable
# # A tibble: 20 x 4
# id age bmi chol
# <int> <dbl> <dbl> <dbl>
# 1 1 17.4 15.9 49.8
# 2 2 19.2 16.8 50.7
# 3 3 19.2 17.1 52.7
# 4 4 NA 16.0 53.6
# 5 5 22.3 NA 53.3
# 6 6 22.2 19.9 54.3
# 7 7 24.5 20.8 56.4
# 8 8 25.7 NA 57.8
# 9 9 26.6 NA 57.9
# 10 10 NA NA 59.9
# 11 11 NA 26.4 60.4
# 12 12 NA 25.9 60.4
# 13 13 29.4 27.4 62.3
# 14 14 28.8 27.9 61.9
# 15 15 33.1 27.6 65.4
# 16 16 33.0 29.6 67.0
# 17 17 34.0 30.6 65.6
# 18 18 35.9 31.9 66.0
# 19 19 36.8 34.1 68.6
# 20 20 37.6 34.8 68.9