Я борюсь с этим. Мне нужно создать таблицу (или фрейм данных) подсчета и процентного содержания NA по столбцам в моей базе данных поездов. Я пытаюсь сделать с этим циклом:
df <- setNames(data.frame(matrix(ncol = 3, nrow = 0)), c("Variable_name", "%NA is_pass=1", "%NA is_pass=0"))
for (i in names(train)){
v <- c(
i,
round(sum(train[,c(i,"is_pass")]$is_pass == 1 & is.na(train[,c(i,"is_pass")]$i))/sum(train[,c(i,"is_pass")]$is_pass == 1)*100,2),
round(sum(train[,c(i,"is_pass")]$is_pass == 0 & is.na(train[,c(i,"is_pass")]$i))/sum(train[,c(i,"is_pass")]$is_pass == 0)*100,2)
)
df <- rbind(df,v)
}
Тем не менее, я продолжаю иметь эту ошибку (для упрощения я просто поместил для первых двух переменных:
Warning message in `[<-.factor`(`*tmp*`, ri, value = "program_id"):
“invalid factor level, NA generated”
Warning message in `[<-.factor`(`*tmp*`, ri, value = "program_type"):
“invalid factor level, NA generated”
Просмотр данных:
> head(train)
id program_id program_type program_duration test_id test_type
1 9389_150 Y_1 Y 136 150 offline
2 16523_44 T_1 T 131 44 offline
3 13987_178 Z_2 Z 120 178 online
4 13158_32 T_2 T 117 32 offline
5 10591_84 V_3 V 131 84 offline
6 12531_23 T_3 T 134 23 offline
difficulty_level trainee_id gender education city_tier age
1 intermediate 9389 M Matriculation 3 24
2 easy 16523 F High School Diploma 4 26
3 easy 13987 M Matriculation 1 40
4 easy 13158 F Matriculation 3 NA
5 intermediate 10591 F High School Diploma 1 42
6 intermediate 12531 F High School Diploma 1 29
total_programs_enrolled is_handicapped trainee_engagement_rating is_pass
1 5 N 1 0
2 2 N 3 1
3 1 N 2 1
4 4 N 1 1
5 2 N 4 1
6 4 N 2 0
Пример данных:
structure(list(program_id = structure(c(16L, 3L, 21L, 4L, 11L,
5L), .Label = c("S_1", "S_2", "T_1", "T_2", "T_3", "T_4", "U_1",
"U_2", "V_1", "V_2", "V_3", "V_4", "X_1", "X_2", "X_3", "Y_1",
"Y_2", "Y_3", "Y_4", "Z_1", "Z_2", "Z_3"), class = "factor"),
program_type = structure(c(6L, 2L, 7L, 2L, 4L, 2L), .Label = c("S",
"T", "U", "V", "X", "Y", "Z"), class = "factor"), program_duration = c(136L,
131L, 120L, 117L, 131L, 134L), test_id = c(150L, 44L, 178L,
32L, 84L, 23L), test_type = structure(c(1L, 1L, 2L, 1L, 1L,
1L), .Label = c("offline", "online"), class = "factor"),
difficulty_level = structure(c(3L, 1L, 1L, 1L, 3L, 3L), .Label = c("easy",
"hard", "intermediate", "vary hard"), class = "factor"),
trainee_id = c(9389L, 16523L, 13987L, 13158L, 10591L, 12531L
), gender = structure(c(2L, 1L, 2L, 1L, 1L, 1L), .Label = c("F",
"M"), class = "factor"), education = structure(c(4L, 2L,
4L, 4L, 2L, 2L), .Label = c("Bachelors", "High School Diploma",
"Masters", "Matriculation", "No Qualification"), class = "factor"),
city_tier = c(3L, 4L, 1L, 3L, 1L, 1L), age = c(24L, 26L,
40L, NA, 42L, 29L), total_programs_enrolled = c(5L, 2L, 1L,
4L, 2L, 4L), is_handicapped = structure(c(1L, 1L, 1L, 1L,
1L, 1L), .Label = c("N", "Y"), class = "factor"), trainee_engagement_rating = c(1L,
3L, 2L, 1L, 4L, 2L), is_pass = c(0L, 1L, 1L, 1L, 1L, 0L)), .Names = c("program_id",
"program_type", "program_duration", "test_id", "test_type", "difficulty_level",
"trainee_id", "gender", "education", "city_tier", "age", "total_programs_enrolled",
"is_handicapped", "trainee_engagement_rating", "is_pass"), row.names = c(NA,
6L), class = "data.frame")