Ошибка выхода за границы из наивной байесовской функции - PullRequest
0 голосов
/ 03 мая 2020

Я создаю пользовательскую наивную байесовскую функцию, в которой я показываю точность, ошибочную классификацию и отзыв. Мне еще предстоит закодировать отзыв. Помощь очень ценится Ниже приведен мой код для наивной байесовской функции:

naive_bayes <- function(training.dataset, test.dataset){  
  ##read data from file to a data frame
  training.table <- trainSparse
  test.table <- testSparse

  ##retrieve class and features from training data
  training.class <- training.table[, 200]
  training.features <- training.table[,-200]
  remove(training.table)

  ##funciton for calculating priors
  calculate.priors <- function(class.vector){
    priors <- c()
    for (class in unique(class.vector)){
      priors <- rbind(priors, c(class, length(class.vector[class.vector==class])/length(class.vector)))
      colnames(priors) <- c("classification", "probability")
    }
    return (priors)
  }
  priors <- calculate.priors(training.class)

  ##Learn the features by calculating likelihood
  likelihood.list <- list()
  #calculate CPD by feature
  for (i in 1:dim(training.features)[2]){
    feature.values <- training.features[, i]
    unique.feature.values <- unique(feature.values)
    likelihood.matrix <- matrix(rep(NA), nrow=dim(priors)[1], ncol=length(unique.feature.values))
    colnames(likelihood.matrix) <- unique.feature.values
    rownames(likelihood.matrix) <- priors[, "classification"]
    for (item in unique.feature.values){
      likelihood.item <- vector()
      for (class in priors[, "classification"]){
        feature.value.inclass <- feature.values[training.class==class]
        likelihood.value <- length(feature.value.inclass[feature.value.inclass==item])/length(feature.value.inclass)
        likelihood.item <- c(likelihood.item, likelihood.value)
      }
      likelihood.matrix[, item] <- likelihood.item
    }
    likelihood.list[[i]] <- likelihood.matrix
  }


  ##Predict class for the test dataset
  #retrieve the features and target class of the testing dataset
  test.features <- test.table[, -200]
  test.target.class <- test.table[, 200]
  test.predict.class <- rep(NA, length(test.target.class))
  remove(test.table)

  #calculate posteriors for each test data record
  for (i in 1:dim(test.features)[1]){
    record <- test.features[i, ]
    posterior <- vector()
    #calculate posteriors for each possible class of that record
    for (class in priors[, "classification"]){
      #initialize posterior as the prior value of that class
      posterior.value <- as.numeric(priors[priors[, "classification"]==class, 2])
      likelihood.v <- c()
      for (item in 1:length(record)){
        likelihood.value <- likelihood.list[[item]][class, as.character(record[1, item])]
        likelihood.v <- c(likelihood.v, likelihood.value)
        posterior.value <- as.numeric(posterior.value) * as.numeric(likelihood.value)
      }
      posterior <- rbind(posterior, c(class, posterior.value)) 
    }
    predict.class <- posterior[posterior[,2]==max(as.numeric(posterior[,2])),1]
    test.predict.class[i] <- predict.class
  }
  accuracy <- length(test.predict.class[test.predict.class==test.target.class])/length(test.target.class)
  missclassification <- 1 - accuracy

  print(paste("Accuracy:", accuracy))
  print(paste("\nMissclassification:", missclassification))
}

Я не уверен, почему я получаю следующую ошибку:

Error in `[<-`(`*tmp*`, , item, value = likelihood.item) : 
  subscript out of bounds

Ниже выводится мой поезд набор данных:

> dput(head(trainSparse,5))
structure(list(also = c(0, 0, 0, 0, 0), anoth = c(1, 0, 1, 1, 
0), anyon = c(0, 0, 0, 0, 0), anyth = c(1, 0, 0, 0, 0), articl = c(0, 
0, 0, 0, 0), avail = c(0, 0, 0, 0, 0), back = c(0, 0, 0, 0, 0
), base = c(0, 0, 0, 1, 0), believ = c(2, 0, 0, 0, 0), best = c(0, 
0, 0, 0, 0), better = c(0, 0, 0, 0, 0), book = c(0, 0, 0, 0, 
0), can = c(4, 0, 0, 0, 0), case = c(1, 0, 0, 0, 0), consid = c(0, 
0, 0, 0, 0), day = c(0, 0, 0, 0, 0), differ = c(0, 0, 0, 0, 0
), distribut = c(0, 0, 0, 0, 0), drive = c(0, 0, 0, 0, 0), etc = c(0, 
0, 0, 0, 0), even = c(3, 0, 0, 0, 0), exampl = c(0, 0, 0, 2, 
0), exist = c(0, 0, 0, 0, 0), file = c(0, 0, 0, 0, 0), first = c(1, 
0, 0, 0, 0), follow = c(2, 0, 0, 0, 0), found = c(0, 0, 0, 0, 
0), get = c(0, 0, 0, 0, 0), god = c(3, 0, 0, 0, 0), great = c(0, 
0, 0, 0, 0), group = c(0, 0, 0, 0, 0), help = c(0, 0, 0, 0, 0
), high = c(0, 0, 0, 0, 0), howev = c(0, 0, 0, 1, 0), idea = c(1, 
0, 0, 0, 0), includ = c(0, 0, 0, 0, 0), inform = c(0, 0, 0, 0, 
0), john = c(0, 0, 0, 0, 0), keyword = c(0, 0, 0, 0, 0), kind = c(0, 
0, 0, 0, 0), know = c(1, 0, 0, 0, 0), like = c(0, 0, 0, 0, 0), 
    line = c(1, 1, 1, 3, 1), list = c(0, 0, 0, 0, 0), live = c(0, 
    0, 0, 0, 0), look = c(0, 0, 0, 0, 0), make = c(2, 0, 0, 0, 
    0), mani = c(0, 0, 0, 0, 0), may = c(0, 0, 0, 0, 0), must = c(0, 
    0, 0, 1, 0), nation = c(0, 0, 0, 0, 0), new = c(0, 0, 0, 
    0, 0), number = c(0, 0, 0, 0, 0), old = c(0, 0, 0, 0, 0), 
    one = c(1, 0, 0, 0, 0), opinion = c(0, 0, 0, 0, 0), organ = c(1, 
    1, 1, 1, 1), origin = c(0, 0, 0, 0, 0), peopl = c(1, 0, 2, 
    0, 0), person = c(0, 0, 0, 0, 0), place = c(2, 0, 0, 0, 0
    ), possibl = c(0, 0, 0, 0, 0), post = c(0, 0, 0, 0, 0), probabl = c(0, 
    0, 0, 0, 0), quit = c(0, 0, 0, 0, 0), rather = c(0, 0, 0, 
    0, 0), read = c(0, 0, 0, 0, 0), reason = c(1, 0, 0, 1, 0), 
    right = c(0, 0, 0, 0, 0), say = c(0, 0, 0, 0, 0), second = c(0, 
    0, 0, 0, 0), see = c(0, 0, 0, 1, 0), seem = c(0, 0, 0, 0, 
    0), send = c(0, 0, 0, 0, 0), set = c(0, 0, 0, 0, 0), state = c(0, 
    0, 0, 1, 0), subject = c(1, 1, 1, 1, 1), sure = c(0, 0, 0, 
    0, 0), system = c(0, 0, 0, 6, 0), take = c(2, 0, 0, 0, 0), 
    technolog = c(0, 1, 0, 1, 1), think = c(0, 0, 0, 1, 0), thought = c(0, 
    0, 0, 0, 0), time = c(1, 0, 0, 0, 0), tri = c(0, 0, 0, 0, 
    0), true = c(0, 0, 1, 0, 0), univers = c(1, 0, 1, 0, 0), 
    usa = c(0, 0, 0, 0, 0), use = c(1, 0, 2, 0, 0), version = c(0, 
    0, 0, 0, 0), way = c(0, 0, 0, 0, 0), well = c(1, 0, 0, 3, 
    0), will = c(4, 0, 0, 0, 0), without = c(0, 0, 0, 0, 0), 
    word = c(0, 0, 0, 0, 0), work = c(0, 0, 0, 0, 0), world = c(0, 
    0, 0, 0, 0), write = c(0, 0, 0, 0, 0), wrote = c(0, 0, 0, 
    0, 0), abl = c(0, 0, 0, 0, 0), actual = c(0, 0, 0, 0, 0), 
    allow = c(1, 0, 0, 0, 0), alway = c(0, 0, 0, 0, 0), answer = c(0, 
    0, 0, 0, 0), around = c(0, 0, 0, 0, 0), ask = c(0, 0, 0, 
    0, 0), call = c(0, 0, 0, 0, 0), cant = c(1, 0, 0, 1, 0), 
    care = c(0, 0, 0, 0, 0), caus = c(1, 0, 1, 0, 0), certain = c(0, 
    0, 0, 1, 0), chang = c(0, 0, 1, 0, 1), claim = c(1, 0, 0, 
    0, 0), control = c(0, 0, 0, 0, 0), cours = c(1, 0, 0, 0, 
    0), doesnt = c(0, 0, 0, 0, 0), done = c(0, 0, 1, 0, 0), dont = c(1, 
    0, 0, 0, 0), either = c(0, 0, 0, 0, 0), els = c(0, 0, 0, 
    0, 0), end = c(0, 0, 0, 1, 0), enough = c(0, 0, 0, 0, 0), 
    ever = c(0, 0, 0, 0, 0), everi = c(0, 0, 0, 1, 0), fact = c(1, 
    0, 0, 0, 0), far = c(0, 0, 0, 0, 0), find = c(0, 0, 0, 0, 
    0), game = c(0, 0, 0, 0, 0), general = c(0, 0, 0, 0, 0), 
    give = c(2, 0, 0, 0, 0), good = c(3, 0, 0, 1, 0), govern = c(0, 
    0, 0, 0, 0), happen = c(1, 0, 0, 0, 0), hard = c(0, 0, 0, 
    0, 0), hope = c(0, 0, 0, 0, 0), institut = c(0, 1, 0, 1, 
    1), isnt = c(0, 0, 0, 0, 0), just = c(0, 1, 0, 0, 0), keep = c(0, 
    0, 0, 0, 0), law = c(0, 0, 0, 0, 0), least = c(0, 0, 0, 0, 
    0), let = c(1, 0, 0, 0, 0), littl = c(0, 0, 0, 0, 0), long = c(2, 
    0, 0, 0, 0), made = c(0, 0, 1, 0, 0), mayb = c(0, 0, 0, 0, 
    0), mean = c(0, 0, 0, 0, 0), messag = c(0, 0, 0, 0, 0), might = c(0, 
    0, 0, 0, 0), much = c(0, 0, 0, 0, 0), need = c(0, 0, 0, 0, 
    0), note = c(0, 0, 0, 0, 0), noth = c(1, 0, 0, 0, 0), now = c(0, 
    0, 0, 0, 1), order = c(0, 0, 0, 0, 0), other = c(0, 0, 2, 
    0, 0), part = c(0, 0, 0, 0, 0), play = c(0, 0, 0, 0, 0), 
    pleas = c(0, 0, 0, 0, 0), point = c(0, 0, 0, 0, 0), power = c(1, 
    0, 0, 0, 0), problem = c(0, 0, 0, 0, 0), public = c(0, 0, 
    0, 0, 0), put = c(0, 0, 0, 0, 0), question = c(0, 0, 0, 0, 
    0), real = c(0, 0, 0, 0, 0), realli = c(0, 0, 0, 0, 0), requir = c(0, 
    1, 0, 0, 0), respons = c(0, 0, 0, 0, 0), run = c(0, 0, 0, 
    0, 0), said = c(1, 0, 0, 0, 0), scienc = c(0, 0, 0, 0, 0), 
    seen = c(0, 0, 0, 0, 0), sever = c(0, 0, 0, 0, 0), show = c(1, 
    0, 0, 0, 0), sinc = c(0, 1, 0, 0, 0), someon = c(0, 0, 0, 
    0, 0), someth = c(1, 0, 0, 0, 0), start = c(0, 0, 0, 0, 0
    ), still = c(0, 0, 0, 0, 0), support = c(0, 0, 0, 0, 0), 
    talk = c(0, 0, 0, 0, 1), tell = c(0, 0, 0, 0, 0), thank = c(0, 
    0, 0, 0, 0), that = c(0, 0, 0, 0, 0), thing = c(0, 0, 0, 
    0, 0), though = c(0, 0, 0, 0, 0), two = c(0, 0, 0, 0, 0), 
    want = c(1, 0, 0, 0, 0), year = c(0, 0, 0, 0, 1), yes = c(0, 
    0, 0, 1, 0), bad = c(0, 0, 2, 0, 0), xnewsread = c(0, 0, 
    0, 0, 0), research = c(0, 0, 0, 0, 0), interest = c(1, 0, 
    0, 0, 0), lot = c(1, 0, 0, 0, 0), didnt = c(0, 1, 0, 0, 0
    ), nntppostinghost = c(0, 1, 0, 1, 1), name = c(0, 0, 2, 
    0, 0), ive = c(0, 0, 0, 0, 0), never = c(0, 0, 0, 0, 0), 
    inc = c(0, 0, 0, 0, 0), comput = c(0, 0, 0, 0, 0), replyto = c(0, 
    0, 0, 0, 0), email = c(0, 0, 0, 0, 0), program = c(0, 0, 
    0, 0, 0), servic = c(0, 0, 0, 0, 0), bit = c(0, 0, 0, 0, 
    0), dept = c(0, 0, 0, 0, 0), come = c(0, 0, 0, 0, 0), access = c(0, 
    0, 0, 0, 0), got = c(0, 0, 0, 0, 0), yet = c(0, 0, 0, 0, 
    0), suggest = c(0, 0, 0, 0, 0), engin = c(0, 0, 0, 0, 0), 
    last = c(0, 0, 0, 0, 0), your = c(0, 0, 0, 0, 0), next. = c(0, 
    0, 0, 0, 0), david = c(0, 0, 0, 0, 0), internet = c(0, 0, 
    0, 0, 0), depart = c(0, 0, 0, 0, 0), softwar = c(0, 0, 0, 
    0, 0), center = c(0, 0, 0, 0, 0), window = c(0, 0, 0, 0, 
    0), Negative = structure(c(2L, 1L, 2L, 1L, 2L), .Label = c("FALSE", 
    "TRUE"), class = "factor")), row.names = c("6", "7", "8", 
"9", "11"), class = "data.frame") 

Данные тестирования соответствуют той же схеме!

1 Ответ

0 голосов
/ 04 мая 2020

Хорошо. Это задача отладки, но я думаю, что нашел ошибку. Ваши определения

unique.feature.values <- unique(feature.values)
likelihood.matrix     <- matrix(rep(NA), 
                                nrow=dim(priors)[1], 
                                ncol=length(unique.feature.values))

Позвольте

> unique.feature.values
[1] 4 0

и

> length(unique.feature.values)
[1] 2

В этом случае ваша матрица likelihood.matrix имеет

> dim(likelihood.matrix)
[1] 1 2

Мы игнорируем dim(priors)[1] часть в этой точке и предполагаем, что она 1. Ваш второй для -l oop определяется как

for (item in unique.feature.values){
      [ some code ]
      likelihood.matrix[, item] <- likelihood.item
    }

, и это точка, где он ломается. item является элементом unique.feature.values. Таким образом, как предполагается выше, первый элемент unique.feature.values равен 4. Но likelihood.matrix имеет размеры 1x2, поэтому

likelihood.matrix[,item] <- likelihood.item

приводит к

likelihood.matrix[,4] <- likelihood.item

, что выходит за пределы. Либо вы должны определить likelihood.matrix с помощью nrow = max(unique.feature.values), либо вы должны изменить свой второй для -l oop на

for (item in 1:length(unique.feature.values)) { }
...