Это ML-модель, которая по ключевым словам определяет, является ли сообщение владельца собаки оскорбительным (abusive).
В частности, я не понимаю, как рассчитываются вероятности использования оскорбительных слов и что вообще происходит (я знаю, это звучит жалко).
Наконец, я не понимаю, зачем в приведённых ниже строках используется функция возведения в степень (power). Насколько я понял, вероятность должна рассчитываться по формуле
P(Abuse=yes|X) = P(X|Abuse=yes)P(Abuse=yes)
, но почему присутствует степенная функция?
p1 = prod(power(p1Vec, vecClassify)) * pAbusive
p0 = prod(power(p0Vec, vecClassify)) * (1.0 - pAbusive)
Это код всей программы
from numpy import *
#creates a list of all the words
def createVocabList(dataSet):
    """Return a list of the distinct words found across all documents.

    Args:
        dataSet: iterable of documents, each an iterable of hashable words.

    Returns:
        A list holding every distinct word exactly once. The order is
        whatever the underlying set yields and must not be relied upon.
    """
    vocabSet = set()
    for document in dataSet:
        # Grow the set in place rather than building a new union
        # (and copying everything seen so far) on every iteration.
        vocabSet.update(document)
    return list(vocabSet)
# creates a vector
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to vocabList[i].
        inputSet: iterable of words making up the document.

    Returns:
        A list of len(vocabList) integers: 1 where the vocabulary word
        occurs in inputSet, 0 elsewhere. Words missing from the vocabulary
        are reported on stdout and otherwise ignored.
    """
    # Build the word -> position lookup once instead of scanning the list
    # twice ("in" + "index") for every input word. setdefault keeps the
    # first position of a repeated word, exactly like list.index does.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            print("word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec
#loads the data
def loadDataSet():
    """Return the hard-coded toy corpus and its labels.

    Returns:
        (postingList, classVec): postingList is a list of six tokenised
        posts (lists of words); classVec is the parallel list of labels,
        where 1 is abusive and 0 is not.
    """
    # Each entry pairs a label with the words of one post.
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [words for _, words in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from labelled word vectors.

    Args:
        trainMatrix: 2-D sequence or array; row i is the word vector of
            training document i, one column per vocabulary word.
        trainCategory: sequence of labels parallel to trainMatrix; a
            label equal to 1 marks class 1 (abusive), anything else is
            treated as class 0.

    Returns:
        (p0Vect, p1Vect, pAbusive) where pXVect[j] is the summed count of
        word j in class X divided by the total word count of class X, and
        pAbusive is sum(trainCategory) / number of documents.

        If a class has a zero total word count (for example, no training
        document carries that label) its vector is all zeros instead of
        the NaNs a 0/0 division would produce.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    docs = asarray(trainMatrix, dtype=float)
    # Only the first numTrainDocs labels are paired with documents.
    isAbusive = asarray(trainCategory)[:numTrainDocs] == 1

    # Per-word counts for each class (numerators) and the total number
    # of word occurrences in each class (denominators).
    p1Num = docs[isAbusive].sum(axis=0)
    p0Num = docs[~isAbusive].sum(axis=0)
    p1Denom = p1Num.sum()
    p0Denom = p0Num.sum()

    # Guard the division: an empty class would otherwise yield NaN, and
    # any comparison against NaN downstream is silently False.
    p1Vect = p1Num / p1Denom if p1Denom else zeros(numWords)
    p0Vect = p0Num / p0Denom if p0Denom else zeros(numWords)
    return p0Vect, p1Vect, pAbusive
def classifyNB0(vecClassify, p0Vec, p1Vec, pAbusive):
    """Classify a word vector as abusive (1) or not (0).

    Each class score is prod(power(pVec, vecClassify)) times the class
    prior. The element-wise power acts as a selector: an exponent of 0
    turns the factor into 1 (word absent, no effect on the product) and
    an exponent of 1 leaves the word's probability in place, so the
    product runs only over words that occur in the document.

    Args:
        vecClassify: word vector of the document to classify.
        p0Vec: per-word probabilities for class 0.
        p1Vec: per-word probabilities for class 1.
        pAbusive: prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0
        score, otherwise 0. Intermediate class-1 values are printed.
    """
    # NOTE(review): a plain product of many probabilities may underflow
    # for long documents; summing logs is the usual remedy.
    abusiveFactors = power(p1Vec, vecClassify)
    abusiveLikelihood = prod(abusiveFactors)
    p1 = abusiveLikelihood * pAbusive

    print("vecClassify\n", vecClassify)
    print("p1Vec\n", p1Vec)
    print("power(p1Vec, vecClassify)\n", abusiveFactors)
    print("prod(power(p1Vec, vecClassify))\n", abusiveLikelihood)

    cleanFactors = power(p0Vec, vecClassify)
    p0 = prod(cleanFactors) * (1.0 - pAbusive)

    return 1 if p1 > p0 else 0
# Program starts here: build the vocabulary, vectorise the posts, train,
# then classify two sample entries.
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

# Case 1 is expected to print 0, case 2 is expected to print 1.
for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB0(thisDoc, p0V, p1V, pAb))