I need to compute Precision, Recall, and F-measure for Naive Bayes and SVM sentiment classification. The code raises an error because a str object has no attribute 'copy'.
In the code, preprocessedTrainingSet holds the preprocessed training data and preprocessedTestSet holds the preprocessed test data.
word_features = buildVocabulary(preprocessedTrainingSet)
trainingFeatures=nltk.classify.apply_features(extract_features,preprocessedTrainingSet)
NBayesClassifier=nltk.NaiveBayesClassifier.train(trainingFeatures)
accuracy = nltk.classify.util.accuracy(NBayesClassifier, preprocessedTestSet) # this line raises the error
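If I read the NLTK docs correctly, accuracy() expects (featureset, label) pairs, and NaiveBayesClassifier calls .copy() on each featureset, which would explain why a plain text string triggers this error. A minimal sketch of what I think it expects (testFeatures is just a name I introduce here):

# Sketch: wrap the test set with the same feature extractor used for training,
# so accuracy() receives (featureset, label) pairs instead of (text, label) pairs.
testFeatures = nltk.classify.apply_features(extract_features, preprocessedTestSet)
accuracy = nltk.classify.util.accuracy(NBayesClassifier, testFeatures)
print(accuracy * 100)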
I am posting my full code here:
import csv
import re
import itertools
import sys
import os
import datetime
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from bs4 import BeautifulSoup
import emoji

nltk.download('punkt')
nltk.download('stopwords')
def load_dict_smileys():
    return {
        ":‑)": "smiley",
        ":-]": "smiley",
    }
def load_dict_contractions():
    return {
        "ain't": "is not",
        "amn't": "am not",
    }
def strip_accents(text):
    if 'ø' in text or 'Ø' in text:
        # Do nothing when finding ø
        return text
    text = text.encode('ascii', 'ignore')
    text = text.decode("utf-8")
    return str(text)
def buildTestSet():
    Test_data = []
    for line in open('Avengers.csv', 'r'):
        cells = line.split(",")
        Test_data.append(cells[1])
    return Test_data
testData = buildTestSet()
def buildTrainingSet(corpusFile):
    trainingDataSet = []
    with open(corpusFile, "rt", encoding="utf8") as csvFile:
        lineReader = csv.reader(csvFile, delimiter=',', quotechar="\"")
        for row in lineReader:
            trainingDataSet.append(row)
    return trainingDataSet
corpusFile = "trainingSet.csv"
trainingData = buildTrainingSet(corpusFile)
class PreProcessTweets:
    def __init__(self):
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER', 'URL'])

    def processTweets(self, list_of_tweets):
        processedTweets = []
        for tweet in list_of_tweets:
            if testD == 1:
                # test tweets are plain strings
                processedTweets.append((self._processTweet(tweet), tweet[3]))
            else:
                # training rows come from csv.reader
                processedTweets.append((self._processTweet(tweet[2]), tweet[3]))
        return processedTweets
    def _processTweet(self, tweet):
        tweet = BeautifulSoup(tweet, "html.parser").get_text()
        tweet = tweet.replace('\x92', "'")
        # Remove @mentions and hashtags
        tweet = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", tweet).split())
        # Remove URLs
        tweet = ' '.join(re.sub(r"(\w+:\/\/\S+)", " ", tweet).split())
        # Remove punctuation
        tweet = ' '.join(re.sub(r"[\.\,\!\?\:\;\-\=]", " ", tweet).split())
        # Lower case
        tweet = tweet.lower()
        # Expand contractions
        CONTRACTIONS = load_dict_contractions()
        tweet = tweet.replace("’", "'")
        words = tweet.split()
        reformed = [CONTRACTIONS[word] if word in CONTRACTIONS else word for word in words]
        tweet = " ".join(reformed)
        # Collapse characters repeated more than twice
        tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))
        # Replace smileys with words
        SMILEY = load_dict_smileys()
        words = tweet.split()
        reformed = [SMILEY[word] if word in SMILEY else word for word in words]
        tweet = " ".join(reformed)
        # Deal with emojis
        tweet = emoji.demojize(tweet)
        # Strip accents
        tweet = strip_accents(tweet)
        tweet = tweet.replace(":", " ")
        tweet = ' '.join(tweet.split())
        return tweet
testD = 0  # 0: training rows (CSV rows), 1: test tweets (plain strings)
tweetProcessor = PreProcessTweets()
preprocessedTrainingSet = tweetProcessor.processTweets(trainingData)
testD = 1
preprocessedTestSet = tweetProcessor.processTweets(testData)
def buildVocabulary(preprocessedTrainingData):
    all_words = []
    for (words, sentiment) in preprocessedTrainingData:
        all_words.extend(words)
    wordlist = nltk.FreqDist(all_words)
    word_features = wordlist.keys()
    return word_features
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in tweet_words)
    return features
word_features = buildVocabulary(preprocessedTrainingSet)
trainingFeatures = nltk.classify.apply_features(extract_features, preprocessedTrainingSet)
NBayesClassifier = nltk.NaiveBayesClassifier.train(trainingFeatures)
NBResultLabels = [NBayesClassifier.classify(extract_features(tweet[0])) for tweet in preprocessedTestSet]
if NBResultLabels.count('positive') > NBResultLabels.count('negative'):
    print("Overall Positive Sentiment")
    print("Positive Sentiment Percentage = " + str(100 * NBResultLabels.count('positive') / len(NBResultLabels)) + "%")
else:
    print("Overall Negative Sentiment")
    print("Negative Sentiment Percentage = " + str(100 * NBResultLabels.count('negative') / len(NBResultLabels)) + "%")
accuracy = nltk.classify.util.accuracy(NBayesClassifier, preprocessedTestSet)
print(accuracy*100)
The result should come out like this:
             precision    recall  f1-score   support

          0       0.65      1.00      0.79        17
          1       0.57      0.75      0.65        16
          2       0.33      0.06      0.10        17

avg / total       0.52      0.60      0.51        50
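The table above looks like the output of sklearn's classification_report. Here is a minimal sketch of how I think the precision/recall/F-measure report could be produced for the Naive Bayes predictions, assuming the test set actually carries gold labels (goldLabels is a name I introduce here, not something in my code):

from sklearn.metrics import classification_report, precision_recall_fscore_support

# Hypothetical gold labels for the test tweets; NBResultLabels are the predictions computed above.
goldLabels = [label for (text, label) in preprocessedTestSet]
print(classification_report(goldLabels, NBResultLabels))

# Or just the macro-averaged numbers:
precision, recall, f1, _ = precision_recall_fscore_support(goldLabels, NBResultLabels, average='macro')
print(precision, recall, f1)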