Как вычислить F-меру, Precision и Recall для Naive Bayes и SVM в NLTK. Ошибка: строковый объект не имеет атрибута copy - PullRequest
2 голосов
/ 26 марта 2019

Мне нужно вычислить Precision, F-меру и Recall для Naive Bayes и SVM при классификации настроений. Код возвращает мне ошибку «строковый объект не имеет атрибута copy». В коде preprocessedTrainingSet содержит обработанные данные обучения, а preprocessedTestSet — обработанный тестовый набор данных.

word_features = buildVocabulary(preprocessedTrainingSet)
trainingFeatures=nltk.classify.apply_features(extract_features,preprocessedTrainingSet)

NBayesClassifier=nltk.NaiveBayesClassifier.train(trainingFeatures)
# NOTE(review): accuracy() expects an iterable of (featureset, label) pairs,
# but preprocessedTestSet holds raw strings — hence the
# "'str' object has no attribute 'copy'" error on the next line.
accuracy = nltk.classify.util.accuracy(NBayesClassifier, preprocessedTestSet) #this returns error

Я выкладываю весь свой код здесь:

import csv
import re
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
import nltk 
import sys
import os
nltk.download('punkt')
import csv
import datetime
from bs4 import BeautifulSoup
import re
import itertools
import emoji

def load_dict_smileys():
    """Return a mapping from smiley emoticons to their word replacements."""
    smiley_map = {
        ":‑)": "smiley",
        ":-]": "smiley",
    }
    return smiley_map


def load_dict_contractions():
    """Return a mapping from English contractions to their expanded forms."""
    contraction_map = {
        "ain't": "is not",
        "amn't": "am not",
    }
    return contraction_map


def strip_accents(text):
    """Return *text* with accented letters replaced by their base letters.

    For example "café" -> "cafe". Texts containing 'ø'/'Ø' are returned
    unchanged (preserving the original special case: that letter has no
    combining-mark decomposition and would otherwise be lost).

    BUG FIX: the previous implementation used
    ``text.encode('ascii', 'ignore')`` which silently *deleted* every
    accented character ("café" -> "caf") instead of de-accenting it.
    """
    import unicodedata

    if 'ø' in text or 'Ø' in text:
        # Do nothing when finding ø (kept from the original behavior).
        return text
    # Decompose accented characters into base letter + combining mark,
    # then drop the combining marks.
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))



def buildTestSet():
    """Read 'Avengers.csv' and return the second column of every row.

    Returns
    -------
    list[str]
        The tweet text (column index 1) of each row.

    Fixes over the original:
    - uses the csv module instead of a naive ``line.split(",")``, so quoted
      fields containing commas are parsed correctly;
    - closes the file deterministically via a context manager;
    - skips malformed rows with fewer than two columns instead of raising
      IndexError.
    """
    test_data = []
    with open('Avengers.csv', 'r', encoding='utf8') as csv_file:
        for cells in csv.reader(csv_file):
            if len(cells) > 1:
                test_data.append(cells[1])
    return test_data


# Load the raw test tweets (second CSV column of Avengers.csv) at import time.
testData = buildTestSet()


def buildTrainingSet(corpusFile):
    """Load the labelled training corpus from a CSV file.

    Parameters
    ----------
    corpusFile : str
        Path to a comma-separated file with double-quoted fields.

    Returns
    -------
    list[list[str]]
        One list of column values per CSV row.

    Fix: removed the unused local ``corpus`` and replaced the manual
    append loop with a direct list() over the reader.
    """
    with open(corpusFile, "rt", encoding="utf8") as csvFile:
        lineReader = csv.reader(csvFile, delimiter=',', quotechar='"')
        return list(lineReader)

# Path to the labelled training corpus; loaded once at import time.
corpusFile = "trainingSet.csv"
trainingData = buildTrainingSet(corpusFile)


class PreProcessTweets:
    """Clean raw tweet text (HTML, @mentions/#hashtags, URLs, contractions,
    repeated characters, smileys, emoji, accents) before feature extraction.

    NOTE(review): ``processTweets`` reads the module-level global ``testD``
    set by the caller to decide the input shape — fragile; prefer passing a
    flag or a column index as a parameter.
    """

    def __init__(self):
        # Tokens to treat as noise: English stopwords, punctuation, and the
        # AT_USER / URL placeholder tokens.
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER','URL'])

    def processTweets(self, list_of_tweets):
        """Return a list of (cleaned_text, label) pairs.

        When the global ``testD`` == 1, each element of *list_of_tweets* is a
        raw tweet string; otherwise it is a row whose column 2 is the text
        and column 3 the sentiment label.
        """
        processedTweets=[]
        for tweet in list_of_tweets:

            if testD == 1:
                #print(tweet)
                # NOTE(review): here ``tweet`` is a plain string, so
                # ``tweet[3]`` is just its 4th *character*, not a label —
                # this looks like a bug; test tweets have no real label.
                processedTweets.append((self._processTweet(tweet),tweet[3]))
            else:
                # Labelled training row: column 2 = text, column 3 = label.
                processedTweets.append((self._processTweet(tweet[2]),tweet[3]))

        return processedTweets

    def _processTweet(self, tweet):
        """Normalize one tweet string and return the cleaned text."""
        # Strip HTML markup. NOTE(review): no explicit parser is passed to
        # BeautifulSoup, which emits a GuessedAtParserWarning on recent bs4.
        tweet = BeautifulSoup(tweet).get_text()
        # Windows-1252 right single quote -> ASCII apostrophe.
        tweet = tweet.replace('\x92',"'")
        # Remove @mentions and #hashtags.
        tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", tweet).split())
        # Remove URLs (scheme://...).
        tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " ", tweet).split())
        # Replace common punctuation with spaces.
        tweet = ' '.join(re.sub("[\.\,\!\?\:\;\-\=]", " ", tweet).split())
        #Lower case
        tweet = tweet.lower()
        # Expand contractions ("ain't" -> "is not") word by word.
        CONTRACTIONS = load_dict_contractions()
        tweet = tweet.replace("’","'")
        words = tweet.split()
        reformed = [CONTRACTIONS[word] if word in CONTRACTIONS else word for word in words]
        tweet = " ".join(reformed)
        # Collapse runs of 3+ identical characters to two ("coooool" -> "cool").
        tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))

        # Replace smiley emoticons with their word equivalents.
        SMILEY = load_dict_smileys()  
        words = tweet.split()
        reformed = [SMILEY[word] if word in SMILEY else word for word in words]
        tweet = " ".join(reformed)
        #Deal with emojis
        tweet = emoji.demojize(tweet)
        #Strip accents
        tweet= strip_accents(tweet)
        # demojize() produces ":name:" tokens; drop the colons.
        tweet = tweet.replace(":"," ")
        # Normalize whitespace.
        tweet = ' '.join(tweet.split())

        return tweet

# Module-level flag read by PreProcessTweets.processTweets:
# 0 -> labelled rows ([..., text, label]); 1 -> bare tweet strings.
# NOTE(review): communicating through a mutable global is fragile — a
# parameter on processTweets would be safer.
testD = 0
tweetProcessor = PreProcessTweets()
preprocessedTrainingSet = tweetProcessor.processTweets(trainingData)
testD = 1
preprocessedTestSet = tweetProcessor.processTweets(testData)




def buildVocabulary(preprocessedTrainingData):
    """Build the feature vocabulary from the preprocessed training tweets.

    Parameters
    ----------
    preprocessedTrainingData : list of (tweet_text, sentiment) pairs
        ``tweet_text`` is a whitespace-separated cleaned string.

    Returns
    -------
    The distinct words of the corpus (a FreqDist keys view), used by
    extract_features() as the feature vocabulary.
    """
    all_words = []

    for (words, sentiment) in preprocessedTrainingData:
        # BUG FIX: ``words`` is a single string; extend() on a string added
        # individual *characters*, producing a character-level vocabulary.
        # Split into tokens so the vocabulary contains actual words.
        all_words.extend(words.split())

    wordlist = nltk.FreqDist(all_words)
    word_features = wordlist.keys()

    return word_features

def extract_features(tweet, vocabulary=None):
    """Turn a preprocessed tweet string into an NLTK boolean feature dict.

    Parameters
    ----------
    tweet : str
        Whitespace-separated, preprocessed tweet text.
    vocabulary : iterable of str, optional
        Words to test for. Defaults to the module-level ``word_features``
        (backward compatible with the original one-argument call).

    Returns
    -------
    dict
        Maps 'contains(word)' -> True/False for every vocabulary word.
    """
    if vocabulary is None:
        # Fall back to the module-level vocabulary built by buildVocabulary().
        vocabulary = word_features
    # BUG FIX: ``set(tweet)`` on a string yields a set of *characters*, so a
    # multi-character word could never match. Split into word tokens first.
    tweet_words = set(tweet.split())
    features = {}
    for word in vocabulary:
        features['contains(%s)' % word] = (word in tweet_words)
    return features

# BUG FIX: the posted full listing never assigns the module-level
# ``word_features`` that extract_features() reads (it appears only in the
# question's excerpt), which guarantees a NameError. Build it here first.
word_features = buildVocabulary(preprocessedTrainingSet)

# Lazily map extract_features over every (tweet, label) training pair.
trainingFeatures = nltk.classify.apply_features(extract_features, preprocessedTrainingSet)

# Train the Naive Bayes classifier on the feature/label pairs.
NBayesClassifier = nltk.NaiveBayesClassifier.train(trainingFeatures)



# Classify every preprocessed test tweet with the trained classifier.
NBResultLabels = [NBayesClassifier.classify(extract_features(tweet[0])) for tweet in preprocessedTestSet]

# Report the majority sentiment and its percentage.
if NBResultLabels.count('positive') > NBResultLabels.count('negative'):
    print("Overall Positive Sentiment")
    print("Positive Sentiment Percentage = " + str(100*NBResultLabels.count('positive')/len(NBResultLabels)) + "%")
else: 
    print("Overall Negative Sentiment")
    print("Negative Sentiment Percentage = " + str(100*NBResultLabels.count('negative')/len(NBResultLabels)) + "%")

# BUG FIX for the reported "'str' object has no attribute 'copy'" error:
# nltk.classify.util.accuracy expects (featureset, label) pairs, but
# preprocessedTestSet holds (raw_string, label) pairs. Convert the raw
# strings to feature dicts with apply_features first.
# NOTE(review): the test-set "labels" are tweet[3] characters (see
# PreProcessTweets.processTweets), so this accuracy number is only
# meaningful once real test labels are supplied.
testFeatures = nltk.classify.apply_features(extract_features, preprocessedTestSet)
accuracy = nltk.classify.util.accuracy(NBayesClassifier, testFeatures)
print(accuracy*100)

Результат должен выглядеть примерно так:

             precision    recall  f1-score   support

          0       0.65      1.00      0.79        17
          1       0.57      0.75      0.65        16
          2       0.33      0.06      0.10        17
avg / total       0.52      0.60      0.51        50
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...