Я работаю над анализом тональности текстов на хинди: программа должна принимать файл с новостными статьями и определять, положительно или отрицательно они отзываются о своём предмете. Я использую методы NLP и наивный байесовский классификатор.
Мой код работает слишком долго даже при проверке одного слова:
# Build the labelled training data: one (tokens, label) pair per input line.
# Tokens shorter than three characters are dropped; the rest are lower-cased.
sentiment_word = []

# Positive dataset. The `with` block closes the file once its lines have been
# read, so the handle is no longer left open for the rest of the run.
sentiment = "positive"
with codecs.open("pos_train.txt", 'r', 'utf-8') as pos_words:
    for words in pos_words.readlines():
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        sentiment_word.append((words_filtered, sentiment))
# Dump what has been collected so far (positive entries only at this point).
# Writing once after the loop gives the same final file as rewriting it on
# every line would, without the repeated work.
with codecs.open('positive_token.txt', 'w', 'utf-8') as output:
    output.write(str(sentiment_word))

# Negative dataset, appended to the same list.
sentiment = "negative"
with codecs.open("neg_train.txt", 'r', 'utf-8') as neg_words:
    for words in neg_words.readlines():
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        sentiment_word.append((words_filtered, sentiment))
# NOTE(review): sentiment_word is shared, so this dump contains the positive
# entries as well as the negative ones. Kept as-is; confirm whether a
# negative-only dump was intended.
with codecs.open('negative_token.txt', 'w', 'utf-8') as output:
    output.write(str(sentiment_word))
# getting word frequencies from the training data
def get_words_in_sentiment(sentiment_word):
    """Flatten labelled samples into one list of tokens.

    Args:
        sentiment_word: iterable of ``(tokens, label)`` pairs.

    Returns:
        A list holding every token of every pair, in input order.
        The labels are ignored.
    """
    return [token for tokens, _label in sentiment_word for token in tokens]
def get_word_features(wordlist):
    """Return the distinct words of *wordlist* as feature names.

    Args:
        wordlist: iterable of tokens; duplicates are allowed.

    Returns:
        The keys of an ``nltk.FreqDist`` built from *wordlist*, i.e. one
        entry per distinct token.
    """
    # Use a separate name instead of rebinding the parameter to a FreqDist.
    frequencies = nltk.FreqDist(wordlist)
    return frequencies.keys()
# Vocabulary built from every token in the training data; extract_features
# iterates over it to produce one feature per vocabulary word.
word_features = get_word_features(get_words_in_sentiment(sentiment_word))
# building a feature extractor
def extract_features(document):
    """Build the presence/absence feature dict for one tokenised document.

    Args:
        document: iterable of hashable tokens.

    Returns:
        A dict with one ``'contains(<word>)'`` key per entry of the
        module-level ``word_features``; the value is True when that word
        occurs in *document* and False otherwise.
    """
    # A set makes each membership test O(1) instead of scanning the document.
    document_words = set(document)
    return {
        'contains(%s)' % word: (word in document_words)
        for word in word_features
    }
# Build the training set by handing extract_features and the labelled samples
# in sentiment_word to nltk.classify.apply_features.
training_set = nltk.classify.apply_features(extract_features, sentiment_word)
# Train the Naive Bayes classifier on that training set.
# NOTE(review): this runs at module level, so the model is retrained on every
# execution before fn1 can classify anything. That is presumably the main cost
# behind slow single-sentence checks; consider persisting the trained model.
classifier = nltk.NaiveBayesClassifier.train(training_set)
def fn1(sentiment_word):
    """Classify one piece of text with the trained classifier.

    Args:
        sentiment_word: the raw text to classify, as a single string.
            Despite the name, this is not the training list of the same
            name at module level.

    Returns:
        The label returned by ``classifier.classify`` for the text.
    """
    # Normalise tokens exactly as the training data was normalised (minimum
    # length three, lower-cased), so input words can match the vocabulary.
    tokens = [e.lower() for e in sentiment_word.split() if len(e) >= 3]
    return classifier.classify(extract_features(tokens))
# Sample Hindi sentence used to try out the classifier.
text_message = 'बिहार निवासी 45 वर्षीय राकेश कुमार की घटनास्थल पर ही मृत्यु हो गई'
# Second sample sentence, currently disabled together with its calls below.
#text_message2 = 'मलाइका की निजी जानकारी लीक कर रहा ड्राइवर? नौकरी से निकाला'
# Classify the sample and keep the label returned by fn1.
print_s = fn1(text_message)
#prinst_f = fn1(text_message2)
# Print the predicted label.
print(print_s)
#print(prinst_f)
Можете ли вы предложить более подходящий метод или модуль для анализа тональности текстов на хинди?