Это даст вам положительную, нейтральную или отрицательную оценку тональности для каждой строки комментариев в столбце датафрейма. Здесь много кода предварительной обработки: очистка текста, фильтрация стоп-слов, построение простых графиков и т. д.
import pickle
import pandas as pd
import numpy as np
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
global str
df = pd.read_csv('C:\\your_path\\test_dataset.csv')

# Let's experiment with some sentiment analysis concepts.
# First, clean up the two free-text columns of the DataFrame we are working with.
#
# Every substitution passes regex=True explicitly: since pandas 2.0 the default
# for Series.str.replace is regex=False, which would treat each pattern as a
# literal string and silently leave the text untouched.
for column in ('body', 'review_text'):
    cleaned = df[column].astype(str)
    # get rid of digits
    cleaned = cleaned.str.replace(r'\d+', '', regex=True)
    # get rid of special characters (anything that is neither a word character nor whitespace)
    cleaned = cleaned.str.replace(r'[^\w\s]+', '', regex=True)
    # get rid of double spaces: collapse every run of whitespace into one space.
    # (The previous pattern r'\^[a-zA-Z]\s+' required a literal caret, which the
    # step above has already removed, so it could never match.)
    cleaned = cleaned.str.replace(r'\s+', ' ', regex=True)
    # convert all case to lower
    df[column] = cleaned.str.lower()
# The wording of 'body' and 'review_text' (two columns of the DataFrame) looks
# alike, so measure how closely the two columns match.
# Original observation: the tone is similar, but the text matches at a low
# rate (less than 20%).
import difflib

body_list = df['body'].tolist()
review_text_list = df['review_text'].tolist()
body, reviews = body_list, review_text_list
# The matcher is given the two lists themselves, so each cell is one sequence element.
s = difflib.SequenceMatcher(isjunk=None, a=body, b=reviews).ratio()
print("ratio:", s, "\n")
# Filter out stop words: the most common words such as "the", "a", and "is".
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

english_stopwords = stopwords.words('english')

# Join the documents with spaces instead of calling str() on the list:
# str(list) produces the list's repr, so brackets, quotes and escaped
# sequences such as a literal backslash-n would be tokenised as if they were
# part of the text.
text = ' '.join(body_list)

# split into words
tokens = word_tokenize(text)
# convert to lower case
tokens = [w.lower() for w in tokens]
# remove punctuation from each word
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]
# filter out stop words (a set gives constant-time membership tests)
stop_words = set(english_stopwords)
words = [w for w in words if w not in stop_words]
# plot most frequently occurring words in a bar chart
# Remove unwanted characters, numbers and symbols: every character other than
# an ASCII letter or '#' is replaced by a space.
# regex=True is required; with the pandas >= 2.0 default (regex=False) the
# character class would be searched for as a literal string.
df['review_text'] = df['review_text'].str.replace(r"[^a-zA-Z#]", " ", regex=True)
# Next, remove the stopwords and the short words from the reviews.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# function to remove stopwords
def remove_stopwords(rev, exclude=None):
    """Return the words of *rev* joined by single spaces, minus excluded words.

    Args:
        rev: iterable of word strings (the caller passes ``text.split()``).
        exclude: container of words to drop. When omitted, the module-level
            ``stop_words`` collection is used, which is the original behaviour.

    Returns:
        A single string with the remaining words separated by one space;
        the empty string when nothing remains.
    """
    if exclude is None:
        exclude = stop_words
    return " ".join(word for word in rev if word not in exclude)
# remove short words: only words longer than 2 characters are kept
df['review_text'] = df['review_text'].apply(
    lambda text: ' '.join([word for word in text.split() if len(word) > 2])
)
# remove stopwords from each review and make the result lowercase in one pass
reviews = [remove_stopwords(review.split()).lower() for review in df['review_text']]
# Plot the most frequent words again to see whether more significant words
# have come out (freq_words is defined outside this section).
freq_words(reviews, 35)
# Tf-idf is a very common technique for determining roughly what each document in a set of
# documents is "about". It looks at two simple metrics: tf (term frequency) and idf
# (inverse document frequency). Term frequency is the proportion of occurrences of a
# specific term to the total number of terms in a document. Inverse document frequency
# is the inverse of the proportion of documents that contain that word/phrase.
# The general idea is that if a specific phrase appears many times in a given document,
# but not in many other documents, then the phrase is important in distinguishing that
# document from all the others.
# Starting with the CountVectorizer/TfidfTransformer approach...
from itertools import islice

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

cvec = CountVectorizer(stop_words='english', min_df=1, max_df=.5, ngram_range=(1,2))
# Calculate all the n-grams found in all documents.
# The vectorizer has to be fitted first: vocabulary_ does not exist on an
# unfitted CountVectorizer, so reading it straight after construction fails.
cvec.fit(body_list)
# peek at the first 20 (term, column index) pairs of the learned vocabulary
list(islice(cvec.vocabulary_.items(), 20))
# A moment on these parameters, as they are the primary levers for adjusting the
# feature set we end up with. First is "min_df", the minimum document frequency: the
# minimum number of documents a term must appear in. It is either an integer (an
# absolute count) or a decimal between 0 and 1 (a proportion of all documents).
# Next is "max_df", which similarly caps the number of documents a term may appear in.
# If 90% of documents contain the word "spork" then it is so common that it is not very useful.
# Initialize the vectorizer with new settings.
cvec = CountVectorizer(stop_words='english', min_df=.0025, max_df=.5, ngram_range=(1,2))
# Transform the documents into a "bag of words" representation: one column per term
# holding its count within each document. The vectorizer above is brand new, so it
# must learn its vocabulary here; fit_transform does both steps, whereas a bare
# transform() on an unfitted vectorizer raises NotFittedError.
cvec_counts = cvec.fit_transform(body_list)
# Look at how many nonzero values the representation has. The figure labelled
# "sparsity" is the percentage of non-zero cells, so a lower number means a
# sparser matrix, which is more challenging to model.
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
print('sparsity: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))
# get counts of frequently occurring terms; top 20
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    count_terms = list(cvec.get_feature_names_out())
else:
    count_terms = cvec.get_feature_names()
counts_df = pd.DataFrame({'term': count_terms, 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)
# Now that we have term counts for each document, use the TfidfTransformer to
# calculate the weight of each term in each document.
transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(cvec_counts)
# Take a look at the top 20 terms by average tf-idf weight.
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    weight_terms = list(cvec.get_feature_names_out())
else:
    weight_terms = cvec.get_feature_names()
weights_df = pd.DataFrame({'term': weight_terms, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)
# Sentiment analysis: score the tone of the text in every 'review_text' record so
# that it can later be distilled into positive, neutral, or negative.
# Caveat from the original notes: results can be non-intuitive. For instance, a
# record that literally says 'no issues' is probably positive, but the word 'no'
# can make the algorithm read the comment as negative. To be revisited later.
# NOTE(review): the original notes also mention keeping only records with a
# negative score above .2; no such filter is visible in this section - confirm.
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
# store the polarity_scores result for each review in a new 'sentiment' column
df['sentiment'] = df['review_text'].apply(sid.polarity_scores)
def convert(x, negative_below=0, positive_above=.2):
    """Map a numeric sentiment score to a text label.

    Args:
        x: the score to classify.
        negative_below: scores strictly below this value are "negative".
        positive_above: scores strictly above this value are "positive".

    Returns:
        "negative", "positive", or "neutral" for everything in between
        (both boundary values themselves count as neutral).
    """
    if x < negative_below:
        return "negative"
    if x > positive_above:
        return "positive"
    return "neutral"
# label every record from the 'compound' entry of its sentiment scores
df['result'] = df['sentiment'].apply(lambda scores: convert(scores['compound']))
# Raw counts per brand and label are available via
# df.groupby(['brand', 'result']).size() if needed.
# Share of each label within every brand. (An earlier grouping by
# ['review_text', 'brand'] was overwritten immediately and has been dropped.)
x = df.groupby(['brand'])['result'].value_counts(normalize=True)
# keep only the 'negative' rows; index level 1 holds the label
y = x.loc[(x.index.get_level_values(1) == 'negative')]
brand result
ABH negative 0.500000
Alexander McQueen negative 0.500000
Anastasia negative 0.498008
BURBERRY negative 0.248092
Beats negative 0.272947
Bowers & Wilkins negative 0.500000
Breitling Official negative 0.666667
Capri Blue negative 0.333333
FERRARI negative 1.000000
Fendi negative 0.283582
GIORGIO ARMANI negative 1.000000
Jan Marini Skin Research negative 0.250000
Jaybird negative 0.235294
LANCÔME negative 0.500000
Longchamp negative 0.271605
Longchamps negative 0.500000
M.A.C negative 0.203390
Meaningful Beauty negative 0.222222
Polk Audio negative 0.256410
Pumas negative 0.222222
Ralph Lauren Polo negative 0.500000
Roberto Cavalli negative 0.250000
Samsung negative 0.332298
T3 Micro negative 0.224138
Too Faced negative 0.216216
VALENTINO by Mario Valentino negative 0.333333
YSL negative 0.250000
Не стесняйтесь пропускать вещи, которые вы считаете неуместными, но на самом деле код выполняет довольно полный анализ НЛП.
Также взгляните на эти две ссылки.