Запускаю анализ тональности текстов на испанском и английском языках; изначально код работал, но теперь выдаёт ошибку, связанную с ASCII.
Я заново установил Anaconda и попытался запустить свой код на Python 2.7. Посмотрел другие ответы, в которых предлагалось добавить в начало файла объявление кодировки, например: `# -*- coding: utf-8 -*-`, `# -*- encoding: utf-8 -*-`, `# coding: latin-1`, `# coding=<encoding name>`.
# -*- coding: utf-8 -*-
# -*- encoding: utf-8 -*-
# coding: latin-1
# coding=<encoding name>
"""
Created on Tue Sep 04 09:34:17 2018
@author: skhan
"""
import string;
from string import punctuation
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import io
import csv
def strip(text):
    """Filter stop words, punctuation marks and Twitter filler out of `text`.

    `text` is iterated item by item (callers in this file pass a list of
    stemmed tokens). An item is dropped when it equals an NLTK English stop
    word, a single character of ``string.punctuation``, or one of the
    tokens 'rt' / 'via'. The surviving items are returned as a new list in
    their original order.
    """
    ignored = stopwords.words('english') + list(string.punctuation) + ['rt', 'via']
    kept = []
    for term in text:
        if term in ignored:
            continue
        kept.append(term)
    return kept
def removeNonAscii(s):
    """Return `s` without the characters whose ordinal is 128 or higher."""
    ascii_chars = [ch for ch in s if ord(ch) < 128]
    return "".join(ascii_chars)
def flattern(A):
    """Flatten arbitrarily nested lists inside `A` into one flat list.

    Only items that are `list` instances are descended into; every other
    item (tuples and strings included) is kept as a single element.
    Element order is preserved.
    """
    flat = []
    for item in A:
        flat += flattern(item) if isinstance(item, list) else [item]
    return flat
def lowercase(splitted):
    """Return a new list holding the lower-cased form of every word in `splitted`."""
    return [word.lower() for word in splitted]
def strip_punctuation(s):
    """Return `s` with every character found in `string.punctuation` removed."""
    kept = [ch for ch in s if ch not in punctuation]
    return ''.join(kept)
def remove(text, symbol):
    """Drop every whitespace-separated word of `text` that starts with `symbol`.

    The text is split on runs of whitespace, words whose first character
    equals `symbol` are discarded, and the rest are re-joined with single
    spaces (so repeated or surrounding whitespace is collapsed).

    The original body also evaluated a `filter(...)` hard-coded to '#' and
    threw the result away; that dead statement has been removed.
    """
    kept = [word for word in text.split() if word[0] != symbol]
    return " ".join(kept)
def desymbolize(text, symbol):
    """Drop every whitespace-separated word of `text` that starts with `symbol`.

    Words are obtained with `text.split()`, those whose first character
    equals `symbol` are discarded, and the remainder is joined with single
    spaces. The whole word is removed, not just the leading symbol.

    A leftover `filter(...)` call hard-coded to '#', whose result was never
    used, has been removed.
    """
    kept = [word for word in text.split() if word[0] != symbol]
    return " ".join(kept)
def noun_strip(text):
    """Return `text` with every token tagged 'NNP' by NLTK removed.

    The text is tokenised with `nltk.word_tokenize` and tagged with
    `nltk.pos_tag`; tokens whose tag is exactly 'NNP' are dropped and the
    remaining tokens are joined with single spaces.
    """
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    kept = [token for token, pair in zip(tokens, tagged) if pair[1] != 'NNP']
    return " ".join(kept)
# Module-level Porter stemmer. clean_two() reads this global, whereas
# cleansplit() and clean() each create their own local instance.
stemmer = PorterStemmer();
def cleansplit(tweet):
    """Clean a tweet using whitespace splitting and return the tokens re-joined.

    Steps, in order: remove HTML-like tags, remove @mentions, remove links,
    split on whitespace, pass the words through `hashing`, lower-case,
    Porter-stem, and filter with `strip`. The result is one space-separated
    string.

    The original body first called `nltk.word_tokenize` and immediately
    overwrote its result with `linked.split()`; that wasted tokenisation
    pass has been removed, so the returned value is unchanged.
    """
    TAG_RE = re.compile(r'<[^>]+>')
    AT_RE = re.compile(r'@[A-Za-z0-9]+')
    stemmer = PorterStemmer()

    html = TAG_RE.sub('', tweet)
    at_sign = AT_RE.sub('', html)
    linked = re.sub(r"http\S+", "", at_sign)

    # Plain whitespace split: punctuation stays attached to its word.
    splitted = linked.split()
    # NOTE(review): `hashing` is not defined in the visible part of this
    # file -- confirm it exists elsewhere, otherwise this raises NameError.
    hashed = hashing(splitted)
    lowered = lowercase(hashed)
    stems = [stemmer.stem(word) for word in lowered]
    stripped = strip(stems)
    return " ".join(stripped)
def clean_two(tweet):
    """Clean one raw tweet and return its stemmed tokens joined by spaces.

    Steps, in order: blank out literal backslash-u escape text and literal
    backslash-n sequences, drop non-ASCII characters, drop words starting
    with '@', remove HTML-like tags and links, tokenise with NLTK,
    lower-case, Porter-stem (using the module-level `stemmer`), and filter
    with `strip`.

    Bug fix: the original ran its first substitution on the global
    `line[3]` instead of the `tweet` argument. The function therefore
    ignored its parameter, and raised NameError when no global `line`
    existed.
    """
    # Greedy: consumes every alphanumeric character after the backslash-u,
    # unlike the fixed four-character pattern used in clear().
    UNI_RE = re.compile(r'\\u[A-Za-z0-9]+')
    TAG_RE = re.compile(r'<[^>]+>')

    tweet = UNI_RE.sub(' ', tweet)
    tweet = tweet.replace('\\n', ' ')
    tweet = removeNonAscii(tweet)
    tweet = remove(tweet, '@')
    tweet = TAG_RE.sub('', tweet)
    linked = re.sub(r"http\S+", "", tweet)

    splitted = nltk.word_tokenize(linked)
    lowered = lowercase(splitted)
    stems = [stemmer.stem(word) for word in lowered]
    stripped = strip(stems)
    return " ".join(stripped)
def clean(tweet):
    """Clean a tweet, dropping hashtags entirely, and return the tokens re-joined.

    HTML-like tags, @mentions, whole hashtags and links are removed in that
    order; the remainder is tokenised with NLTK, lower-cased, Porter-stemmed
    and filtered with `strip`. Returns one space-separated string.
    """
    # Same patterns and order as before: tags, mentions, hashtags, links.
    text = re.sub(r'<[^>]+>', '', tweet)
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = re.sub(r'#[A-Za-z0-9]+', '', text)
    text = re.sub(r"http\S+", "", text)

    porter = PorterStemmer()
    tokens = lowercase(nltk.word_tokenize(text))
    stemmed = [porter.stem(token) for token in tokens]
    return " ".join(strip(stemmed))
def clear(tweet):
    """Lightly normalise a raw tweet string without tokenising or stemming it.

    Steps, in order: replace literal backslash-u plus four alphanumerics
    with a space; replace 'RT ', double quotes, literal backslash-n and any
    remaining backslash with a space; drop non-ASCII characters; remove
    HTML-like tags and links; turn '&' into 'and'; delete leftover 'http';
    replace '/' with a space. Returns the resulting string.

    Bug fix: the original ran its first substitution on the global
    `line[3]` instead of the `tweet` argument. It only worked because the
    script happened to call it as `clear(line[3])` inside its CSV loop.
    """
    UNI_RE = re.compile(r'\\u[A-Za-z0-9][A-Za-z0-9][A-Za-z0-9][A-Za-z0-9]')
    TAG_RE = re.compile(r'<[^>]+>')

    tweet = UNI_RE.sub(' ', tweet)
    tweet = tweet.replace('RT ', ' ')
    tweet = tweet.replace('"', ' ')
    # Literal backslash-n must go before the bare-backslash replacement,
    # otherwise a stray 'n' would be left behind.
    tweet = tweet.replace('\\n', ' ')
    tweet = tweet.replace('\\', ' ')
    tweet = removeNonAscii(tweet)
    tweet = TAG_RE.sub('', tweet)
    tweet = re.sub(r"http\S+", "", tweet)
    tweet = re.sub(r"&", "and", tweet)
    tweet = re.sub("http", "", tweet)
    tweet = tweet.replace('/', ' ')
    return tweet
# Characters that XML 1.0 does not allow: C0 controls other than tab/LF/CR,
# the non-characters U+FFFE/U+FFFF, and unpaired surrogate code units.
# The surrogate ranges are written as literal escapes because chr(0xd800)
# raises ValueError on Python 2. Lookarounds are used so that only the lone
# surrogate is removed, not the valid character next to it.
_XML_ILLEGAL_RE = re.compile(
    u'[\u0000-\u0008\u000b-\u000c\u000e-\u001f\ufffe-\uffff]'
    u'|[\ud800-\udbff](?![\udc00-\udfff])'
    u'|(?<![\ud800-\udbff])[\udc00-\udfff]'
)
# ASCII control characters 0x01-0x1F (this includes tab, LF and CR) and DEL.
_ASCII_CONTROL_RE = re.compile(r"[\x01-\x1F\x7F]")


def strip_control_characters(input):
    """Remove XML-illegal and ASCII control characters from `input`.

    Falsy values (None, empty string) are returned unchanged. Otherwise
    two passes are applied: first the XML-illegal characters (including
    unpaired surrogates), then ASCII controls 0x01-0x1F and 0x7F. Tab,
    newline and carriage return are removed by the second pass.

    Fixes over the original:
    - works on Python 2, where `chr(0xd800)` raised ValueError;
    - a lone surrogate no longer takes its neighbouring character with it;
    - both patterns are compiled once instead of being rebuilt per call.
    """
    if input:
        input = _XML_ILLEGAL_RE.sub("", input)
        input = _ASCII_CONTROL_RE.sub("", input)
    return input
from langdetect import detect
from classifier import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# VADER analyser from NLTK; the CSV loop in this script uses it for every
# tweet that is not detected as Spanish.
analyser = SentimentIntensityAnalyzer()
# Classifier used for tweets detected as Spanish. `SentimentClassifier` is
# presumably supplied by the star import from `classifier` -- verify.
clf = SentimentClassifier()
# Collected labels, pre-seeded with the string '5'. NOTE(review): presumably
# a placeholder for the first data row, which the loop's `i>1` guard never
# classifies -- confirm, since every later entry is the integer -1 or 1.
sentiments=['5']
# State name interpolated into the input and output CSV paths.
state="Ohio"
def Spanish_Sent(p):
    """Map a score `p` to a label: -1 when p <= 0.5, otherwise 1."""
    return -1 if p <= 0.5 else 1
def English_Sent(p):
    """Map a score mapping with 'neg' and 'pos' keys to a label.

    Returns -1 when the 'neg' score is greater than or equal to the 'pos'
    score (ties count as negative), otherwise 1.
    """
    if p['neg'] >= p['pos']:
        return -1
    return 1
# Classify column 3 of the rows of the state's tweet CSV and append one
# label (-1 or 1) per classified row to `sentiments`.
# NOTE(review): indentation was lost in this copy of the script. The nesting
# of the statements below -- in particular whether `print(i)` sits inside the
# `if i>1` branch, and that `i=i+1` runs once per row -- must be confirmed
# against the original before running.
i=0;
with open("AdvisorData/SentimentAnalysis/2016_Tweets_"+state+".csv") as csvfile:
readCSV = csv.reader(csvfile, delimiter=',')
for line in readCSV:
# Rows seen while i is 0 or 1 are not classified; presumably the header row
# and the first data row -- TODO confirm.
if i>1:
# Normalise the tweet text held in the fourth CSV column.
t=clear(line[3])
# Language code reported by langdetect for the cleaned text.
g=detect(t)
if g=='es':
# Spanish: score with the `classifier` model, then threshold to -1/1.
p=clf.predict(t)
sent=Spanish_Sent(p)
sentiments.append(sent)
else:
# Any other detected language is scored with VADER and compared neg vs pos.
p=analyser.polarity_scores(t)
sent=English_Sent(p)
sentiments.append(sent)
# Progress output: the current row counter.
print(i)
i=i+1
import pandas as pd
# Re-read the same input CSV as a DataFrame, attach the collected labels as
# a new 'Sentiments' column, and save the result under Labeled/.
df = pd.read_csv("AdvisorData/SentimentAnalysis/2016_Tweets_"+state+".csv")
# NOTE(review): this assignment needs one entry in `sentiments` per DataFrame
# row -- confirm the list length matches len(df).
df['Sentiments'] = sentiments
# index=None: presumably suppresses writing the index column -- confirm.
df.to_csv('AdvisorData/SentimentAnalysis/Labeled/2016_Tweets_'+state+'_Labeled.csv',index=None)
Код должен выдавать числовые оценки тональности; результаты добавляются отдельным столбцом во фрейм данных (DataFrame) с входными данными. Полученный фрейм данных сохраняется в CSV. Вместо этого возникает следующая ошибка:
File "C:\Users\skhan\Anaconda2\lib\site-packages\classifier\sentimentPipeline.py", line 114
word = word.replace('í','i')
^
SyntaxError: Non-ASCII character '\xc3' in file C:\Users\skhan\Anaconda2\lib\site-packages\classifier\sentimentPipeline.py on line 114, but no encoding declared; see http://python.org/dev/peps/pep-0263/ for details