Non-ASCII character '\xc3' in sentimentPipeline.py
asked 24 December 2018

I am running text sentiment analysis on Spanish and English tweets. The script originally worked, but now it raises an ASCII problem.

I reinstalled Anaconda and tried to run my code on Python 2.7. I looked at other answers, which suggested adding an encoding declaration at the top of the file, such as:

       # -*- coding: utf-8 -*-
       # -*- encoding: utf-8 -*-
       # coding: latin-1
       # coding=<encoding name>
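Note: per PEP 263, the declaration only takes effect in the file that actually contains the non-ASCII bytes, and it must be the first or second line of that file; adding it to the calling script does not affect sentimentPipeline.py. A minimal sketch of what the top of that file would need (the path comes from the traceback at the end):

    # -*- coding: utf-8 -*-
    # The line above must be line 1 or 2 of
    # C:\Users\skhan\Anaconda2\lib\site-packages\classifier\sentimentPipeline.py,
    # not of the script that imports it.
    word = 'María'.replace('í', 'i')  # legal in Python 2 once the file declares utf-8
    print(word)                       # prints: Maria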



"""
Created on Tue Sep 04 09:34:17 2018

@author: skhan
"""
import string
from string import punctuation
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import io
import csv

def strip(text):
    """Drop English stopwords, punctuation tokens, and Twitter noise ('rt', 'via') from a token list."""
    punctuation = list(string.punctuation)
    stop = stopwords.words('english') + punctuation + ['rt', 'via']
    terms_stop = [term for term in text if term not in stop]
    return terms_stop

def removeNonAscii(s):
    """Keep only ASCII characters; every codepoint >= 128 is silently dropped."""
    return "".join(i for i in s if ord(i) < 128)


def flattern(A):
    """Recursively flatten a nested list (helper; name kept as in the original)."""
    rt = []
    for i in A:
        if isinstance(i, list):
            rt.extend(flattern(i))
        else:
            rt.append(i)
    return rt

def lowercase(splitted):
    """Lowercase every token in a list."""
    return [word.lower() for word in splitted]

def strip_punctuation(s):
    return ''.join(c for c in s if c not in punctuation)

def remove(text, symbol):
    """Drop every whitespace-separated token that starts with `symbol` (e.g. '@' or '#')."""
    return " ".join(filter(lambda x: x[0] != symbol, text.split()))

def desymbolize(text, symbol):
    """Identical to remove(); the duplicate definition is kept because both names exist in the original."""
    return remove(text, symbol)
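A quick illustration of the token stripping (made-up string):

    >>> remove("RT @user check this #tag out", '@')
    'RT check this #tag out'
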
def noun_strip(text):
    """Remove proper nouns (POS tag 'NNP') from a sentence."""
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    words = [token for token, tag in tagged if tag != 'NNP']
    return " ".join(words)

stemmer = PorterStemmer()  # module-level stemmer shared by the cleaners below


def cleansplit(tweet):
    """Variant cleaner that splits on whitespace instead of NLTK tokenization."""
    TAG_RE = re.compile(r'<[^>]+>')
    AT_RE = re.compile(r'@[A-Za-z0-9]+')
    html = TAG_RE.sub('', tweet)
    at_sign = AT_RE.sub('', html)
    linked = re.sub(r"http\S+", "", at_sign)
    splitted = linked.split()      # the original also ran nltk.word_tokenize here, then discarded the result
    hashed = hashing(splitted)     # NB: hashing() is not defined anywhere in this file
    lowered = lowercase(hashed)
    stems = [stemmer.stem(word) for word in lowered]
    stripped = strip(stems)
    return " ".join(stripped)


def clean_two(tweet):
    """Cleaner that also strips escaped unicode sequences and non-ASCII characters."""
    UNI_RE = re.compile(r'\\u[A-Za-z0-9]+')
    tweet = UNI_RE.sub(' ', tweet)   # was UNI_RE.sub(' ', line[3]), which ignored the argument and read the loop's global
    tweet = tweet.replace('\\n', ' ')
    tweet = removeNonAscii(tweet)
    tweet = remove(tweet, '@')       # debatable whether @-mentions should be removed
    TAG_RE = re.compile(r'<[^>]+>')
    tweet = TAG_RE.sub('', tweet)
    linked = re.sub(r"http\S+", "", tweet)
    splitted = nltk.word_tokenize(linked)
    lowered = lowercase(splitted)
    stems = [stemmer.stem(word) for word in lowered]
    stripped = strip(stems)
    result = " ".join(stripped)
    # result = noun_strip(result)    # seems best at the end, but be cautious
    return result



def clean(tweet):
    """Main cleaner: strip HTML tags, @-mentions, hashtags, and URLs, then tokenize, lowercase, stem, and drop stopwords."""
    TAG_RE = re.compile(r'<[^>]+>')
    AT_RE = re.compile(r'@[A-Za-z0-9]+')
    HASH_RE = re.compile(r'#[A-Za-z0-9]+')   # hashtag completely removed
    html = TAG_RE.sub('', tweet)
    at_sign = AT_RE.sub('', html)
    hashed = HASH_RE.sub('', at_sign)
    linked = re.sub(r"http\S+", "", hashed)
    splitted = nltk.word_tokenize(linked)
    lowered = lowercase(splitted)
    stems = [stemmer.stem(word) for word in lowered]
    stripped = strip(stems)
    return " ".join(stripped)


def clear(tweet):
    """Lightweight cleaner used in the main loop: strips escaped unicode, RT markers, quotes, backslashes, HTML tags, and URLs."""
    UNI_RE = re.compile(r'\\u[A-Za-z0-9]{4}')
    tweet = UNI_RE.sub(' ', tweet)   # was UNI_RE.sub(' ', line[3]); the function argument was being ignored
    tweet = tweet.replace('RT ', ' ')
    tweet = tweet.replace('"', ' ')
    tweet = tweet.replace('\\n', ' ')
    tweet = tweet.replace('\\', ' ')
    tweet = removeNonAscii(tweet)
    TAG_RE = re.compile(r'<[^>]+>')
    tweet = TAG_RE.sub('', tweet)
    tweet = re.sub(r"http\S+", "", tweet)
    tweet = re.sub(r"&amp;", "and", tweet)
    tweet = re.sub("http", "", tweet)
    tweet = tweet.replace('/', ' ')
    return tweet

def strip_control_characters(input):
    """Remove XML-illegal unicode ranges, lone surrogates, and ASCII control characters."""
    if input:
        # Unicode invalid characters. Note: the original built the surrogate
        # ranges with chr(0xd800) etc., but in Python 2 chr() only covers
        # 0-255, so literal escapes are used instead.
        RE_XML_ILLEGAL = (u'([\u0000-\u0008\u000b-\u000c\u000e-\u001f\ufffe-\uffff])'
                          u'|([\ud800-\udbff][^\udc00-\udfff])'
                          u'|([^\ud800-\udbff][\udc00-\udfff])'
                          u'|([\ud800-\udbff]$)'
                          u'|(^[\udc00-\udfff])')
        input = re.sub(RE_XML_ILLEGAL, "", input)

        # ASCII control characters
        input = re.sub(r"[\x01-\x1F\x7F]", "", input)

    return input





from langdetect import detect
from classifier import *   # importing this package compiles sentimentPipeline.py, which is where the SyntaxError below is raised
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()   # English sentiment (VADER)
clf = SentimentClassifier()               # Spanish sentiment (from the classifier package)
sentiments = ['5']                        # placeholder entry for the first data row, which the loop below skips
state = "Ohio"



def Spanish_Sent(p):
    """Map the Spanish classifier's probability (0..1) to a -1/+1 label."""
    return -1 if p <= 0.5 else 1


def English_Sent(p):
    """Map a VADER score dict to a -1/+1 label by comparing 'neg' and 'pos'."""
    return -1 if p['neg'] >= p['pos'] else 1
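For reference, VADER's polarity_scores returns a dict with 'neg', 'neu', 'pos', and 'compound' keys, so English_Sent only compares the first and third (scores below are illustrative, not actual output):

    >>> analyser.polarity_scores("I love this debate")
    {'neg': 0.0, 'neu': 0.323, 'pos': 0.677, 'compound': 0.6369}
    >>> English_Sent({'neg': 0.0, 'neu': 0.323, 'pos': 0.677, 'compound': 0.6369})
    1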


i = 0
with open("AdvisorData/SentimentAnalysis/2016_Tweets_" + state + ".csv") as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for line in readCSV:
        if i > 1:   # skip the header row and the first data row (covered by the '5' placeholder above)
            t = clear(line[3])
            g = detect(t)   # langdetect guess, e.g. 'es' or 'en'
            if g == 'es':
                p = clf.predict(t)
                sent = Spanish_Sent(p)
            else:
                p = analyser.polarity_scores(t)
                sent = English_Sent(p)
            sentiments.append(sent)
        print(i)
        i = i + 1


import pandas as pd

df = pd.read_csv("AdvisorData/SentimentAnalysis/2016_Tweets_" + state + ".csv")
df['Sentiments'] = sentiments   # one label per data row, including the '5' placeholder
df.to_csv('AdvisorData/SentimentAnalysis/Labeled/2016_Tweets_' + state + '_Labeled.csv', index=None)

It should produce numeric sentiment values, append them as a new column to the input data frame, and save the resulting frame as a CSV. Instead, the following error occurred:

  File "C:\Users\skhan\Anaconda2\lib\site-packages\classifier\sentimentPipeline.py", line 114
    word = word.replace('í','i')
                                ^
SyntaxError: Non-ASCII character '\xc3' in file C:\Users\skhan\Anaconda2\lib\site-packages\classifier\sentimentPipeline.py on line 114, but no encoding declared; see http://python.org/dev/peps/pep-0263/ for details