Модель не может предсказать из-за ошибки типа, применяя модель к URFU-кодированному набору данных URF-8 - PullRequest
1 голос
/ 06 мая 2019

Я пытаюсь запустить алгоритм для набора данных URDU, у меня есть модель логистической регрессии для английского языка, но возникают ошибки, когда я пытаюсь использовать ее в наборе данных в кодировке UTF-8 на языке урду.

Iприменил приведенный ниже код к проблеме.

import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
import logging
from gensim.models import word2vec
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from scipy.sparse import coo_matrix
from tqdm import tqdm
from scipy import sparse
import numpy 
#import score
import re, nltk, scipy

#import gensim
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,VotingClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import KFold

dataset1 = pd.read_csv("fakenews.csv", encoding = 'UTF-8')
ds = dataset1.copy()
print(ds.shape)
df = ds.copy()

df = ds.copy()
df.drop('FileName',axis=1,inplace = True)
df.drop('label',axis=1,inplace = True)

da = ds.copy()
da.drop('FileName',axis=1,inplace = True)
da.drop('title',axis=1,inplace = True)
da.drop('text',axis=1,inplace = True)

def extract_word_overlap(title, text):
    word_overlap = []
    for i, (title, body) in tqdm(enumerate(zip(title, text))):
        #preprocess_headline = preprocess(headline)
        #preprocess_body = preprocess(body)
        features = len(set(title).intersection(text)) / float(len(set(title).union(text)))
        word_overlap.append(features)
        word_overlap_sparse = scipy.sparse.coo_matrix(numpy.array(word_overlap)) 

    return word_overlap_sparse

def combine_features(tfidf_vectors, word_overlap):
    combined_features =  sparse.bmat([[tfidf_vectors, word_overlap.T]])
    return combined_features

from sklearn.model_selection import train_test_split   
x_train, x_test,y_train,y_test = train_test_split(df,da, test_size = 0.3,random_state=42)

training_bodies = x_train['text']
training_headlines = x_train['title']

test_bodies = x_test['text']
test_headlines = x_test['title']

print("\t-Extracting tfidf vectors..")

body_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
bodies_tfidf = body_vectorizer.fit_transform(training_bodies)

headline_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
headlines_tfidf = headline_vectorizer.fit_transform(training_headlines)


bodies_tfidf_test = body_vectorizer.transform(test_bodies)
headlines_tfidf_test = headline_vectorizer.transform(test_headlines)

training_tfidf = scipy.sparse.hstack([bodies_tfidf, headlines_tfidf])
test_tfidf = scipy.sparse.hstack([bodies_tfidf_test, headlines_tfidf_test])



training_overlap = extract_word_overlap(training_headlines, training_bodies)
test_overlap = extract_word_overlap(test_headlines, test_bodies)
training_features = combine_features(training_tfidf,  training_overlap)    
test_features = combine_features(test_tfidf, test_overlap)



print("[3] Fitting model..")
print("\t-Logistic Regression")

lr = LogisticRegression(C = 1.0, class_weight='balanced', solver="lbfgs", max_iter=150) 


y_pred = lr.fit(training_features, y_train).predict(test_features)


from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score, f1_score
print('classification report:')
print(classification_report(y_test, y_pred))
score = metrics.accuracy_score(y_test, y_pred)

print("Accuracy of Regression:   %0.3f" % score)

print('Macro f1:', f1_score(y_test, y_pred, average='macro'))
print("[4] Evaluating model..")
score.report_score(y_test, y_pred)
print(confusion_matrix(y_test, y_pred))
"""

Я ожидал, что модель будет предсказывать, но это даст мне ошибку типа.Ошибка типа: <<не поддерживается между экземплярами 'float' и 'str' </p>

761: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Traceback (most recent call last):


  File "<ipython-input-1-800030f783cf>", line 1, in <module>
    runfile('C:/Users/door/Desktop/af/pycode.py', wdir='C:/Users/door/Desktop/af')

  File "C:\Users\door\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 704, in runfile
    execfile(filename, namespace)

  File "C:\Users\door\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 108, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)

  File "C:/Users/door/Desktop/af/pycode.py", line 101, in <module>
    y_pred = lr.fit(training_features, y_train).predict(test_features)

  File "C:\Users\door\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py", line 1286, in fit
    check_classification_targets(y)


  File "C:\Users\door\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 168, in check_classification_targets
    y_type = type_of_target(y)

  File "C:\Users\door\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 287, in type_of_target
    if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):

  File "C:\Users\door\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py", line 233, in unique
    ret = _unique1d(ar, return_index, return_inverse, return_counts)

  File "C:\Users\door\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py", line 281, in _unique1d
    ar.sort()

TypeError: '<' not supported between instances of 'float' and 'str'

Набор данных, который я использую, выглядит следующим образом

FileName    title   text    label
0001a   میں استعفیٰ نہیں دے رہا   فیاض الحسن چوہان  "صوبائی وزیر اطلاعات فیاض الحسن چوہان نے کہا ہے کہ میں استعفیٰ نہیں دے رہا اور نہ مجھ سے استعفیٰ مانگا گیا ہے  مجھے محتاط بیان دینے کا کہا گیا ہے 
اور میں نے اپنی بیان میں ہندو مذہب یا ہندو برادری کو نہیں بلکہ بھارتی فوج اور بھارتی میڈیا کو مخاطب کیا تھا"    Agree
...