Я пытаюсь запустить алгоритм для набора данных URDU, у меня есть модель логистической регрессии для английского языка, но возникают ошибки, когда я пытаюсь использовать ее в наборе данных в кодировке UTF-8 на языке урду.
Iприменил приведенный ниже код к проблеме.
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
import logging
from gensim.models import word2vec
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from scipy.sparse import coo_matrix
from tqdm import tqdm
from scipy import sparse
import numpy
#import score
import re, nltk, scipy
#import gensim
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,VotingClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import KFold
dataset1 = pd.read_csv("fakenews.csv", encoding = 'UTF-8')
ds = dataset1.copy()
print(ds.shape)
df = ds.copy()
df = ds.copy()
df.drop('FileName',axis=1,inplace = True)
df.drop('label',axis=1,inplace = True)
da = ds.copy()
da.drop('FileName',axis=1,inplace = True)
da.drop('title',axis=1,inplace = True)
da.drop('text',axis=1,inplace = True)
def extract_word_overlap(title, text):
word_overlap = []
for i, (title, body) in tqdm(enumerate(zip(title, text))):
#preprocess_headline = preprocess(headline)
#preprocess_body = preprocess(body)
features = len(set(title).intersection(text)) / float(len(set(title).union(text)))
word_overlap.append(features)
word_overlap_sparse = scipy.sparse.coo_matrix(numpy.array(word_overlap))
return word_overlap_sparse
def combine_features(tfidf_vectors, word_overlap):
combined_features = sparse.bmat([[tfidf_vectors, word_overlap.T]])
return combined_features
from sklearn.model_selection import train_test_split
x_train, x_test,y_train,y_test = train_test_split(df,da, test_size = 0.3,random_state=42)
training_bodies = x_train['text']
training_headlines = x_train['title']
test_bodies = x_test['text']
test_headlines = x_test['title']
print("\t-Extracting tfidf vectors..")
body_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
bodies_tfidf = body_vectorizer.fit_transform(training_bodies)
headline_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
headlines_tfidf = headline_vectorizer.fit_transform(training_headlines)
bodies_tfidf_test = body_vectorizer.transform(test_bodies)
headlines_tfidf_test = headline_vectorizer.transform(test_headlines)
training_tfidf = scipy.sparse.hstack([bodies_tfidf, headlines_tfidf])
test_tfidf = scipy.sparse.hstack([bodies_tfidf_test, headlines_tfidf_test])
training_overlap = extract_word_overlap(training_headlines, training_bodies)
test_overlap = extract_word_overlap(test_headlines, test_bodies)
training_features = combine_features(training_tfidf, training_overlap)
test_features = combine_features(test_tfidf, test_overlap)
print("[3] Fitting model..")
print("\t-Logistic Regression")
lr = LogisticRegression(C = 1.0, class_weight='balanced', solver="lbfgs", max_iter=150)
y_pred = lr.fit(training_features, y_train).predict(test_features)
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score, f1_score
print('classification report:')
print(classification_report(y_test, y_pred))
score = metrics.accuracy_score(y_test, y_pred)
print("Accuracy of Regression: %0.3f" % score)
print('Macro f1:', f1_score(y_test, y_pred, average='macro'))
print("[4] Evaluating model..")
score.report_score(y_test, y_pred)
print(confusion_matrix(y_test, y_pred))
"""
Я ожидал, что модель будет предсказывать, но это даст мне ошибку типа.Ошибка типа: <<не поддерживается между экземплярами 'float' и 'str' </p>
761: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
Traceback (most recent call last):
File "<ipython-input-1-800030f783cf>", line 1, in <module>
runfile('C:/Users/door/Desktop/af/pycode.py', wdir='C:/Users/door/Desktop/af')
File "C:\Users\door\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 704, in runfile
execfile(filename, namespace)
File "C:\Users\door\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 108, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/door/Desktop/af/pycode.py", line 101, in <module>
y_pred = lr.fit(training_features, y_train).predict(test_features)
File "C:\Users\door\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py", line 1286, in fit
check_classification_targets(y)
File "C:\Users\door\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 168, in check_classification_targets
y_type = type_of_target(y)
File "C:\Users\door\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 287, in type_of_target
if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
File "C:\Users\door\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py", line 233, in unique
ret = _unique1d(ar, return_index, return_inverse, return_counts)
File "C:\Users\door\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py", line 281, in _unique1d
ar.sort()
TypeError: '<' not supported between instances of 'float' and 'str'
Набор данных, который я использую, выглядит следующим образом
FileName title text label
0001a میں استعفیٰ نہیں دے رہا فیاض الحسن چوہان "صوبائی وزیر اطلاعات فیاض الحسن چوہان نے کہا ہے کہ میں استعفیٰ نہیں دے رہا اور نہ مجھ سے استعفیٰ مانگا گیا ہے مجھے محتاط بیان دینے کا کہا گیا ہے
اور میں نے اپنی بیان میں ہندو مذہب یا ہندو برادری کو نہیں بلکہ بھارتی فوج اور بھارتی میڈیا کو مخاطب کیا تھا" Agree