Я хочу реализовать уменьшение размерности с помощью анализа компонентов соседства в классификации с использованием k Ближайшего соседства https://scikit-learn.org/stable/auto_examples/neighbors/plot_nca_dim_reduction.html но я получил ошибку, как я могу это исправить? в значении X_train есть строка, как конвертировать в float? я должен соответствовать и преобразовывать вручную? ValueError: не удалось преобразовать строку в число с плавающей точкой: '分散 シ ス テ ム: 原理 と パ ラ ダ イ ム'
код код
!wget http://www.cs.gunma-u.ac.jp/~michi/toku2/book_category.tgz
!tar zxf book_category.tgz
topics = [
'computer_graphics',
'operating_systems',
'computer_security',
'application_service',
'computer_software',
'artificial_intelligence',
'search_engine',
'information_society',
]
import glob
import re
import MeCab
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
docs = []
for topic in topics:
for f in glob.glob(f"./book_category/{topic}/*.txt"):
with open(f, "r") as fin:
body = "\n".join([line.strip() for line in fin if line.strip()])
docs.append((topic,body))
df = pd.DataFrame(
docs,
columns=["topic","body"],
dtype="category"
)
df.head()
df.topic.value_counts()
tagger = MeCab.Tagger("-Owakati")
def parse_to_wakati(text):
return tagger.parse(text).strip()
df = df.assign(body_wakati=df.body.apply(parse_to_wakati))
le = LabelEncoder()
y = le.fit_transform(df.topic)
print(le.classes_)
X_train, X_test, y_train, y_test = train_test_split(
df.body_wakati, #input
y, #right label
test_size=0.2,
random_state=10,
shuffle=True
)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, confusion_matrix
class RulebasedEstimator(BaseEstimator, TransformerMixin):
def __init__(self, label_encoder):
self.le = label_encoder
def fit(self, X, y):
return self
def predict(self, X):
"""ルールを記述"""
result = []
for text in X:
pred = 0
if re.search(r"(コンピュータ|グラフィックス)", text):
pred = self.le.transform(["computer_graphics"])[0]
elif re.search(r"(オペレーティング|システム)", text):
pred = self.le.transform(["operating_systems"])[0]
elif re.search(r"(暗号|セキュリティ)", text):
pred = self.le.transform(["computer_security"])[0]
elif re.search(r"(アプリケーション|サービス)", text):
pred = self.le.transform(["application_service"])[0]
elif re.search(r"(コンピュータ|ソフトウェア)", text):
pred = self.le.transform(["computer_software"])[0]
elif re.search(r"(人工|知能)", text):
pred = self.le.transform(["artificial_intelligence"])[0]
elif re.search(r"(検索|エンジン)", text):
pred = self.le.transform(["search_engine"])[0]
elif re.search(r"(情報|社会)", text):
pred = self.le.transform(["information_society"])[0]
result.append(pred)
return result
rulebased = RulebasedEstimator(label_encoder=le)
rulebased_pred = rulebased.predict(X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,rulebased_pred)
print(classification_report(y_test, rulebased_pred, target_names=le.classes_))
#Import knearest neighbors Classifier model
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import (NeighborhoodComponentsAnalysis,KNeighborsClassifier)
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
random_state = 0
nca = make_pipeline(StandardScaler(),NeighborhoodComponentsAnalysis(n_components=2,random_state=random_state))
neigh=30
print("KNN_neighbour="+str(neigh)+"\n")
pca = make_pipeline(StandardScaler(),PCA(n_components=2, random_state=random_state))
# Reduce dimension to 2 with PCA
pca = make_pipeline(StandardScaler(),
PCA(n_components=2, random_state=random_state))
# Reduce dimension to 2 with LinearDiscriminantAnalysis
lda = make_pipeline(StandardScaler(),
LinearDiscriminantAnalysis(n_components=2))
knn = Pipeline([
("tfidf", TfidfVectorizer()),
('nca',nca),
("clf", KNeighborsClassifier(n_neighbors=30,algorithm='kd_tree')),
])
#Train the model using the training sets
pca.fit(X_train, y_train)
knn.fit(model.transform(X_train), y_train)
#Predict the response for test dataset
y_pred = knn.predict(X_test)
error = []
# Calculating error for K values between 1 and 10
for i in range(1,40):
knn = Pipeline([
("tfidf", TfidfVectorizer()),
("clf", KNeighborsClassifier(n_neighbors=i)),
])
knn.fit(X_train, y_train)
pred_i = knn.predict(X_test)
error.append(np.mean(pred_i != y_test))
plt.figure(figsize=(12, 6))
plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',
markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')
я получил это значение ошибки
ValueError Traceback (последний вызов был последним) в () 145]) 146 # Обучение модели с использованием обучающих наборов -> 147 pca.fit (X_train, y_train) 148 149 knn.fit (model.transform (X_train), y_train )
12 кадров /usr/local/lib/python3.6/dist-packages/numpy/core/_asarray.py в asarray (a, dtype, order) 83 84 "" "--- > 85 возвращаемый массив (a, dtype, copy = False, order = order) 86 87
ValueError: не удалось преобразовать строку в число с плавающей точкой: '分散 シ ス テ ム: 原理 と パ ラ ダ ダ ム'
редактировать:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print(X_train_tf)
print(X_train_tf.shape)
################################
change the pipeline for PCA
pca = Pipeline([
("tfidf", TfidfVectorizer()),
("clf", PCA(n_components=2, random_state=random_state)),
])
#Train the model using the training sets
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(X_train) #i got error X_train is string not float
pca.fit(X_train_tf)
X_train=pca.transform(X_train_tf)
knn.fit(pca.transform(X_train_tf),y_train)