implement PCA in python and k nearest neighbors - PullRequest
0 votes
/ 08 January 2020

I want to implement dimensionality reduction with Neighborhood Components Analysis for classification with k-Nearest Neighbors, following https://scikit-learn.org/stable/auto_examples/neighbors/plot_nca_dim_reduction.html, but I get an error. How can I fix it? X_train contains strings; how do I convert them to floats? Do I have to fit and transform manually? ValueError: could not convert string to float: '分散システム: 原理とパラダイム'
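For reference, converting the text to floats is what a vectorizer's fit/transform does. A minimal sketch, with illustrative variable names, of fitting TfidfVectorizer on the training split and reusing it on the test split:

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)  # learns the vocabulary, returns a sparse float matrix
X_test_vec = vectorizer.transform(X_test)        # reuses the training vocabulary on the test set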

The code:

!wget http://www.cs.gunma-u.ac.jp/~michi/toku2/book_category.tgz
!tar zxf book_category.tgz


topics = [
'computer_graphics',
'operating_systems',
'computer_security',
'application_service',
'computer_software',
'artificial_intelligence',
'search_engine',
'information_society',
]

import glob
import re

import MeCab
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

docs = []
for topic in topics:
    for f in glob.glob(f"./book_category/{topic}/*.txt"):

        with open(f, "r") as fin:
            body = "\n".join([line.strip() for line in fin if line.strip()])
        docs.append((topic,body))

df = pd.DataFrame(
        docs,
        columns=["topic","body"],
        dtype="category"
)



df.head()


df.topic.value_counts()



tagger = MeCab.Tagger("-Owakati")

def parse_to_wakati(text):
    return tagger.parse(text).strip()

df = df.assign(body_wakati=df.body.apply(parse_to_wakati))


le = LabelEncoder()
y = le.fit_transform(df.topic)

print(le.classes_)


X_train, X_test, y_train, y_test = train_test_split(
                                      df.body_wakati,  # input: tokenized text
                                      y,               # target labels
                                      test_size=0.2, 
                                      random_state=10,
                                      shuffle=True
)

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, confusion_matrix
class RulebasedEstimator(BaseEstimator, TransformerMixin):
    def __init__(self, label_encoder):
        self.le = label_encoder

    def fit(self, X, y):
        return self

    def predict(self, X):
        """ルールを記述"""
        result = []
        for text in X:
            pred = 0
            if re.search(r"(コンピュータ|グラフィックス)", text):
                pred = self.le.transform(["computer_graphics"])[0]
            elif re.search(r"(オペレーティング|システム)", text):
                pred = self.le.transform(["operating_systems"])[0]
            elif re.search(r"(暗号|セキュリティ)", text):
                pred = self.le.transform(["computer_security"])[0]
            elif re.search(r"(アプリケーション|サービス)", text):
                pred = self.le.transform(["application_service"])[0]
            elif re.search(r"(コンピュータ|ソフトウェア)", text):
                pred = self.le.transform(["computer_software"])[0]
            elif re.search(r"(人工|知能)", text):
                pred = self.le.transform(["artificial_intelligence"])[0]
            elif re.search(r"(検索|エンジン)", text):
                pred = self.le.transform(["search_engine"])[0]
            elif re.search(r"(情報|社会)", text):
                pred = self.le.transform(["information_society"])[0]
            result.append(pred)
        return result

rulebased = RulebasedEstimator(label_encoder=le)

rulebased_pred = rulebased.predict(X_test)


confusion_matrix(y_test, rulebased_pred)


print(classification_report(y_test, rulebased_pred, target_names=le.classes_))


# Import the k-nearest neighbors classifier and the dimensionality reducers
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NeighborhoodComponentsAnalysis, KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

random_state = 0

# Reduce dimension to 2 with NeighborhoodComponentsAnalysis
nca = make_pipeline(StandardScaler(),
                    NeighborhoodComponentsAnalysis(n_components=2,
                                                   random_state=random_state))

neigh = 30
print("KNN_neighbour=" + str(neigh) + "\n")

# Reduce dimension to 2 with PCA
pca = make_pipeline(StandardScaler(),
                    PCA(n_components=2, random_state=random_state))

# Reduce dimension to 2 with LinearDiscriminantAnalysis
lda = make_pipeline(StandardScaler(),
                    LinearDiscriminantAnalysis(n_components=2))

knn = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("nca", nca),
    ("clf", KNeighborsClassifier(n_neighbors=neigh, algorithm='kd_tree')),
])
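# Note: NeighborhoodComponentsAnalysis (and StandardScaler with its default
# with_mean=True) cannot consume the sparse matrix TfidfVectorizer produces,
# so this pipeline needs a densifying step between "tfidf" and "nca".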
# Train the model using the training sets
pca.fit(X_train, y_train)  # <-- raises the ValueError below: the pipeline starts with StandardScaler, which gets raw text, not numbers

knn.fit(model.transform(X_train), y_train)  # `model` is never defined; presumably `pca` was intended

#Predict the response for test dataset
y_pred = knn.predict(X_test)

error = []


# Calculate the error rate for K values between 1 and 39
for i in range(1, 40):
    knn = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("clf", KNeighborsClassifier(n_neighbors=i)),
    ])
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.append(np.mean(pred_i != y_test))

plt.figure(figsize=(12, 6))
plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')

I get this ValueError:

ValueError                                Traceback (most recent call last)
<ipython-input> in <module>()
    145 ])
    146 # Train the model using the training sets
--> 147 pca.fit(X_train, y_train)
    148 
    149 knn.fit(model.transform(X_train), y_train)

12 frames
/usr/local/lib/python3.6/dist-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
     83 
     84     """
---> 85     return array(a, dtype, copy=False, order=order)
     86 
     87 

ValueError: could not convert string to float: '分散システム: 原理とパラダイム'
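The traceback points at the cause: pca is a StandardScaler→PCA pipeline with no vectorizer in front, so fit receives the raw wakati strings. A sketch of one possible wiring, assuming the intent of the linked example (vectorize first; the densify step is my addition, needed because NCA only accepts dense arrays):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NeighborhoodComponentsAnalysis, KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

densify = FunctionTransformer(lambda X: X.toarray(), accept_sparse=True)

knn = Pipeline([
    ("tfidf", TfidfVectorizer()),      # text -> sparse tf-idf matrix
    ("densify", densify),              # NCA only accepts dense arrays
    ("nca", NeighborhoodComponentsAnalysis(n_components=2, random_state=0)),
    ("clf", KNeighborsClassifier(n_neighbors=30)),
])

knn.fit(X_train, y_train)              # pass the raw text; the pipeline vectorizes internally
y_pred = knn.predict(X_test)

With a large tf-idf vocabulary, NCA on the dense matrix can be slow and memory-hungry.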

edit:

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print(X_train_tf)
print(X_train_tf.shape)



################################
# change the pipeline for PCA


pca = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("pca", PCA(n_components=2, random_state=random_state)),  # PCA rejects sparse input
])

# Train the model using the training sets
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(X_train)  # error here: X_train still holds raw strings, not floats
pca.fit(X_train_tf)
X_train = pca.transform(X_train_tf)

knn.fit(pca.transform(X_train_tf), y_train)
...
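For completeness, a sketch of the edited PCA pipeline with both errors avoided, assuming TruncatedSVD (scikit-learn's sparse-friendly analogue of PCA) is an acceptable substitute, and that X_train still holds the raw text from train_test_split:

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

knn_svd = Pipeline([
    ("tfidf", TfidfVectorizer()),                           # raw wakati text -> sparse tf-idf
    ("svd", TruncatedSVD(n_components=2, random_state=0)),  # accepts sparse input, unlike PCA
    ("clf", KNeighborsClassifier(n_neighbors=30)),
])

knn_svd.fit(X_train, y_train)    # no manual fit/transform needed: each step runs in order
y_pred = knn_svd.predict(X_test)

A single Pipeline also answers the "do I have to fit and transform manually?" part of the question: fit on X_train runs each step's fit_transform in sequence, and predict on X_test reuses the fitted steps automatically.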