Попытка заставить классификатор SKLearn KNN работать с tf-idf - PullRequest
1 голос
/ 17 апреля 2020

Я пытаюсь разобраться, как работает KNN для классификации текста на основе весов TF-IDF, на игрушечном примере (это не лучший набор документов, и сейчас он не имеет особого смысла). Однако я постоянно получаю ошибку «cannot perform reduce with flexible type» («не может выполнить сокращение с гибким типом») в строке с вызовом predict. Что не так? Вот код:

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import neighbors

# Tiny toy corpus: five short sentences, meant only to illustrate how the
# vectorizer APIs below differ from one another.
docs = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]

# Build the vocabulary and the raw term-count matrix for the corpus.
cv = CountVectorizer()
word_count_vector = cv.fit_transform(docs)

# Learn the smoothed IDF weight of every vocabulary term from the counts.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# One row per vocabulary term, holding that term's IDF weight.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); switch if running on a newer version.
df_idf = pd.DataFrame(
    tfidf_transformer.idf_,
    index=cv.get_feature_names(),
    columns=["idf_weights"],
)

# sort_values() returns a new frame instead of sorting in place, so the
# result must be kept; otherwise the table below is printed unsorted.
df_idf = df_idf.sort_values(by=["idf_weights"])

print("idf weights")
print(df_idf)

# Term-count matrix of the corpus, using the vocabulary already fitted in cv.
count_vector = cv.transform(docs)
print("count vec:")
print(count_vector)

# Re-weight the counts into tf-idf scores with the fitted transformer.
tf_idf_vector = tfidf_transformer.transform(count_vector)
print("tfidf vec:")
print(tf_idf_vector)

# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); switch if running on a newer version.
feature_names = cv.get_feature_names()

# Row 0 of the tf-idf matrix is the vector of the first document.
first_document_vector = tf_idf_vector[0]
print("tfidf vec for first doc")
print(first_document_vector)

# Transpose the single-row vector so each vocabulary term becomes a row,
# then list the terms from highest to lowest score.
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])
# sort_values() is not in-place; keep its result so the printed frame is
# actually ordered by score.
df = df.sort_values(by=["tfidf"], ascending=False)
print("df:")
print(df)

from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer does counting and tf-idf weighting in one step; options
# that would otherwise be passed to CountVectorizer go here.
# NOTE(review): newer scikit-learn releases reject an integer min_df below 1;
# confirm against the installed version.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 4),
    min_df=0,
    stop_words='english',
    sublinear_tf=True,
)

# Fit on the whole corpus and get its tf-idf matrix in a single call.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)

print("to array:")
print(tfidf_vectorizer_vectors.toarray())

# Row 0 is the tf-idf vector of the first document.
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

# Put the first document's scores in a data frame, one row per feature.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); switch if running on a newer version.
df = pd.DataFrame(
    first_vector_tfidfvectorizer.T.todense(),
    index=tfidf_vectorizer.get_feature_names(),
    columns=["tfidf"],
)
# sort_values() returns a new frame; assign it so the ordering is not lost.
df = df.sort_values(by=["tfidf"], ascending=False)

# Fresh vectorizer with default settings plus explicit IDF weighting.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# Fit the vocabulary/IDF on the corpus, then turn the same documents into
# tf-idf feature vectors (one row per document).
fitted_vectorizer = tfidf_vectorizer.fit(docs)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(docs)

# One string label per document, in the same order as docs.
classes = ["had", "seen", "ran", "ate", "of"]

# The targets are categorical strings, so a classifier is required.
# KNeighborsRegressor averages its neighbours' targets, which fails on
# strings with "cannot perform reduce with flexible type".
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(tfidf_vectorizer_vectors, classes)

# Predict on the training documents and show the result; the sparse tf-idf
# matrix is passed as-is, without converting it to a dense array.
predictions = clf.predict(tfidf_vectorizer_vectors)
print("predictions:")
print(predictions)

...