Я пытаюсь разобраться, как работает KNN для классификации текста на основе оценок TF-IDF, на небольшом учебном примере (это не лучший набор документов, и пока он не имеет особого смысла). Однако я постоянно получаю ошибку «cannot perform reduce with flexible type» в строке с вызовом predict. Что не так? Вот код:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import neighbors
# This is a very toy example, do not try this at home unless you want to
# understand the usage differences between CountVectorizer+TfidfTransformer
# and the all-in-one TfidfVectorizer.
docs = ["the house had a tiny little mouse",
        "the cat saw the mouse",
        "the mouse ran away from the house",
        "the cat finally ate the mouse",
        "the end of the mouse story"
        ]
# instantiate CountVectorizer()
cv = CountVectorizer()
# this step generates word counts for the words in your docs
word_count_vector = cv.fit_transform(docs)
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)
# print idf values.
# FIX: get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; get_feature_names_out() is the supported replacement.
df_idf = pd.DataFrame(tfidf_transformer.idf_,
                      index=cv.get_feature_names_out(),
                      columns=["idf_weights"])
# sort ascending.
# FIX: sort_values returns a NEW DataFrame — the original discarded the
# result, so the sort had no effect; assign it back.
df_idf = df_idf.sort_values(by=["idf_weights"])
print("idf weights")
print(df_idf)
# count matrix (sparse)
count_vector = cv.transform(docs)
print("count vec:")
print(count_vector)
# tf-idf scores (sparse)
tf_idf_vector = tfidf_transformer.transform(count_vector)
print("tfidf vec:")
print(tf_idf_vector)
feature_names = cv.get_feature_names_out()
# get tfidf vector for the first document
first_document_vector = tf_idf_vector[0]
print("tfidf vec for first doc")
print(first_document_vector)
# print the scores, highest first (same assignment fix as above)
df = pd.DataFrame(first_document_vector.T.todense(),
                  index=feature_names, columns=["tfidf"])
df = df.sort_values(by=["tfidf"], ascending=False)
print("df:")
print(df)
from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer = CountVectorizer + TfidfTransformer in one step;
# settings that you would use for CountVectorizer go here.
tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 4),
                                   min_df=0, stop_words='english',
                                   sublinear_tf=True)
# just send in all your docs here
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)
print("to array:")
print(tfidf_vectorizer_vectors.toarray())
# get the first vector out (for the first document)
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]
# place tf-idf values in a pandas data frame.
# FIX: get_feature_names() was removed in scikit-learn 1.2 — use
# get_feature_names_out(); and sort_values returns a new DataFrame, so the
# result must be assigned (the original discarded it).
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(),
                  index=tfidf_vectorizer.get_feature_names_out(),
                  columns=["tfidf"])
df = df.sort_values(by=["tfidf"], ascending=False)
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
# just send in all your docs here
fitted_vectorizer = tfidf_vectorizer.fit(docs)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(docs)
# one (string) class label per document
classes = ["had", "seen", "ran", "ate", "of"]
# BUG FIX — this is the source of "cannot perform reduce with flexible type":
# KNeighborsRegressor predicts by AVERAGING the targets of the nearest
# neighbours, which requires numeric targets; with string labels NumPy's
# mean-reduction fails with that error inside predict(). Classification
# with string labels needs KNeighborsClassifier, which takes a majority
# vote over neighbour labels instead of averaging them.
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(tfidf_vectorizer_vectors, classes)
print(clf.predict(tfidf_vectorizer_vectors))