Perhaps something like this:
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import numpy as np
from sklearn import metrics
from sklearn.decomposition import PCA
from scipy.cluster import hierarchy
from sklearn.cluster import AgglomerativeClustering

# Word2Vec expects each sentence as a list of tokens, so split the titles
# into words instead of passing each title as a single string.
titles = ["The Lord of the Rings J.R.R Tolkien",
          "Lord of the Rings Good condition",
          "Very good condition Lord of the Rings jrr Tolkien",
          "harry potter and the sorcerer's stone hardcover",
          "JK rowling harry potter and the sorcerer's stone"]
sentences = [t.lower().split() for t in titles]

# gensim >= 4.0 renamed `size` to `vector_size`; sg=1 selects skip-gram.
m = Word2Vec(sentences, vector_size=50, min_count=1, sg=1)
def vectorizer(sent, m):
    # Average the word vectors of a sentence into one fixed-size vector.
    vec = np.zeros(m.vector_size)
    numw = 0
    for w in sent:
        try:
            # gensim >= 4.0 looks vectors up through the `wv` attribute.
            vec = np.add(vec, m.wv[w])
            numw += 1
        except KeyError:
            # Skip out-of-vocabulary words instead of swallowing all errors.
            continue
    return vec / max(numw, 1)
X = np.array([vectorizer(s, m) for s in sentences])
n_clusters = 2
clf = KMeans(n_clusters=n_clusters,
             max_iter=100,
             init='k-means++',
             n_init=1)
labels = clf.fit_predict(X)
print(labels)
for index, title in enumerate(titles):
    print(str(labels[index]) + ": " + title)
Result:
0: The Lord of the Rings J.R.R Tolkien
0: Lord of the Rings Good condition
1: Very good condition Lord of the Rings jrr Tolkien
0: harry potter and the sorcerer's stone hardcover
1: JK rowling harry potter and the sorcerer's stone
KMeans is almost certainly not the best algorithm for clustering text data like this, and with such a tiny corpus and n_init=1 the labels can change from run to run. It is worth looking at other clustering algorithms; in this case, agglomerative clustering may be more robust.
This is interesting. For example, if I change this...
for metric in ["cosine", "euclidean", "cityblock"]:
    # scikit-learn >= 1.2 renamed the `affinity` parameter to `metric`.
    clf = AgglomerativeClustering(n_clusters=n_clusters,
                                  linkage="average", metric=metric)
    labels = clf.fit_predict(X)
...I get this:
1: The Lord of the Rings J.R.R Tolkien
0: Lord of the Rings Good condition
0: Very good condition Lord of the Rings jrr Tolkien
0: harry potter and the sorcerer's stone hardcover
0: JK rowling harry potter and the sorcerer's stone