Я планирую использовать максимальную среднюю оценку силуэта, чтобы определить точку среза и количество кластеров для ряда различных наборов объектов.К сожалению, я сталкиваюсь с ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)
для некоторых наборов, которые я повторяю.Мой цикл начинается с 2 кластеров (по крайней мере, это мое намерение), поэтому я не уверен, почему я получаю эту ошибку.
Я знаю, что есть похожий вопрос здесь , но это касаетсяс K-средних и я смотрю на иерархической кластеризации.Решение состоит в том, чтобы ограничить цикл до K> = 2, что, я полагаю, я сделал (я думаю, что решение скажет мне, что я не ограничил это правильно).
Код, который я написал, работает нормальнодля многих наборов объектов, но сталкивается с проблемами для некоторых из них.Я включил сюда матрицу расстояний для одного, который работает, когда я начинаю с 3 кластеров, и другого, который работает с 7 кластерами (но не меньше).
# Imports for the reproducible example:
import numpy as np
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import fcluster, dendrogram, linkage
from sklearn import metrics
# Two of the matrices in question:
D1=np.array([[0, 0.93, 0.61, 0.81, 0.76, 0.77, 0.95, 0.91, 0.66, 0.77, 0.20, 0.55, 0.95, 0.77, 0.95, 0.72, 0.71, 0.94, 0.89, 0.17, 0.86, 0.84, 0.87],
[0.93, 0, 0.92, 0.95, 0.94, 0.75, 0.83, 0.77, 0.94, 0.90, 0.20, 0.85, 0.84, 0.81, 0.82, 0.93, 0.91, 0.86, 0.91, 0.33, 0.83, 0.88, 0.86],
[0.61, 0.92, 0, 0.62, 0.86, 0.65, 0.86, 0.91, 0.80, 0.71, 0.20, 0.65, 0.84, 0.81, 0.86, 0.72, 0.86, 0.87, 0.89, 0.33, 0.85, 0.85, 0.89],
[0.81, 0.95, 0.62, 0, 0.78, 0.25, 0.90, 0.84, 0.67, 0.17, 0.20, 0.57, 0.77, 0.50, 0.87, 0.70, 0.68, 0.90, 0.76, 0.40, 0.69, 0.77, 0.70],
[0.76, 0.94, 0.86, 0.78, 0, 0.74, 0.95, 0.92, 0.78, 0.56, 0.20, 0.82, 0.93, 0.76, 0.96, 0.40, 0.80, 0.92, 0.92, 0.33, 0.86, 0.82, 0.90],
[0.77, 0.75, 0.65, 0.25, 0.74, 0, 0.78, 0.76, 0.50, 0.44, 0, 0.69, 0.72, 0.20, 0.82, 0.79, 0.33, 0.63, 0.75, 0.20, 0.80, 0.33, 0.70],
[0.95, 0.83, 0.86, 0.90, 0.95, 0.78, 0, 0.92, 0.89, 0.90, 0.40, 0.86, 0.89, 0.79, 0.47, 0.96, 0.85, 0.85, 0.86, 0.17, 0.77, 0.84, 0.89],
[0.91, 0.77, 0.91, 0.84, 0.92, 0.76, 0.92, 0, 0.88, 0.88, 0.20, 0.80, 0.27, 0.72, 0.93, 0.91, 0.88, 0.79, 0.89, 0.33, 0.68, 0.90, 0.51],
[0.66, 0.94, 0.80, 0.67, 0.78, 0.50, 0.89, 0.88, 0, 0.69, 0.20, 0.57, 0.91, 0.55, 0.95, 0.77, 0.49, 0.89, 0.88, 0.20, 0.72, 0.88, 0.81],
[0.77, 0.90, 0.71, 0.17, 0.56, 0.44, 0.90, 0.88, 0.69, 0, 0.20, 0.74, 0.87, 0.62, 0.90, 0.53, 0.43, 0.86, 0.89, 0, 0.71, 0.74, 0.87],
[0.20, 0.20, 0.20, 0.20, 0.20, 0, 0.40, 0.20, 0.20, 0.20, 0, 0, 0, 0.25, 0.40, 0.20, 0.20, 0.40, 0.20, 0, 0, 0.20, 0],
[0.55, 0.85, 0.65, 0.57, 0.82, 0.69, 0.86, 0.80, 0.57, 0.74, 0, 0, 0.86, 0.76, 0.89, 0.75, 0.62, 0.84, 0.78, 0, 0.53, 0.82, 0.73],
[0.95, 0.84, 0.84, 0.77, 0.93, 0.72, 0.89, 0.27, 0.91, 0.87, 0, 0.86, 0, 0.79, 0.90, 0.91, 0.82, 0.82, 0.87, 0, 0.79, 0.84, 0.48],
[0.77, 0.81, 0.81, 0.50, 0.76, 0.20, 0.79, 0.72, 0.55, 0.62, 0.25, 0.76, 0.79, 0, 0.84, 0.79, 0.38, 0.81, 0.83, 0.20, 0.56, 0.39, 0.83],
[0.95, 0.82, 0.86, 0.87, 0.96, 0.82, 0.47, 0.93, 0.95, 0.90, 0.40, 0.89, 0.90, 0.84, 0, 0.94, 0.94, 0.88, 0.72, 0.40, 0.76, 0.87, 0.95],
[0.72, 0.93, 0.72, 0.70, 0.40, 0.79, 0.96, 0.91, 0.77, 0.53, 0.20, 0.75, 0.91, 0.79, 0.94, 0, 0.82, 0.96, 0.91, 0.17, 0.86, 0.89, 0.94],
[0.71, 0.91, 0.86, 0.68, 0.80, 0.33, 0.85, 0.88, 0.49, 0.43, 0.20, 0.62, 0.82, 0.38, 0.94, 0.82, 0, 0.86, 0.78, 0.20, 0.75, 0.85, 0.81],
[0.94, 0.86, 0.87, 0.90, 0.92, 0.63, 0.85, 0.79, 0.89, 0.86, 0.40, 0.84, 0.82, 0.81, 0.88, 0.96, 0.86, 0, 0.50, 0.40, 0.56, 0.66, 0.54],
[0.89, 0.91, 0.89, 0.76, 0.92, 0.75, 0.86, 0.89, 0.88, 0.89, 0.20, 0.78, 0.87, 0.83, 0.72, 0.91, 0.78, 0.50, 0, 0.20, 0.52, 0.71, 0.77],
[0.17, 0.33, 0.33, 0.40, 0.33, 0.20, 0.17, 0.33, 0.20, 0, 0, 0, 0, 0.20, 0.40, 0.17, 0.20, 0.40, 0.20, 0, 0, 0.33, 0.20],
[0.86, 0.83, 0.85, 0.69, 0.86, 0.80, 0.77, 0.68, 0.72, 0.71, 0, 0.53, 0.79, 0.56, 0.76, 0.86, 0.75, 0.56, 0.52, 0, 0, 0.82, 0.75],
[0.84, 0.88, 0.85, 0.77, 0.82, 0.33, 0.84, 0.90, 0.88, 0.74, 0.20, 0.82, 0.84, 0.39, 0.87, 0.89, 0.85, 0.66, 0.71, 0.33, 0.82, 0, 0.76],
[0.87, 0.86, 0.89, 0.70, 0.90, 0.70, 0.89, 0.51, 0.81, 0.87, 0, 0.73, 0.48, 0.83, 0.95, 0.94, 0.81, 0.54, 0.77, 0.20, 0.75, 0.76, 0]])
D2=np.array([[0, 0.91, 0.71, 0.34, 0.63, 0.93, 0.95, 0.96, 0.84, 0.88, 0.90, 0.40, 0.91, 0.89, 0.76, 0.95, 0.90, 0.92, 0.73, 0.95, 0.88, 0.77, 0.85],
[0.91, 0, 0.75, 0.85, 0.38, 0.91, 0.29, 0.79, 0.95, 0.92, 0.96, 0.20, 0.94, 0.94, 0.61, 0.96, 0.91, 0.95, 0.36, 0.97, 0.97, 0.85, 0.93],
[0.71, 0.75, 0, 0.76, 0.63, 0.93, 0.91, 0.95, 0.92, 0.94, 0.94, 0.40, 0.96, 0.95, 0.13, 0.92, 0.93, 0.96, 0.64, 0.91, 0.95, 0.88, 0.95],
[0.34, 0.85, 0.76, 0, 0.50, 0.89, 0.92, 0.92, 0.80, 0.87, 0.93, 0.40, 0.95, 0.97, 0.81, 0.98, 0.89, 0.95, 0.60, 0.94, 0.73, 0.88, 0.93],
[0.63, 0.38, 0.63, 0.50, 0, 0.38, 0.33, 0.38, 0.50, 0.50, 0.57, 0.25, 0.43, 0.50, 0.63, 0.43, 0.50, 0.17, 0, 0.50, 0.29, 0.50, 0.43],
[0.93, 0.91, 0.93, 0.89, 0.38, 0, 0.89, 0.79, 0.81, 0.73, 0.91, 0.20, 0.69, 0.81, 0.91, 0.92, 0.90, 0.88, 0.33, 0.88, 0.90, 0.82, 0.85],
[0.95, 0.29, 0.91, 0.92, 0.33, 0.89, 0, 0.88, 0.92, 0.93, 0.93, 0.40, 0.90, 0.95, 0.88, 0.95, 0.81, 0.93, 0.25, 0.95, 0.94, 0.85, 0.86],
[0.96, 0.79, 0.95, 0.92, 0.38, 0.79, 0.88, 0, 0.89, 0.81, 0.91, 0.40, 0.85, 0.92, 0.97, 0.90, 0.51, 0.86, 0.42, 0.95, 0.92, 0.59, 0.79],
[0.84, 0.95, 0.92, 0.80, 0.50, 0.81, 0.92, 0.89, 0, 0.65, 0.93, 0.40, 0.75, 0.94, 0.96, 0.93, 0.90, 0.62, 0.64, 0.95, 0.70, 0.84, 0.89],
[0.88, 0.92, 0.94, 0.87, 0.50, 0.73, 0.93, 0.81, 0.65, 0, 0.94, 0.20, 0.65, 0.90, 0.97, 0.96, 0.90, 0.78, 0.64, 0.95, 0.68, 0.85, 0.92],
[0.90, 0.96, 0.94, 0.93, 0.57, 0.91, 0.93, 0.91, 0.93, 0.94, 0, 0.40, 0.94, 0.49, 0.93, 0.68, 0.88, 0.90, 0.67, 0.70, 0.92, 0.82, 0.74],
[0.40, 0.20, 0.40, 0.40, 0.25, 0.20, 0.40, 0.40, 0.40, 0.20, 0.40, 0, 0.20, 0.40, 0.40, 0.40, 0.40, 0, 0.25, 0.40, 0.40, 0.25, 0.25],
[0.91, 0.94, 0.96, 0.95, 0.43, 0.69, 0.90, 0.85, 0.75, 0.65, 0.94, 0.20, 0, 0.93, 0.95, 0.94, 0.88, 0.53, 0.56, 0.96, 0.76, 0.84, 0.90],
[0.89, 0.94, 0.95, 0.97, 0.50, 0.81, 0.95, 0.92, 0.94, 0.90, 0.49, 0.40, 0.93, 0, 0.94, 0.54, 0.91, 0.94, 0.50, 0.67, 0.97, 0.92, 0.80],
[0.76, 0.61, 0.13, 0.81, 0.63, 0.91, 0.88, 0.97, 0.96, 0.97, 0.93, 0.40, 0.95, 0.94, 0, 0.94, 0.93, 0.96, 0.64, 0.95, 0.94, 0.92, 0.91],
[0.95, 0.96, 0.92, 0.98, 0.43, 0.92, 0.95, 0.90, 0.93, 0.96, 0.68, 0.40, 0.94, 0.54, 0.94, 0, 0.88, 0.95, 0.60, 0.26, 0.95, 0.93, 0.79],
[0.90, 0.91, 0.93, 0.89, 0.50, 0.90, 0.81, 0.51, 0.90, 0.90, 0.88, 0.40, 0.88, 0.91, 0.93, 0.88, 0, 0.95, 0.75, 0.92, 0.96, 0.39, 0.65],
[0.92, 0.95, 0.96, 0.95, 0.17, 0.88, 0.93, 0.86, 0.62, 0.78, 0.90, 0, 0.53, 0.94, 0.96, 0.95, 0.95, 0, 0.60, 0.92, 0.39, 0.93, 0.94],
[0.73, 0.36, 0.64, 0.60, 0, 0.33, 0.25, 0.42, 0.64, 0.64, 0.67, 0.25, 0.56, 0.50, 0.64, 0.60, 0.75, 0.60, 0, 0.60, 0.64, 0.50, 0.56],
[0.95, 0.97, 0.91, 0.94, 0.50, 0.88, 0.95, 0.95, 0.95, 0.95, 0.70, 0.40, 0.96, 0.67, 0.95, 0.26, 0.92, 0.92, 0.60, 0, 0.91, 0.90, 0.90],
[0.88, 0.97, 0.95, 0.73, 0.29, 0.90, 0.94, 0.92, 0.70, 0.68, 0.92, 0.40, 0.76, 0.97, 0.94, 0.95, 0.96, 0.39, 0.64, 0.91, 0, 0.88, 0.91],
[0.77, 0.85, 0.88, 0.88, 0.50, 0.82, 0.85, 0.59, 0.84, 0.85, 0.82, 0.25, 0.84, 0.92, 0.92, 0.93, 0.39, 0.93, 0.50, 0.90, 0.88, 0, 0.58],
[0.85, 0.93, 0.95, 0.93, 0.43, 0.85, 0.86, 0.79, 0.89, 0.92, 0.74, 0.25, 0.90, 0.80, 0.91, 0.79, 0.65, 0.94, 0.56, 0.90, 0.91, 0.58, 0]])
# I create the linkage matrix from each (these are probably easier to look at than the distance matrices):
L1=linkage(squareform(D1))
L2=linkage(squareform(D2))
# The loop I use to determine the value of the silhouette for each cluster size:
silhouette1=[]
for i in range(3,L1.shape[0]):
temp=fcluster(L1,i,criterion='maxclust')
s=metrics.silhouette_score(D1,temp,metric='euclidean')
silhouette1.append(s)
silhouette2=[]
for i in range(7,L2.shape[0]):
temp=fcluster(L2,i,criterion='maxclust')
s=metrics.silhouette_score(D2,temp,metric='euclidean')
silhouette2.append(s)
# The above loops work fine, as I have restricted the above to the range it seems to want. I am not sure why this works, but the following do not work:
silhouette1b=[]
for i in range(2,L1.shape[0]):
temp=fcluster(L1,i,criterion='maxclust')
s=metrics.silhouette_score(D1,temp,metric='euclidean')
silhouette1b.append(s)
silhouette2b=[]
for i in range(2,L2.shape[0]):
temp=fcluster(L2,i,criterion='maxclust')
s=metrics.silhouette_score(D2,temp,metric='euclidean')
silhouette2b.append(s)
Мое чтение этого таково по какой-то причине (знаниеПо этой причине следует ответить на мой вопрос), оценка силуэта не будет рассчитываться только для 2 кластеров.Тем не менее, ошибка, по-видимому, подразумевает, что это возможно, но я не определил 2 кластера должным образом, установив i для запуска в 2.