Я никогда не слышал о минимальном числе, кроме 1 или 2, возможно. Конечно, вы можете получить «оптимальное» количество кластеров, используя метод кривой локтя. Вот отличный пример, используя данные с фондового рынка.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.optimize as sco
import datetime as dt
import math
from pylab import plot,show
from numpy import vstack,array
from numpy.random import rand
from math import sqrt
from numpy import vstack,array
from datetime import datetime, timedelta
from pandas_datareader import data as wb
from scipy.cluster.vq import kmeans,vq
np.random.seed(777)
start = '2020-1-01'
end = '2020-3-27'
tickers = ['MMM',
'ABT',
'ABBV',
'ABMD',
'ACN',
'ATVI',
'ADBE',
'AMD',
'AAP',
'AES',
'AMG',
'AFL',
'A',
'APD',
'AKAM',
'ALK',
'ALB',
'ARE',
'ALXN',
'ALGN',
'ALLE',
'AGN',
'ADS',
'ALL',
'GOOGL',
'GOOG',
'MO',
'AMZN',
'AEP',
'AXP',
'AIG',
'AMT',
'AWK',
'AMP',
'ABC',
'AME',
'AMGN',
'APH',
'ADI',
'ANSS',
'ANTM',
'AON',
'AOS',
'APA',
'AIV',
'AAPL',
'AMAT',
'APTV',
'ADM',
'ARNC',
'ADSK',
'ADP',
'AZO',
'AVB',
'AVY',
'ZBH',
'ZION',
'ZTS']
thelen = len(tickers)
price_data = []
for ticker in tickers:
prices = wb.DataReader(ticker, start = start, end = end, data_source='yahoo')[['Adj Close']]
price_data.append(prices.assign(ticker=ticker)[['ticker', 'Adj Close']])
df = pd.concat(price_data)
df.dtypes
df.head()
df.shape
pd.set_option('display.max_columns', 500)
df = df.reset_index()
df = df.set_index('Date')
table = df.pivot(columns='ticker')
# By specifying col[1] in below list comprehension
# You can select the stock names under multi-level column
table.columns = [col[1] for col in table.columns]
table.head()
#Calculate average annual percentage return and volatilities over a theoretical one year period
returns = table.pct_change().mean() * 252
returns = pd.DataFrame(returns)
returns.columns = ['Returns']
returns['Volatility'] = table.pct_change().std() * math.sqrt(252)
#format the data as a numpy array to feed into the K-Means algorithm
data = np.asarray([np.asarray(returns['Returns']),np.asarray(returns['Volatility'])]).T
X = data
distorsions = []
for k in range(2, 20):
k_means = kmeans(n_clusters=k)
k_means.fit(X)
distorsions.append(k_means.inertia_)
fig = plt.figure(figsize=(15, 5))
plt.plot(range(2, 20), distorsions)
plt.grid(True)
plt.title('Elbow curve')
[![enter image description here][1]][1]
# computing K-Means with K = 5 (5 clusters)
# computing K-Means with K = 5 (5 clusters)
centroids,_ = KMeans(data,15)
# assign each sample to a cluster
idx,_ = vq(data,centroids)
kmeans = KMeans(n_clusters=15)
kmeans.fit(data)
y_kmeans = kmeans.predict(data)
viridis = cm.get_cmap('viridis', 15)
for i in range(0, len(data)):
plt.scatter(data[i,0], data[i,1], c=viridis(y_kmeans[i]), s= 50)
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.5)
![enter image description here](https://i.stack.imgur.com/STTO2.png)
Вы можете выполнить поиск в Google и найти все виды информации на эта концепция. Вот одна ссылка, с которой можно начать.
https://blog.cambridgespark.com/how-to-determine-the-optimal-number-of-clusters-for-k-means-clustering-14f27070048f