Я работаю с набором данных и пытаюсь изучить кластеризацию Kmeans, я работаю со следующим кодом:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Create Points to cluster
Points = pd.DataFrame()
Points.loc[:,0] = [243,179,152,255,166,162,233,227,204,341,283,202,217,197,191,114,
Points.loc[:,1] = [2.1,4,2.6,2.1,2.5,0.4,0.3,4.9,1.1,1,-1.5,3.3,2.2,1.9,2.4,2.2,0.9,
# Create initial cluster centroids
ClusterCentroidGuesses = pd.DataFrame()
ClusterCentroidGuesses.loc[:,0] = [100, 200, 0]
ClusterCentroidGuesses.loc[:,1] = [2, -2, 0]
def Plot2DKMeans(Points, Labels, ClusterCentroids, Title):
for LabelNumber in range(max(Labels)+1):
LabelFlag = Labels == LabelNumber
color = ['c', 'm', 'y', 'b', 'g', 'r', 'c', 'm', 'y',
'b', 'g', 'r', 'c', 'm', 'y'][LabelNumber]
marker = ['s', 'o', 'v', '^', '<', '>', '8', 'p', '*',
'h', 'H', 'D', 'd', 'P', 'X'][LabelNumber]
plt.scatter(Points.loc[LabelFlag,0], Points.loc[LabelFlag,1],
s= 100, c=color, edgecolors="black", alpha=0.3, marker=marker)
s=200, c="black", marker=marker)
def KMeansNorm(Points, ClusterCentroidGuesses, NormD1, NormD2):
PointsNorm = Points.copy()
ClusterCentroids = ClusterCentroidGuesses.copy()
if NormD1:
# Determine mean of 1st dimension
mean1 = np.mean(PointsNorm[:,0])
# Determine standard deviation of 1st dimension
std1 = np.std(PointsNorm[:,0])
# Normalize 1st dimension of Points
PointsNorm[:,0] = ((PointsNorm[:,0] - mean1)/std1)
# Normalize 1st dimension of ClusterCentroids
Cmean1 = np.mean(ClusterCentroids[:,0])
Cstd1 = np.std(ClusterCentroids[:,0])
ClusterCentroids[:,0] = ((ClusterCentroids[:,0] - Cmean1)/Cstd1)
if NormD2:
# Determine mean of 2nd dimension
mean2 = np.mean(PointsNorm[:,1])
# Determine standard deviation of 2nd dimension
std2 = np.std(PointsNorm[:,1])
# Normalize 2nd dimension of Points
PointsNorm[:,1] = ((PointsNorm[:,1] - mean2)/std2)
# Normalize 2nd dimension of ClusterCentroids
Cmean2 = np.mean(ClusterCentroids[:,1])
Cstd2 = np.std(ClusterCentroids[:,1])
ClusterCentroids[:,1] = ((ClusterCentroids[:,1] - Cmean2)/Cstd2)
# Do actual clustering
kmeans = KMeans(n_clusters=3, init=ClusterCentroidGuesses, n_init=1).fit(PointsNorm)
Labels = kmeans.labels_
ClusterCentroids = pd.DataFrame(kmeans.cluster_centers_)
if NormD1:
# Denormalize 1st dimension
PointsNorm[:,0] = PointsNorm[:,0]*std1+mean1
ClusterCentroids[:,0] = ClusterCentroids[:0]*Cstd1+Cmean1
if NormD2:
# Denormalize 2nd dimension
PointsNorm[:,1] = PointsNorm[:,1]*std2+mean2
ClusterCentroids[:,1] = ClusterCentroids[:1]*Cstd2+Cmean2
return Labels, ClusterCentroids
# Compare distributions of the two dimensions
plt.rcParams["figure.figsize"] = [6.0, 4.0] # Standard
plt.hist(Points.loc[:,0], bins = 20, color=[0, 0, 1, 0.5])
plt.hist(Points.loc[:,1], bins = 20, color=[1, 1, 0, 0.5])
plt.title("Compare Distributions")
# Change the plot dimensions
plt.rcParams["figure.figsize"] = [8, 8] # Square
# plt.rcParams["figure.figsize"] = [8, 0.5] # Wide
# plt.rcParams["figure.figsize"] = [0.5, 8] # Tall
# Cluster without normalization
# Are the points separated into clusters along one or both dimensions?
# Which dimension separates the points into clusters?
# Set Normalizations
Labels, ClusterCentroids = KMeansNorm(Points, ClusterCentroidGuesses, NormD1, NormD2)
Title = 'No Normalization'
Plot2DKMeans(Points, Labels, ClusterCentroids, Title)
# Set Normalizations
Labels, ClusterCentroids = KMeansNorm(Points, ClusterCentroidGuesses, NormD1, NormD2)
Title = 'No Normalization'
Plot2DKMeans(Points, Labels, ClusterCentroids, Title)
При попытке построить график NormD1=True
Я получаю код ошибки, который читает
TypeError: '(slice(None, None, None), 0)' is an invalid key
Может кто-нибудь помочь мне понять, где я иду не так?