Я пытаюсь выполнить кластерный анализ методом k-средних на наборе данных UCI Adult.
# Feature matrix: every column except the target, cast to int.
# `axis` is passed by keyword because the positional form was removed in pandas 2.0.
X = np.array(df.drop(['class'], axis=1).astype(int))
# Target labels; the clustering below does not use them.
y = np.array(df['class'])
km = KMeans(n_clusters=3)
y_km = km.fit_predict(X)
# Draw every cluster, not just cluster 0. Only the first two feature columns
# are used as plot axes.
cluster_styles = [('lightgreen', 's'), ('orange', 'o'), ('lightblue', 'v')]
for cluster_id, (color, marker) in enumerate(cluster_styles):
    members = y_km == cluster_id
    plt.scatter(X[members, 0], X[members, 1], s=50, c=color, marker=marker,
                label=f'cluster {cluster_id + 1}')
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=250,
            marker='*', c='red', label='centroids')
# The labels above are only rendered once a legend is requested.
plt.legend()
plt.show()
график:
Я уверен, что упускаю что-то очевидное, но я потратил на это много времени. Любая помощь будет принята с благодарностью.
набор данных: https://archive.ics.uci.edu/ml/datasets/adult
Я должен использовать метод k-средних, поскольку это курсовая работа.
MCVE:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
# Column names supplied to read_csv via `names=`.
features = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "class",
]
df = pd.read_csv("/kaggle/input/data-adult/adult.data", names=features)

# Binarise the target: 0 when the value equals ' <=50K' (the literal keeps
# its leading space), 1 for anything else.
df['class'] = df["class"].apply(lambda x: 0 if x == ' <=50K' else 1)

# Replace each categorical column with integer codes. The encoder is refit
# per column, so codes are independent between columns.
categorical_columns = [
    'sex', 'occupation', 'workclass', 'education',
    'marital-status', 'relationship', 'race', 'native-country',
]
labelEncoder = LabelEncoder()
for column in categorical_columns:
    df[column] = labelEncoder.fit_transform(df[column])

# Feature matrix: every column except the target, cast to int.
# `axis` is passed by keyword because the positional form was removed in pandas 2.0.
X = np.array(df.drop(['class'], axis=1).astype(int))
# Target labels; the clustering below does not use them.
y = np.array(df['class'])

# NOTE(review): the features are clustered unscaled, so columns with large
# magnitudes presumably dominate the distance computation - consider
# standardising X before fitting.
km = KMeans(n_clusters=3)
y_km = km.fit_predict(X)

# Draw every cluster, not just cluster 0. Only the first two feature columns
# ("age" and "workclass") are used as plot axes.
cluster_colors = ['lightgreen', 'orange', 'lightblue']
for cluster_id, color in enumerate(cluster_colors):
    members = y_km == cluster_id
    plt.scatter(X[members, 0], X[members, 1], s=50, c=color,
                label=f'cluster {cluster_id + 1}')
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=250,
            marker='*', c='red', label='centroids')
# The labels above are only rendered once a legend is requested.
plt.legend()
plt.show()