Примерно такой код должен решить вашу задачу.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns; sns.set()
import csv
# Load only the first 10,000 rows. `nrows` makes the parser stop early instead
# of reading the entire file and discarding the rest with `head`.
df = pd.read_csv('C:\\your_path\\properties_2017.csv', nrows=10000)
print(df.shape)

# Drop rows missing either coordinate; KMeans rejects NaN input.
# Reassigning (rather than `inplace=True`) avoids mutating a derived frame
# and the SettingWithCopyWarning that can come with it.
df = df.dropna(axis=0, how='any', subset=['latitude', 'longitude'])

# Parcel id plus its coordinates. The explicit copy guarantees that columns
# added to X later never write back into df.
X = df.loc[:, ['parcelid', 'latitude', 'longitude']].copy()
print(X.head(10))
# Elbow curve: fit one k-means model for each k in 1..9 and plot its score
# (KMeans.score is the negated k-means objective, so higher is better) to
# help choose the number of clusters.
K_clusters = range(1, 10)
kmeans = [KMeans(n_clusters=k) for k in K_clusters]

# Single-coordinate frames, kept under their original names.
Y_axis = df[['latitude']]
X_axis = df[['longitude']]

# Fit and score on BOTH coordinates, the same two features the final model
# clusters on. Using latitude alone would tune k for a different, 1-D problem.
coords = df[['latitude', 'longitude']]
score = [model.fit(coords).score(coords) for model in kmeans]

# Visualize
plt.plot(K_clusters, score)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve')
plt.show()
# Final model: 3 clusters with k-means++ seeding.
kmeans = KMeans(n_clusters=3, init='k-means++')

# Columns 1 and 2 of X are latitude and longitude (column 0 is parcelid).
coords = X[X.columns[1:3]]

# A single fit_predict both trains the model and returns the label of every
# training point, so no separate fit() or predict() pass is needed.
labels = kmeans.fit_predict(coords)  # Labels of each point.
centers = kmeans.cluster_centers_  # Coordinates of cluster centers.

# `assign` returns a new frame, so the label column is added without any
# chained-assignment warning.
X = X.assign(cluster_label=labels)
print(X.head(5))

# Keep only what is needed to join the labels back onto the full table.
X = X[['parcelid', 'cluster_label']]
print(X.head(5))
   parcelid  cluster_label
0  10754147              0
1  10759547              0
2  10843547              2
3  10859147              2
4  10879947              2
# Attach each parcel's cluster label to the full property table.
clustered_data = df.merge(X, on='parcelid')
print(clustered_data.head(5))

centers = kmeans.cluster_centers_
print(centers)

# X was narrowed to (parcelid, cluster_label) earlier; rebuild the coordinate
# view for plotting.
X = df.loc[:, ['parcelid', 'latitude', 'longitude']]

# Draw the points coloured by cluster, then overlay the centres on the same
# Axes (using the returned Axes rather than relying on the implicit current one).
ax = X.plot.scatter(x='latitude', y='longitude', c=labels, s=50, cmap='viridis')
# Centre columns follow the order the model was fitted on: latitude, longitude.
ax.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
# Without this the figure is never displayed when run as a script.
plt.show()
Источник данных:
https://www.kaggle.com/xxing9703/kmean-clustering-of-latitude-and-longitude/data