import matplotlib.pyplot as plt
from sklearn.cluster import KMeans as kmeans
import numpy as np


import pandas as pd
def readFileThroughPandas(filename):
    """Load a CSV file and return only its ``X`` and ``Y`` columns.

    Args:
        filename: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        A DataFrame with every row of the file, restricted to the
        ``X`` and ``Y`` columns (in that order).
    """
    wanted_columns = ["X", "Y"]
    frame = pd.read_csv(filename)

    # Indexing with a list of labels keeps all rows and just these columns;
    # it is the short form of frame.loc[:, wanted_columns].
    return frame[wanted_columns]


# Load the point coordinates with the pandas-based reader.
# (A csv-module variant was used here previously:
#  arr = readFileThroughCSV("citydata.csv"))
arr = readFileThroughPandas("citydata.csv")

# arr is a DataFrame whose "X" and "Y" columns hold the coordinates.
#print(arr)

# Scatter plot of the raw, not yet clustered, points.
raw_axes = plt.gca()
raw_axes.scatter(arr["X"], arr["Y"])
raw_axes.set_xlabel("X")
raw_axes.set_ylabel("Y")
plt.show()


# Partition the points into 4 clusters with k-means.
# random_state pins the centroid initialisation so that the labels (and the
# CSV file written from them further down) are identical on every run.
# n_init is stated explicitly because its default value differs between
# scikit-learn releases.
kmeans_model = kmeans(n_clusters=4, n_init=10, random_state=0)
kmeans_model.fit(arr)

# Each point gets a label: the index of the cluster it belongs to
lab = kmeans_model.labels_

# Each cluster has a centroid
centroid = kmeans_model.cluster_centers_

# Total within-cluster sum of squares (wss)
inertia = kmeans_model.inertia_

# Uncomment to inspect the fit:
#print(inertia)
#print(lab)


# Draw all points in one call, coloured by their cluster label.
cluster_axes = plt.gca()
cluster_axes.scatter(arr["X"], arr["Y"], c=lab)

# Alternative: draw the clusters one at a time by looping over
# range(max(lab) + 1) and plotting only the rows where lab == i.

cluster_axes.set_xlabel("X")
cluster_axes.set_ylabel("Y")
plt.show()


# Collect the coordinates together with the cluster label of each point.
diction = {axis_name: arr[axis_name] for axis_name in ("X", "Y")}
diction["Labels"] = lab

# Write the labelled points to disk (the DataFrame index is written too).
df = pd.DataFrame(diction)
df.to_csv('output.csv')
# An Excel file can be produced instead with:
#df.to_excel('output.xlsx')


# Elbow analysis: fit k-means for every cluster count from 2 to 25 and record
# the total within-cluster sum of squares of each fit.
num_clusters = np.arange(2, 26)
n_inertia = np.zeros(num_clusters.shape)
for i, n in enumerate(num_clusters):
    # A fixed seed and an explicit n_init make the curve reproducible, so the
    # position of the elbow does not change from run to run.
    kmeans_model = kmeans(n_clusters=int(n), n_init=10, random_state=0)
    kmeans_model.fit(arr)
    n_inertia[i] = kmeans_model.inertia_

# The "elbow" of this curve suggests a suitable number of clusters.
plt.plot(num_clusters, n_inertia)
plt.xlabel("Number of Clusters")
plt.ylabel("Total within sum of squares")
plt.show()

# The dataset citydata.csv contains the coordinates of 60 locations, from which we want to create 4 clusters.

# Importing libraries

# Reading the file through the pandas package

# Read the CSV file using the above function and plot the data

# Perform clustering

# Plotting the four clusters in different colours

# Storing the information in a file

# Elbow plot to determine the optimal number of clusters

# Note that in the figure above, the elbow is around 5