import matplotlib.pyplot as plt
from sklearn.cluster import KMeans as kmeans
import numpy as np
import pandas as pd
def readFileThroughPandas(filename, columns=("X", "Y")):
    """Read a CSV file and return only the requested columns.

    Parameters
    ----------
    filename
        Path or file-like object; passed unchanged to ``pandas.read_csv``.
    columns
        Column label, or sequence of column labels, to keep. The result
        carries them in the order given here. Defaults to ``"X"`` and ``"Y"``.

    Returns
    -------
    pandas.DataFrame
        Every row of the file, restricted to ``columns``.

    Raises
    ------
    KeyError
        If a requested column is not present in the file.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(columns, str):
        columns = [columns]
    csvfile = pd.read_csv(filename)
    # .loc needs a list here: a tuple would be read as a multi-level key,
    # and a list selector always yields a DataFrame, even for one column.
    return csvfile.loc[:, list(columns)]
# Load the input file; arr is a DataFrame holding the "X" and "Y" columns.
arr = readFileThroughPandas("citydata.csv")

# Scatter plot of the raw, not-yet-clustered points.
plt.scatter(x=arr["X"], y=arr["Y"])
plt.xlabel("X")
plt.ylabel("Y")
plt.show()
# Group the points into four clusters. fit() hands back the fitted estimator,
# so the model can be built and trained in one expression.
kmeans_model = kmeans(n_clusters=4).fit(arr)

# lab      - cluster label assigned to each point
# centroid - centre of each cluster
# inertia  - total within-cluster sum of squares (WSS)
lab, centroid, inertia = (
    kmeans_model.labels_,
    kmeans_model.cluster_centers_,
    kmeans_model.inertia_,
)

# Draw all points in one call, coloured by cluster label.
# To draw one cluster at a time instead:
#   for i in range(max(lab) + 1):
#       arrNew = arr[lab == i]
#       plt.plot(arrNew["X"], arrNew["Y"], '*')
plt.scatter(x=arr["X"], y=arr["Y"], c=lab)
plt.xlabel("X")
plt.ylabel("Y")
plt.show()
# Gather each point's coordinates together with its cluster label.
diction = {axis: arr[axis] for axis in ("X", "Y")}
diction["Labels"] = lab

# Save the labelled points; to_csv also writes the row index as the first
# column. df.to_excel('output.xlsx') is the spreadsheet alternative.
df = pd.DataFrame(diction)
df.to_csv("output.csv")
# Elbow analysis: refit k-means for every cluster count from 2 to 25 and
# record the total within-cluster sum of squares (inertia) of each fit.
num_clusters = np.arange(2, 26)
n_inertia = np.zeros(num_clusters.shape)
# enumerate() supplies the slot index, so no hand-maintained counter is needed.
for i, n in enumerate(num_clusters):
    # A separate name keeps the kmeans_model fitted earlier in the script
    # from being overwritten by these throwaway fits.
    elbow_model = kmeans(n_clusters=n).fit(arr)
    n_inertia[i] = elbow_model.inertia_

# Inertia against cluster count.
plt.plot(num_clusters, n_inertia)
plt.xlabel("Number of Clusters")
plt.ylabel("Total within sum of squares")
plt.show()