from sklearn import metrics
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import csv
# ---------------------------------------------------------------------------
# Customer-segmentation PCA demo.
#
# Reads customerdata.csv (header row + numeric columns; first column is an
# ID, last column is a 0/1 intervention flag), min-max normalizes the
# features, fits a 7-component PCA, plots the explained-variance ratios,
# then scatters the customers on the first two principal components,
# colored by intervention group.
# ---------------------------------------------------------------------------

# Context manager guarantees the file is closed even if parsing raises.
with open('customerdata.csv', newline='') as csvfile:
    rows = list(csv.reader(csvfile, delimiter=','))

# Drop the header row and convert the remaining cells to a float matrix.
data = np.array(rows[1:]).astype(float)

# Last column is the 0/1 intervention label; keep it aside for plotting.
intervention = data[:, -1]

# Drop the ID column (first) and the label column (last).
data = data[:, 1:-1]
print(np.shape(data))

# Min-max normalize each feature to [0, 1].
# NOTE(review): a constant column (max == min) would divide by zero here
# and produce NaNs — assumed not to occur in this dataset; verify upstream.
data_min = data.min(axis=0)
data_max = data.max(axis=0)
data_norm = (data - data_min) / (data_max - data_min)
print(np.shape(data_norm))

# PCA centers the data internally, so no explicit mean subtraction is
# needed before fitting.
pca = PCA(n_components=7)
pca.fit(data_norm)

# The first two components resemble customer satisfaction and customer
# value, respectively.
print(pca.components_)

# Eigenvalues and the fraction of variance explained per component.
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)
plt.bar(x=["PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7"],
        height=pca.explained_variance_ratio_)
plt.show()

# Project onto the first two principal components.
# FIX: the original computed data_norm.dot(components.T), which omits the
# mean-centering PCA applies internally; pca.transform() subtracts the
# fitted mean first and returns the true PC scores (the old scatter was
# the same shape but translated by the projected mean).
T = pca.transform(data_norm)[:, :2]
print(T)

# Scatter customers in PC space, one color per intervention group.
plt.scatter(T[intervention == 0, 0], T[intervention == 0, 1])
plt.scatter(T[intervention == 1, 0], T[intervention == 1, 1])
plt.xlabel("Customer Satisfaction")
plt.ylabel("Customer Value")
plt.show()