# The objective is to find the 7 principal components and their explained variances. Principal component analysis is commonly used for data compression. The 7-dimensional data can be visualized in 2 dimensions using the first two principal components with minimum loss of information. Plot the data in two dimensions and see if anything can be said about what kind of customers require intervention.

from sklearn import metrics
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import csv


### Importing the data file as a list object

# Read every row of the customer CSV into a list of string lists.
# Previously the file was opened and closed without being read, so `lst` was
# never defined; the context manager also guarantees the handle is released.
with open('customerdata.csv', newline='') as csvfile:
    # creating a csv reader object and materialising all of its rows
    lst = list(csv.reader(csvfile))
# removing first row from list (presumably the header row -- confirm)
lst = lst[1:]
arr = np.array(lst)
# every remaining cell must parse as a number; astype raises ValueError otherwise
data = arr.astype(float)

# Split the matrix in one step: the last column becomes the intervention
# flag, and the first and last columns are dropped from the feature matrix.
# (The first column is presumably a row/customer identifier -- confirm.)
intervention, data = data[:, -1], data[:, 1:-1]
print(data.shape)

# normalize the data: min-max scale every column independently
data_min = data.min(axis=0)
data_max = data.max(axis=0)
data_range = data_max - data_min
# A constant column has zero range and would yield 0/0 = NaN, which PCA
# rejects; dividing by 1 instead maps such a column to all zeros.
data_range[data_range == 0] = 1
data_norm = (data - data_min) / data_range

(1000, 7)


### Identifying the 7 principal components

# Report the dimensions of the normalised matrix before fitting.
print(data_norm.shape)

# Fit a PCA that keeps all 7 components.
# Note that PCA does centering of the data
pca = PCA(n_components=7)
pca.fit(data_norm)

(1000, 7)

PCA(copy=True, iterated_power='auto', n_components=7, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)

#Note that the first two components are similar to customer satisfaction and customer value, respectively
# NOTE(review): that reading is the author's interpretation of the weights;
# the feature names are not visible here -- confirm against the CSV header.
# Each printed row holds one component's weights over the input features.
print(pca.components_)

[[-0.00328525  0.46597901 -0.86740711 -0.17306732  0.010641    0.00449094
0.01928677]
[ 0.47913522  0.03476629  0.04736982 -0.05098777  0.60271426  0.40369948
0.48799523]
[ 0.05886257  0.09253122 -0.14604654  0.9819914   0.0473299   0.00367415
-0.00910255]
[-0.27764024 -0.07724586 -0.03475359  0.04297846 -0.31695811 -0.23400424
0.87101871]
[-0.06694078  0.87558675  0.47199756 -0.00546035 -0.04651038 -0.04022935
0.04768279]
[ 0.44305793  0.00355406  0.0052064  -0.03397487  0.22910469 -0.86601517
-0.00586469]
[-0.69934851 -0.02147389 -0.00384734  0.0104686   0.69229152 -0.17502684
-0.02059577]]


### Printing the variances explained by each of the principal components

# Print the eigenvalues (variance explained by each component), followed by
# the same quantities expressed as a fraction of the total variance.
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)
# Derive the bar labels from the fitted model rather than hard-coding seven
# names, so the chart stays valid if the number of components changes.
pc_labels = ["PC{}".format(i) for i in range(1, len(pca.explained_variance_ratio_) + 1)]
plt.bar(x=pc_labels, height=pca.explained_variance_ratio_)
plt.show()

[0.17252592 0.10348626 0.07992296 0.05450644 0.02552347 0.01401692
0.00590595]
[0.37843932 0.22699934 0.17531275 0.11956106 0.05598628 0.03074642
0.01295483]


### Note that T = XW, where T holds the data in the new coordinate system, X holds the data in the old coordinate system, and W is the weight matrix whose column vectors are the eigenvectors of the matrix X'X

# W: the components are stored row-wise, so transpose to get one per column.
weights = pca.components_.T
# Scores on the first two principal components, T = XW.
# NOTE(review): data_norm is projected as-is (not mean-centred), so these
# scores are presumably shifted by a constant relative to
# pca.transform(data_norm) -- confirm which is wanted.
T = np.dot(data_norm, weights[:, :2])

print(T)

[[-0.73739712  0.83146526]
[-0.08237654  0.33704322]
[-0.9240241   0.45278157]
...
[-0.90648213  0.58205567]
[-0.9455266   0.69657884]
[-0.03365671  0.90558889]]

# Scatter the two-component projection, one series per intervention flag
# value (0 first, then 1, so the default colour order is unchanged).
for flag in (0, 1):
    mask = intervention == flag
    plt.scatter(T[mask, 0], T[mask, 1], label="intervention = {}".format(flag))
plt.xlabel("Customer Satisfaction")
plt.ylabel("Customer Value")
# Without a legend the two colours cannot be matched to their groups.
plt.legend()
plt.show()