# The objective is to find the 7 principal components of the customer data and their explained variances. Principal component analysis is commonly used for data compression: the 7-dimensional data can be visualized in 2 dimensions using the first two principal components with minimal loss of information. Plot the data in two dimensions and see whether anything can be said about which kinds of customers require intervention.

In [89]:
from sklearn import metrics
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import csv


### Importing the data file as a list object

In [90]:
csvfile = open('customerdata.csv')
# creating a csv reader object and reading all rows into a list
reader = csv.reader(csvfile)
lst = list(reader)
csvfile.close()
# removing the header row from the list
lst = lst[1:]
arr = np.array(lst)
data = arr.astype(float)

intervention = data[:,-1]
# dropping the first column and the intervention label (last column)
data = data[:,1:-1]
print(np.shape(data))

# min-max normalize each column to the range [0, 1]
data_min = data.min(axis=0)
data_max = data.max(axis=0)
data_norm = (data-data_min)/(data_max-data_min)

# equivalent column-by-column version:
#for i in range(0,np.size(data,1)):
#    data_norm[:,i] = (data[:,i]-data_min[i])/(data_max[i]-data_min[i])

(1000, 7)
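
The manual min-max scaling above can equally be done with scikit-learn's MinMaxScaler; a minimal sketch of the equivalent call (not part of the original notebook):

In [ ]:
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler rescales each column to [0, 1], matching the manual formula above
scaler = MinMaxScaler()
data_norm_sk = scaler.fit_transform(data)
print(np.allclose(data_norm, data_norm_sk))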


### Identifying the 7 principal components

In [91]:
print(np.shape(data_norm))
#print(data)
#print(data_norm)

# Note that PCA centers the data internally before computing the components
pca = PCA(n_components=7)
pca.fit(data_norm)

(1000, 7)

Out[91]:
PCA(copy=True, iterated_power='auto', n_components=7, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)
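
As a cross-check, the explained variances found by PCA are the eigenvalues of the sample covariance matrix of the centered, normalized data. A small sketch using np.linalg.eigh, assuming the same data_norm as above:

In [ ]:
# eigh returns eigenvalues in ascending order; reverse to match PCA's ordering
centered = data_norm - data_norm.mean(axis=0)
cov = centered.T.dot(centered) / (len(centered) - 1)
eigvals = np.linalg.eigh(cov)[0][::-1]
print(np.allclose(eigvals, pca.explained_variance_))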

In [92]:
# Note that the first two components can be interpreted as customer satisfaction and customer value, respectively
print(pca.components_)

[[-0.00328525  0.46597901 -0.86740711 -0.17306732  0.010641    0.00449094
   0.01928677]
 [ 0.47913522  0.03476629  0.04736982 -0.05098777  0.60271426  0.40369948
   0.48799523]
 [ 0.05886257  0.09253122 -0.14604654  0.9819914   0.0473299   0.00367415
  -0.00910255]
 [-0.27764024 -0.07724586 -0.03475359  0.04297846 -0.31695811 -0.23400424
   0.87101871]
 [-0.06694078  0.87558675  0.47199756 -0.00546035 -0.04651038 -0.04022935
   0.04768279]
 [ 0.44305793  0.00355406  0.0052064  -0.03397487  0.22910469 -0.86601517
  -0.00586469]
 [-0.69934851 -0.02147389 -0.00384734  0.0104686   0.69229152 -0.17502684
  -0.02059577]]
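
To make that interpretation less eyeball-based, one can check which original column dominates each component. A sketch that prints only column indices, since the feature names are not shown in this notebook:

In [ ]:
# for each principal component, report the column with the largest
# absolute loading and the sign of that loading
for i, pc in enumerate(pca.components_):
    j = np.argmax(np.abs(pc))
    print("PC%d: column %d (loading %.3f)" % (i + 1, j, pc[j]))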


### Printing the variances explained by each of the principal components

In [93]:
# Print the eigenvalues, i.e. the variances explained by each component
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)
plt.bar(x=["PC1","PC2","PC3","PC4","PC5","PC6","PC7"],height=pca.explained_variance_ratio_)
plt.show()

[0.17252592 0.10348626 0.07992296 0.05450644 0.02552347 0.01401692
 0.00590595]
[0.37843932 0.22699934 0.17531275 0.11956106 0.05598628 0.03074642
 0.01295483]
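
From the ratios above, the first two components together retain about 61% of the total variance, which quantifies the "minimal loss of information" claim. A quick cumulative plot (a sketch):

In [ ]:
# cumulative share of variance retained as components are added
cumvar = np.cumsum(pca.explained_variance_ratio_)
plt.plot(range(1, 8), cumvar, marker='o')
plt.xlabel("Number of components")
plt.ylabel("Cumulative explained variance ratio")
plt.show()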


### Note that T = XW, where T is the data in the new coordinates, X is the data in the old coordinates, and W is the weight matrix whose columns are the eigenvectors of X'X

In [94]:
# the rows of pca.components_ are the principal directions, so the
# transpose has the eigenvectors as its columns
weights = pca.components_
weights = weights.transpose()
# project onto the first two principal components
T = data_norm.dot(weights[:,:2])
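
Because PCA centers the data before projecting, pca.transform(data_norm) differs from this T only by a constant offset (the projection of the mean), which does not change the shape of the scatter plot. A quick sanity check (a sketch):

In [ ]:
# pca.transform subtracts the fitted mean, so its scores equal T shifted
# by the projection of that mean
T_sk = pca.transform(data_norm)[:, :2]
offset = pca.mean_.dot(weights[:, :2])
print(np.allclose(T, T_sk + offset))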

In [95]:
print(T)

[[-0.73739712  0.83146526]
 [-0.08237654  0.33704322]
 [-0.9240241   0.45278157]
 ...
 [-0.90648213  0.58205567]
 [-0.9455266   0.69657884]
 [-0.03365671  0.90558889]]

In [96]:
# customers that did not require intervention vs. those that did
plt.scatter(T[intervention==0,0],T[intervention==0,1],label="no intervention")
plt.scatter(T[intervention==1,0],T[intervention==1,1],label="intervention")
plt.xlabel("Customer Satisfaction")
plt.ylabel("Customer Value")
plt.legend()
plt.show()