# The objective is to find the 7 principal components and their explained variances. Principal component analysis is commonly used for data compression: here, the 7-dimensional data can be visualized in 2 dimensions using the first two principal components, with minimal loss of information. Plot the data in two dimensions and see whether anything can be said about what kind of customers require intervention.

In [1]:
from sklearn import metrics
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


### Importing the data file with pandas

In [2]:
def readFileThroughPandas(filename):
    # Read the entire data file into a DataFrame
    data = pd.read_csv(filename)

    att = data[["Transactions made", "Positive Reviews", "Negative Reviews",
                "Items Returned", "Household Salary", "Average Purchase Value",
                "Family Size"]]
    lab = data["Customers Requiring Intervention"]

    # Normalize the dataset because the attributes are of significantly
    # different orders of magnitude.
    # Standard-deviation-based normalization:
    # data_norm = (att - att.mean()) / att.std()
    # Zero-to-one (min-max) normalization:
    data_norm = (att - att.min()) / (att.max() - att.min())
    data_norm = data_norm.to_numpy()

    return data_norm, lab
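
As a quick sanity check on the zero-to-one scaling, here is a minimal sketch on a made-up toy DataFrame (hypothetical values, not the customer data), confirming that every column ends up in [0, 1]:

In [ ]:
# Hypothetical toy data to illustrate the min-max scaling used above
toy = pd.DataFrame({"a": [10.0, 20.0, 30.0], "b": [1.0, 5.0, 9.0]})
scaled = (toy - toy.min()) / (toy.max() - toy.min())
print(scaled)
print(scaled.min().min(), scaled.max().max())  # 0.0 1.0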


### Identifying the 7 principal components

In [3]:
data_norm, intervention = readFileThroughPandas("customerdata.csv")
print(np.shape(data_norm))

(1000, 7)
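
Before projecting, it is worth checking the class balance; a minimal sketch, assuming "Customers Requiring Intervention" is a 0/1 label, as the plotting code below suggests:

In [ ]:
# Count customers flagged for intervention (1) versus not (0)
print(intervention.value_counts())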


### Applying PCA

In [4]:
# Note that PCA centers the data (subtracts the per-feature mean) before computing the components
pca = PCA(n_components=7)
pca.fit(data_norm)

Out[4]:
PCA(n_components=7)
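
To confirm the centering note above, one can check that the fitted pca.mean_ matches the column means of the input; a minimal sketch:

In [ ]:
# pca.mean_ holds the per-feature means subtracted before projection
print(np.allclose(pca.mean_, data_norm.mean(axis=0)))  # expected: True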

In [5]:
# Note that the first two components can be read as customer satisfaction and customer value, respectively:
# PC1 is dominated by the review attributes, PC2 by the transaction, income, and family-size attributes
print(pca.components_)

[[-0.00328525  0.46597901 -0.86740711 -0.17306732  0.010641    0.00449094
0.01928677]
[ 0.47913522  0.03476629  0.04736982 -0.05098777  0.60271426  0.40369948
0.48799523]
[ 0.05886257  0.09253122 -0.14604654  0.9819914   0.0473299   0.00367415
-0.00910255]
[-0.27764024 -0.07724586 -0.03475359  0.04297846 -0.31695811 -0.23400424
0.87101871]
[-0.06694078  0.87558675  0.47199756 -0.00546035 -0.04651038 -0.04022935
0.04768279]
[ 0.44305793  0.00355406  0.0052064  -0.03397487  0.22910469 -0.86601517
-0.00586469]
[-0.69934851 -0.02147389 -0.00384734  0.0104686   0.69229152 -0.17502684
-0.02059577]]
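
The loadings are easier to interpret with the attribute names attached; a minimal sketch (the feature list simply repeats the columns selected in readFileThroughPandas):

In [ ]:
# Attach feature names to the loadings so each PC can be read off directly
features = ["Transactions made", "Positive Reviews", "Negative Reviews",
            "Items Returned", "Household Salary", "Average Purchase Value",
            "Family Size"]
loadings = pd.DataFrame(pca.components_, columns=features,
                        index=["PC%d" % (i + 1) for i in range(7)])
print(loadings.round(2))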


### Printing the variance explained by each principal component

In [6]:
# Print the eigenvalues
print("Eigenvalues:", pca.explained_variance_)
# Print the proportion of variance explained by each component
print("Explained variance proportions:", pca.explained_variance_ratio_)
plt.bar(x=["PC1","PC2","PC3","PC4","PC5","PC6","PC7"],height=pca.explained_variance_ratio_)
plt.show()

Eigenvalues: [0.17252592 0.10348626 0.07992296 0.05450644 0.02552347 0.01401692
0.00590595]
Explained variance proportions: [0.37843932 0.22699934 0.17531275 0.11956106 0.05598628 0.03074642
0.01295483]
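
From the printed ratios, the first two components together capture roughly 60% of the total variance (about 0.378 + 0.227); a minimal sketch of the cumulative view:

In [ ]:
# Cumulative proportion of variance explained by the first k components
print(np.cumsum(pca.explained_variance_ratio_))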


### Note that T = XW, where T holds the new (principal-component) coordinates, X is the centered data matrix, and W is the weight matrix whose column vectors are the eigenvectors of the matrix X'X

In [7]:
# Extract 2 dimensions using the PCA vectors
# transform() centers the data and projects it onto the principal axes
data_pca_coordinates = pca.transform(data_norm)
# Keep the first 2 columns of data_pca_coordinates (scores on PC1 and PC2)
T = data_pca_coordinates[:, :2]

# Extract k dimensions manually using the PCA vectors
# (center the data first, as transform() does internally):
# k = 2
# weights = pca.components_.transpose()
# T = (data_norm - pca.mean_).dot(weights[:, :k])
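
As a check that the manual projection agrees with sklearn, a minimal sketch comparing the two:

In [ ]:
# The centered manual projection should match pca.transform exactly
manual_T = (data_norm - pca.mean_).dot(pca.components_.T[:, :2])
print(np.allclose(manual_T, T))  # expected: True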

In [8]:
plt.scatter(T[intervention == 0, 0], T[intervention == 0, 1], label="No intervention")
plt.scatter(T[intervention == 1, 0], T[intervention == 1, 1], label="Intervention required")
plt.xlabel("Customer Satisfaction")
plt.ylabel("Customer Value")
plt.legend()
plt.show()
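
To quantify what the scatter plot suggests, one can compare the class centroids in the PC plane; a minimal sketch:

In [ ]:
# Mean coordinates of each class on the first two principal components
print("No intervention:", T[intervention == 0].mean(axis=0))
print("Intervention:   ", T[intervention == 1].mean(axis=0))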