from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def readFileThroughPandas(filename):
    # Read the data file and split it into attribute columns and the label column
    data = pd.read_csv(filename)
    att = data[["Transactions made", "Positive Reviews", "Negative Reviews",
                "Items Returned", "Household Salary", "Average Purchase Value",
                "Family Size"]]
    lab = data["Customers Requiring Intervention"]
    # Normalize the attributes because they span significantly different
    # orders of magnitude.
    # Z-score standardization (alternative):
    # data_norm = (att - att.mean()) / att.std()
    # Zero-to-one (min-max) normalization:
    data_norm = (att - att.min()) / (att.max() - att.min())
    data_norm = data_norm.to_numpy()
    return data_norm, lab.to_numpy()
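# As an aside, the same scaling can be done with scikit-learn's preprocessing
# classes instead of the hand-rolled formulas above (a sketch, not wired into
# the pipeline; MinMaxScaler and StandardScaler are standard sklearn classes):
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# data_norm = scaler.fit_transform(att)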
data_norm, intervention = readFileThroughPandas("customerdata.csv")
print(np.shape(data_norm))
# Note that scikit-learn's PCA centers the data (subtracts the column means)
# internally before fitting
pca = PCA(n_components=7)
pca.fit(data_norm)
# For this dataset, the first two components can be read as customer
# satisfaction and customer value, respectively
print(pca.components_)
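# To interpret the components, it can help to pair each loading with its
# feature name (a sketch; feature_names is introduced here for illustration):
# feature_names = ["Transactions made", "Positive Reviews", "Negative Reviews",
#                  "Items Returned", "Household Salary",
#                  "Average Purchase Value", "Family Size"]
# for i, comp in enumerate(pca.components_[:2]):
#     print(f"PC{i+1}:", dict(zip(feature_names, np.round(comp, 2))))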
# Print the eigenvalues (the variance along each principal component)
print("Eigenvalues:", pca.explained_variance_)
# Print the proportion of the total variance captured by each component
print("Explained variance proportions:", pca.explained_variance_ratio_)
plt.bar(x=["PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7"],
        height=pca.explained_variance_ratio_)
plt.xlabel("Principal component")
plt.ylabel("Proportion of variance explained")
plt.show()
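# Optional sanity check: explained_variance_ equals the eigenvalues of the
# sample covariance matrix of the normalized data, since sklearn divides by
# n - 1 just as np.cov does. A sketch, commented out:
# cov = np.cov(data_norm, rowvar=False)
# eigvals = np.sort(np.linalg.eigvalsh(cov))[::-1]
# print("Covariance eigenvalues:", eigvals)  # should match pca.explained_variance_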
# Extract 2 dimensions using the PCA vectors
# transform() centers the data and projects it onto the principal axes
data_pca_coordinates = pca.transform(data_norm)
# Keep the first 2 columns (coordinates along PC1 and PC2)
T = data_pca_coordinates[:, :2]
# Manual equivalent: project onto the first k PCA directions by hand.
# Note the centering step: transform() subtracts pca.mean_ first, so the
# manual projection must do the same to give matching coordinates.
# k = 2
# weights = pca.components_.transpose()
# T = (data_norm - pca.mean_).dot(weights[:, :k])
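# If you uncomment the manual projection, it should agree with transform():
# print(np.allclose(T, data_pca_coordinates[:, :k]))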
plt.scatter(T[intervention == 0, 0], T[intervention == 0, 1], label="No intervention")
plt.scatter(T[intervention == 1, 0], T[intervention == 1, 1], label="Intervention")
plt.xlabel("Customer Satisfaction")
plt.ylabel("Customer Value")
plt.legend()
plt.show()
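# Closing sketch: because all 7 components were kept, inverse_transform
# recovers the normalized data up to floating-point error; with fewer
# components it would instead give the best rank-k approximation.
reconstructed = pca.inverse_transform(data_pca_coordinates)
print("Max reconstruction error:", np.max(np.abs(reconstructed - data_norm)))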