In Linear Discriminant Analysis, we want to classify individuals or observation units into one of \(G\) groups based on some vector of characetristics (\(X\)).

We will consider the problem of classifying emails into "spam" versus "non-spam" using discrimannt analysis. The approach is to use a training dataset to build a clssification rule, where we know which emails are spam and which are not, along with several characetristics of the email, such as frequencies of different words, special characters etc.

For this, we will use data vailable in R, originally prepared by Hewlett Packard. The data frame has 4601 observations and 58 variables. The data set contains 2788 e-mails classified as "nonspam" and 1813 classified as "spam".

library(kernlab)
library(MASS)
data(spam)
X<-spam
dim(X)
## [1] 4601   58
# Check Spam versus non spam in the data
table(X$type)
## 
## nonspam    spam 
##    2788    1813
#View(X)
#?spam

As a good practice, especially when we have larger data available, we could divide it into a "training set" and "validation set".

set.seed(sqrt(57))
# Marking records for training (approx 70%) or development 
dev<-(runif(dim(X)[1],0,1)<.70)*1 

Let us visualize the data with respect to 2 variables ("address" and "all"). We see that the two variables by themselves may not have much discriminatory power.

typ<-as.matrix(unique(X$type))
colors <- ifelse(X$type == typ[1], "red", "blue")
plot(X$all, X$address, col=colors, ylab="address", xlab="all")

Linear Discriminant Analysis (LDA)

We will first run LDA with only 3 variables.

# with 3 variables
fit <- lda(type ~ make+address+all,subset=which(dev==1), prior=c(.5,.5), data=X, na.action="na.omit",CV=FALSE)
fit # show results
## Call:
## lda(type ~ make + address + all, data = X, prior = c(0.5, 0.5), 
##     CV = FALSE, subset = which(dev == 1), na.action = "na.omit")
## 
## Prior probabilities of groups:
## nonspam    spam 
##     0.5     0.5 
## 
## Group means:
##               make   address       all
## nonspam 0.06657828 0.2603825 0.1973483
## spam    0.16004758 0.1570103 0.4218081
## 
## Coefficients of linear discriminants:
##                 LD1
## make     2.00791914
## address -0.08038354
## all      1.55900024
fit$class
## NULL

Fisher's linear discriminant scores

ldavar1<- as.matrix(X[,1:3])%*% (fit$scaling[,1])

## Fisher's linear discriminant coefficients (say L)
fit$scaling[,1]
##        make     address         all 
##  2.00791914 -0.08038354  1.55900024
## scores for the first 10 observations  (compute L.*X for each row X)
ldavar1[1:10]
##  [1] 0.9463147 1.1786557 1.2273653 0.0000000 0.0000000 0.0000000 0.0000000
##  [8] 0.0000000 1.0183280 1.3112593

Mahalonobis Method: group probabilities (based on flat prior)

pred<-predict(fit, data.frame(make=X$make, address=X$address, all=X$all) )

predclass<- as.matrix(pred[[1]])
predprob<- as.matrix(pred[[2]])
### predicted class for first 10 observations based on Mahalonobis method
predclass[1:10]
##  [1] "spam"    "spam"    "spam"    "nonspam" "nonspam" "nonspam" "nonspam"
##  [8] "nonspam" "spam"    "spam"
### predicted probability of class membership for first 10 observations based on Mahalonobis method
predprob[1:10,]
##      nonspam      spam
## 1  0.4655341 0.5344659
## 2  0.4341541 0.5658459
## 3  0.4276333 0.5723667
## 4  0.5935187 0.4064813
## 5  0.5935187 0.4064813
## 6  0.5935187 0.4064813
## 7  0.5935187 0.4064813
## 8  0.5935187 0.4064813
## 9  0.4557669 0.5442331
## 10 0.4164622 0.5835378

Confusion Matricies (LDA with 3 variables)

# confusion Matrix on development sample
# number of actual spam and non spam in development sample
table(X[dev==1,]$type)
## 
## nonspam    spam 
##    1961    1261
ct <- table(X$type[dev==1], predclass[dev==1])
## confusion matrix of frequencies (rows are actual)
ct
##          
##           nonspam spam
##   nonspam    1562  399
##   spam        594  667
## confusion matrix of proportions
prop.table(ct)
##          
##             nonspam      spam
##   nonspam 0.4847921 0.1238361
##   spam    0.1843575 0.2070143
## within class accuracy
diag(prop.table(ct, 1))
##   nonspam      spam 
## 0.7965324 0.5289453
# overall accuracy
sum(diag(prop.table(ct)))
## [1] 0.6918063

LDA (with all variables)

# Fishers method with all variables

fit <- lda(type ~ . ,subset=which(dev==1), prior=c(.5,.5), data=X, na.action="na.omit",CV=FALSE)


pred<-predict(fit, X[,-58] )

predclass<- as.matrix(pred[[1]])
confusion matrix on development sample
# confusion Matrix on development sample
ct <- table(X$type[dev==1], predclass[dev==1])
diag(prop.table(ct, 1))
##   nonspam      spam 
## 0.9444161 0.8382236
# total percent correct
sum(diag(prop.table(ct)))
## [1] 0.9028554
prop.table(ct)
##          
##              nonspam       spam
##   nonspam 0.57479826 0.03382992
##   spam    0.06331471 0.32805711
confusion matrix on validation sample
# confusion Matrix on validation sample
ct <- table(X$type[dev==0], predclass[dev==0])
diag(prop.table(ct, 1))
##   nonspam      spam 
## 0.9407497 0.8478261
# total percent correct
sum(diag(prop.table(ct)))
## [1] 0.9035533
prop.table(ct)
##          
##              nonspam       spam
##   nonspam 0.56417694 0.03553299
##   spam    0.06091371 0.33937636

Mahalonobis Method: group probabilities (based on informative prior)

Note that in LDA, Mahalonobis method with prior=(.5,.5) can be seen to be equivalent to Fisher's LDA. Suppose we use a prior that there is a 90% chance that an email is non-spam and 10% chance that it is spam.

fit <- lda(type ~ . ,subset=which(dev==1), prior=c(.9,.1), data=X, na.action="na.omit",CV=FALSE)

pred<-predict(fit, X[,-58] )

Confusion matrices Mahalanobis method (with all variables)

confusion matrix on development sample
# confusion Matrix on development sample
ct <- table(X$type[dev==1], predclass[dev==1])
diag(prop.table(ct, 1))
##   nonspam      spam 
## 0.9444161 0.8382236
# total percent correct
sum(diag(prop.table(ct)))
## [1] 0.9028554
prop.table(ct)
##          
##              nonspam       spam
##   nonspam 0.57479826 0.03382992
##   spam    0.06331471 0.32805711
confusion matrix on validation sample
# confusion Matrix on validation sample
ct <- table(X$type[dev==0], predclass[dev==0])
diag(prop.table(ct, 1))
##   nonspam      spam 
## 0.9407497 0.8478261
# total percent correct
sum(diag(prop.table(ct)))
## [1] 0.9035533
prop.table(ct)
##          
##              nonspam       spam
##   nonspam 0.56417694 0.03553299
##   spam    0.06091371 0.33937636

Quadratic Discriminant Analysis

fit <- qda(type ~ . ,subset=which(dev==1), prior=c(.7,.3), data=X, na.action="na.omit",CV=FALSE)


pred<-predict(fit, X[,-58] )

predclass<- as.matrix(pred[[1]])

# confusion Matrix on development sample
ct <- table(X$type[dev==1], predclass[dev==1])
diag(prop.table(ct, 1))
##   nonspam      spam 
## 0.7572667 0.9555908
# total percent correct
sum(diag(prop.table(ct)))
## [1] 0.8348852
prop.table(ct)
##          
##              nonspam       spam
##   nonspam 0.46089385 0.14773433
##   spam    0.01738051 0.37399131
# confusion Matrix on validation sample
ct <- table(X$type[dev==0], predclass[dev==0])
diag(prop.table(ct, 1))
##   nonspam      spam 
## 0.7666264 0.9492754
# total percent correct
sum(diag(prop.table(ct)))
## [1] 0.8397389
prop.table(ct)
##          
##              nonspam       spam
##   nonspam 0.45975344 0.13995649
##   spam    0.02030457 0.37998550