from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def readFileThroughPandas(filename, n_attributes=20):
    """Load a labelled dataset from a CSV file, parsing the file only once.

    The first column of the file is treated as the label and the next
    ``n_attributes`` columns as the attributes; any columns beyond those
    are not read.

    Args:
        filename: Path (or any other input accepted by ``pandas.read_csv``)
            of the CSV file to load.
        n_attributes: Number of attribute columns that follow the label
            column. Defaults to 20, the width that used to be hard-coded.

    Returns:
        A tuple ``(att, lab)`` of DataFrames: ``att`` holds the attribute
        columns in file order, ``lab`` is a single-column DataFrame holding
        the labels.
    """
    # A single read covers the label and attribute columns, instead of
    # parsing the whole file once per group.
    frame = pd.read_csv(filename, usecols=np.arange(0, n_attributes + 1))
    # pop() detaches the label column from the frame in place, so what is
    # left in `frame` is exactly the attribute columns.
    lab = frame.pop(frame.columns[0]).to_frame()
    att = frame
    return (att, lab)
# Load the attribute and label frames and report their dimensions.
(att, lab) = readFileThroughPandas("multicommodity.csv")
print(att.shape)
print(lab.shape)
# Recorded notebook output: (500, 20) (500, 1)

# Use the first 300 rows for training and the remaining rows for testing
TRAIN_ROWS = 300
x_train = att.iloc[:TRAIN_ROWS]
y_train = lab.iloc[:TRAIN_ROWS]
x_test = att.iloc[TRAIN_ROWS:]
y_test = lab.iloc[TRAIN_ROWS:]
# Alternatively use the following code to choose the rows randomly
# x_train, x_test, y_train, y_test = train_test_split(att, lab, test_size = 0.30)

# Note that only 60% of the dataset is being used for training
# (300 of the 500 rows reported in the recorded output above).
clf = tree.DecisionTreeClassifier()
clf.fit(X=x_train, y=y_train)
# Recorded notebook output: DecisionTreeClassifier()

# score() returns the accuracy of the fitted tree on the given rows
print("Training accuracy", clf.score(X=x_train, y=y_train))
print("Testing accuracy", clf.score(X=x_test, y=y_test))
y_predicted = clf.predict(x_test)
# Recorded notebook output: Training accuracy 1.0 Testing accuracy 0.69

# Confusion matrix on the held-out rows, with rows/columns ordered by
# clf.classes_ so the display labels below line up with the counts.
m = metrics.confusion_matrix(y_test, y_predicted, labels=clf.classes_)
print(m)
# Recorded notebook output: [[95 21] [41 43]]

# Better visualization of a confusion matrix
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=m, display_labels=clf.classes_)
disp.plot()
plt.show()