import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from intro_Data_4_3 import *
import seaborn as sns
from sklearn.datasets import load_iris as li
# Load data and split into features and targets
iris = li()
X = iris.data
X_names = iris.feature_names
y = iris.target
y_names = iris.target_names
X_df = pd.DataFrame(X,columns=X_names)
X_df.head()
 | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | 3.0 | 1.4 | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | 1.4 | 0.2 |
# Taking two features from X
cols = ['petal width (cm)', 'sepal length (cm)']
X1 = np.array(X_df[cols])
Let's start with a single training/testing split and inspect the results, while varying some of the hyperparameters of the decision tree:
from sklearn.model_selection import train_test_split as tts
Xtrain,Xtest,ytrain,ytest = tts(X1, y, test_size=0.4, random_state=146)
There are a few hyperparameters for the decision tree classifier (DTC) that control the size and complexity of the tree (a quick sketch of their effect on tree size follows the list):
- min_samples_split: the minimum number of samples a node must have before it can be split.
- min_samples_leaf: the minimum number of samples a leaf node must have.
- min_weight_fraction_leaf: the same idea as min_samples_leaf, but expressed as a fraction of the total weighted number of samples. For example, with 100 equally weighted samples and min_weight_fraction_leaf set to 0.1, each leaf must contain at least 10% of the total weight, i.e. at least 10 samples.
- max_leaf_nodes: the maximum number of leaf nodes.
- max_features: the maximum number of features evaluated for splitting at each node. For example, max_features=5 means the algorithm only considers 5 randomly chosen features at each split.
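As a quick illustration (a minimal sketch, not part of the original notebook), we can fit trees with different values of min_samples_leaf on the two selected features and count how many leaves and nodes each tree ends up with:
# Hypothetical check of how min_samples_leaf constrains tree size
from sklearn.tree import DecisionTreeClassifier
for msl in [1, 5, 10, 20]:
    t = DecisionTreeClassifier(min_samples_leaf=msl, random_state=146).fit(X1, y)
    print(f'min_samples_leaf={msl}: leaves={t.get_n_leaves()}, nodes={t.tree_.node_count}')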
# Try playing around with the hyperparameters to see how this changes the results
from sklearn.tree import DecisionTreeClassifier as DTC
dtc = DTC(random_state=146, min_samples_split = 5)
dtc.fit(Xtrain,ytrain)
y_pred = dtc.predict(Xtest)
# Train scores (overfitting?)
print("Train accuracy: ", dtc.score(Xtrain,ytrain))
conf_matrix, accuracy = compare_classes(ytest,y_pred,y_names)
conf_matrix
Train accuracy: 0.9666666666666667
Test accuracy = 0.95
Predicted | setosa | versicolor | virginica |
---|---|---|---|
Actual | |||
setosa | 15 | 0 | 0 |
versicolor | 0 | 23 | 3 |
virginica | 0 | 0 | 19 |
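The compare_classes helper comes from the intro_Data_4_3 module, so its exact behavior isn't shown here. A minimal sketch of what it presumably does (an assumption, not the module's actual code) is to build a labeled confusion matrix with pandas and print the accuracy:
# Sketch of a compare_classes-style helper (assumed behavior)
def compare_classes_sketch(actual, predicted, names=None):
    accuracy = np.mean(np.array(actual) == np.array(predicted))
    print('Test accuracy =', round(accuracy, 2))
    actual = pd.Series(actual, name='Actual')
    predicted = pd.Series(predicted, name='Predicted')
    if names is not None:
        actual = actual.map(dict(enumerate(names)))
        predicted = predicted.map(dict(enumerate(names)))
    return pd.crosstab(actual, predicted), accuracy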
We can also look at the tree itself. Let's check it out for different hyperparameter values. Each node in the plot reports a few quantities:
- samples counts how many training instances the node applies to.
- value tells you how many training instances of each class the node applies to.
- gini measures the node's impurity. The Gini impurity of a node is the probability that a data point within the node would be mislabeled if it were assigned a label at random with probabilities proportional to the frequency of the labels within the node.
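For example, the Gini impurity of a node whose value is [0, 23, 3] (0 setosa, 23 versicolor, and 3 virginica training samples) works out to roughly 0.20:
# Gini impurity for a node with class counts [0, 23, 3]
counts = np.array([0, 23, 3])
p = counts / counts.sum()
print('gini =', 1 - np.sum(p**2))   # ~0.204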
# Visualize the decision tree
from sklearn import tree
plt.figure(figsize=(12,12))
tree.plot_tree(dtc,filled=True, feature_names = cols)
plt.savefig('IrisDecisionTree_Example.svg', bbox_inches='tight')
plt.show()
Let's look at the classification regions (and perhaps see how they change when we change the max depth)
# Plot the classification regions along with the test data
x_range = np.linspace(X1[:,0].min(), X1[:,0].max(), 60)
y_range = np.linspace(X1[:,1].min(), X1[:,1].max(), 60)
# make_grid (a helper from intro_Data_4_3) generates a grid of points over these ranges
points = make_grid(x_range,y_range)
y_regions = dtc.predict(points)
# Need some colors for our plot
front_colors = ['red', 'blue', 'yellow']
back_colors = ['magenta', 'cyan', 'orange']
# plot grids
fig,ax = plot_groups(points,y_regions, back_colors,ec='None',s=50, alpha = 0.1)
# plot train data
#plot_groups(Xtrain,ytrain,front_colors,ax=ax, s = 60, alpha = 0.5)
# plot test data
plot_groups(Xtest,ytest,front_colors,ax=ax, s = 60, alpha = 1)
plt.xlabel(cols[0])
plt.ylabel(cols[1])
plt.legend(y_names, bbox_to_anchor=[1,0.5], loc='center left')
plt.show()
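The make_grid and plot_groups helpers also come from intro_Data_4_3. A plausible sketch of make_grid (an assumption about its behavior, not the module's actual code): build a meshgrid over the two ranges and stack it into an (n*m, 2) array of points.
# Assumed behavior of make_grid: turn two 1-D ranges into a 2-D array of (x, y) points
def make_grid_sketch(x_range, y_range):
    xx, yy = np.meshgrid(x_range, y_range)
    return np.column_stack([xx.ravel(), yy.ravel()])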
plt.figure()
plt.title("DTC, min_samples_split=5",fontsize=16)
plot_decision_boundaries(Xtrain,ytrain,DTC,min_samples_split = 5)
plt.xlabel("petal width [cm]",fontsize=15)
plt.ylabel("sepal length [cm]",fontsize=15)
for yi in np.unique(ytest):
idx = ytest == yi
plt.scatter(Xtest[idx,0], Xtest[idx,1],color = front_colors[yi], alpha = 1, s = 60, ec = 'k')
plt.show()
# Get train/test scores/accuracy for a range of max_depth values
depth_range = np.arange(2,10)
k = 10
train = []
test = []
for d in depth_range:
dtc = DTC(max_depth = d, random_state=146)
tr,te = do_Kfold(dtc,X,y,k)
train.append(np.mean(tr))
test.append(np.mean(te))
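The do_Kfold helper is also from intro_Data_4_3. A minimal sketch of what it presumably does (an assumption): split the data into K folds, fit the model on each training fold, and return the per-fold train and test accuracies.
# Assumed behavior of do_Kfold: per-fold train/test accuracy
from sklearn.model_selection import KFold
def do_Kfold_sketch(model, X, y, k, random_state=146):
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    train_scores, test_scores = [], []
    for idx_tr, idx_te in kf.split(X):
        model.fit(X[idx_tr], y[idx_tr])
        train_scores.append(model.score(X[idx_tr], y[idx_tr]))
        test_scores.append(model.score(X[idx_te], y[idx_te]))
    return train_scores, test_scores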
# plot accuracy-max_depth curve
plt.plot(depth_range, train, '-xk', label='Training')
plt.plot(depth_range, test, '-xr', label='Testing')
plt.xlabel('Max Depth')
plt.ylabel('Classification Accuracy')
plt.legend()
plt.show()
In practice, you could probably improve performance overall by modifying multiple hyperparameters (we will talk about an easier way to do that later).
# get the optimal max_depth value
idx = np.array(test) == np.max(test)
print('Optimal max_depth: ', depth_range[idx])
print('Optimal test score: ', np.max(test))
Optimal max_depth: [4 5 6 7 8 9]
Optimal test score: 0.9400000000000001
# get mean train/test scores with optimal max_depth using K fold cross-validation
tr,te = do_Kfold(DTC(random_state = 146, max_depth = 4),X,y,10)
print(np.mean(tr), np.mean(te))
0.9903703703703703 0.9400000000000001
# visualize train/test scores with stripplot
plt.figure(figsize = (6,6))
sns.stripplot(data = pd.DataFrame(zip(tr,te), columns = ['Training', 'Testing']), size=10, palette = 'winter',
alpha = 0.5, jitter = True)
plt.show()
Let's take a look with just two predictors so that we can compare the results from using a Random Forest versus a single decision tree.
# Train a Random Forest classifier
from sklearn.ensemble import RandomForestClassifier as RFC
cols = ['petal width (cm)', 'sepal length (cm)']
X1 = X_df[cols]
Xtrain,Xtest,ytrain,ytest = tts(X1.values,y,test_size=0.4, random_state = 146)
rfc = RFC(random_state=146, n_estimators=100, max_depth = 4)
rfc.fit(Xtrain,ytrain)
y_pred = rfc.predict(Xtest)
# Plot the classification regions along with the test data
x_range = np.linspace(X1.values[:,0].min(), X1.values[:,0].max(), 60)
y_range = np.linspace(X1.values[:,1].min(), X1.values[:,1].max(), 60)
points = make_grid(x_range,y_range)
y_regions = rfc.predict(points)
# Need some colors for our plot
front_colors = ['red', 'blue', 'yellow']
back_colors = ['magenta', 'cyan', 'orange']
fig,ax = plot_groups(points,y_regions, back_colors, ec='None',s=50, alpha = 0.2)
#plot_groups(Xtest,ytest,front_colors,ax=ax, s = 60, alpha = 0.9)
plot_groups(Xtest,y_pred,front_colors,ax=ax, s = 60, alpha = 0.9)
plt.xlabel(cols[0])
plt.ylabel(cols[1])
plt.legend(y_names, bbox_to_anchor=[1,0.5], loc='center left')
plt.show()
What's different about the classification regions?
# Get the confusion matrix
conf_matrix, accuracy = compare_classes(ytest, y_pred, y_names)
conf_matrix
Test accuracy = 0.93
Predicted | setosa | versicolor | virginica |
---|---|---|---|
Actual | |||
setosa | 15 | 0 | 0 |
versicolor | 0 | 24 | 2 |
virginica | 0 | 2 | 17 |
# An alternative way using the confusion_matrix function from sklearn
from sklearn.metrics import confusion_matrix as CM
conf_matrix2 = CM(ytest, y_pred)
conf_matrix2
array([[15,  0,  0],
       [ 0, 24,  2],
       [ 0,  2, 17]])
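scikit-learn can also plot the confusion matrix directly; ConfusionMatrixDisplay.from_predictions is available in scikit-learn 1.0 and later:
# Plot the confusion matrix with the class names on the axes
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(ytest, y_pred, display_labels=y_names)
plt.show()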
The optimal hyperparameters for a decision tree may not be optimal for the Random Forest. Let's use all of the data and try to optimize max_depth for 100 trees.
# Find optimal max_depth for RFC
depth_range = np.arange(2,10)
k = 10
train = []
test = []
for d in depth_range:
rfc = RFC(n_estimators = 100, max_depth = d, random_state=146)
tr,te = do_Kfold(rfc,X,y,k)
train.append(np.mean(tr))
test.append(np.mean(te))
# Plot the accuracy-max_depth curve
plt.plot(depth_range, train, '-xk', label='Training')
plt.plot(depth_range, test, '-xr', label='Testing')
plt.xlabel('Max Depth')
plt.ylabel('Classification Accuracy')
plt.legend()
plt.show()
Let's compare DTC with RFC on a single train/test split using all predictors:
Xtrain,Xtest,ytrain,ytest = tts(X,y,test_size=0.4, random_state = 146)
dtc = DTC(random_state = 146, max_depth = 4)
dtc.fit(Xtrain, ytrain)
print("Train accuracy is: ", dtc.score(Xtrain,ytrain))
conf_matrix, accuracy = compare_classes(ytest, dtc.predict(Xtest), y_names)
conf_matrix
Train accuracy is: 1.0
Test accuracy = 0.93
Predicted | setosa | versicolor | virginica |
---|---|---|---|
Actual | |||
setosa | 15 | 0 | 0 |
versicolor | 0 | 24 | 2 |
virginica | 0 | 2 | 17 |
rfc = RFC(random_state = 146, n_estimators = 100, max_depth = 5)
rfc.fit(Xtrain, ytrain)
print("Train accuracy is: ", rfc.score(Xtrain,ytrain))
conf_matrix, accuracy = compare_classes(ytest, rfc.predict(Xtest), y_names)
conf_matrix
Train accuracy is: 1.0
Test accuracy = 0.93
Predicted | setosa | versicolor | virginica |
---|---|---|---|
Actual | |||
setosa | 15 | 0 | 0 |
versicolor | 0 | 24 | 2 |
virginica | 0 | 2 | 17 |
Given a set of hyperparameters, how would you decide which combination of values is best?
Grid search is a basic method for doing this. The key idea is to define a grid of hyperparameter values, evaluate each combination with cross-validation, and then pick the combination with the best validation score (accuracy, in the context of classification).
For other methods of hyperparameter tuning, refer to this link.
# Grid search for RFC
from sklearn.model_selection import KFold, GridSearchCV
# create a grid of parameter values
param_grid_rfc = dict(n_estimators=[1,100,500],max_depth = [2,3,4,5],
min_samples_split = [2,3,4,5])
# specify how you want to do the cross-validation
cv = KFold(n_splits=10, random_state=146, shuffle = True)
grid_rfc = GridSearchCV(RFC(random_state = 146), param_grid=param_grid_rfc, cv=cv,
scoring='accuracy')
# perform grid search (what is this really doing?)
grid_rfc.fit(X, y)
print(
f'The best parameters are {grid_rfc.best_params_} with a score of {grid_rfc.best_score_:.2f}'
)
results = pd.DataFrame(grid_rfc.cv_results_)[['param_n_estimators','param_max_depth',
'param_min_samples_split','mean_test_score','rank_test_score']]
results.head()
results[results['rank_test_score'] == 1]
# Grid search for DTC
param_grid_dtc = dict(max_depth = [2,3,4,5],
min_samples_split = [2,3,4,5]) #params to test
cv = KFold(n_splits=10, random_state=146, shuffle = True)
grid_dtc = GridSearchCV(DTC(random_state = 146), param_grid=param_grid_dtc, cv=cv,
scoring='accuracy')
grid_dtc.fit(X, y)
print(
f'The best parameters are {grid_dtc.best_params_} with a score of {grid_dtc.best_score_:.2f}'
)
results = pd.DataFrame(grid_dtc.cv_results_)[['param_max_depth',
'param_min_samples_split','mean_test_score','rank_test_score']]
results.head()
results[results['rank_test_score'] == 1]
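One of the alternative tuning methods mentioned above is randomized search, which evaluates a fixed number of randomly sampled combinations instead of the full grid. A minimal sketch for the random forest (the parameter values here are just an illustration):
# Randomized search: sample n_iter combinations instead of evaluating every one
from sklearn.model_selection import RandomizedSearchCV
param_dist = dict(n_estimators=[50, 100, 200, 500], max_depth=[2, 3, 4, 5, None],
                  min_samples_split=[2, 3, 4, 5])
rand_rfc = RandomizedSearchCV(RFC(random_state=146), param_distributions=param_dist,
                              n_iter=10, cv=cv, scoring='accuracy', random_state=146)
rand_rfc.fit(X, y)
print(f'The best parameters are {rand_rfc.best_params_} with a score of {rand_rfc.best_score_:.2f}')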