Decision Trees and Random Forests¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from intro_Data_4_3 import *
import seaborn as sns
In [2]:
from sklearn.datasets import load_iris as li

1. DTC with a two-feature example (iris data)¶

In [3]:
# Load data and split into features and targets

iris = li()
X = iris.data
X_names = iris.feature_names
y = iris.target
y_names = iris.target_names

X_df = pd.DataFrame(X,columns=X_names)
X_df.head()
Out[3]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
In [4]:
# Taking two features from X
cols = ['petal width (cm)', 'sepal length (cm)']
X1 = np.array(X_df[cols])

Let's start with a single training/testing split and inspect the results while varying some of the hyperparameters of the decision tree:

In [5]:
from sklearn.model_selection import train_test_split as tts
Xtrain,Xtest,ytrain,ytest = tts(X1, y, test_size=0.4, random_state=146)

There are a few hyperparameters for DTC that help control the size and complexity of the tree (a small sketch after this list shows their effect on tree size):

min_samples_split (the minimum number of samples a node must have before it can be split)
min_samples_leaf (the minimum number of samples a leaf node must have)
min_weight_fraction_leaf (the same as min_samples_leaf, but expressed as a fraction of the total weighted number of instances. For example, with 100 equally weighted samples and min_weight_fraction_leaf=0.1, each leaf must hold at least 10% of the total weight, i.e., at least 10 samples.)
max_leaf_nodes (the maximum number of leaf nodes), and
max_features (the maximum number of features evaluated for splitting at each node; for example, max_features=5 means the algorithm considers only 5 randomly chosen features at each split).
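As a quick illustration (a sketch added here, not part of the original run; it assumes X1 and y from above), you can fit trees with different settings and compare their size via get_depth() and get_n_leaves():

# Sketch: see how min_samples_leaf shrinks the tree (values chosen arbitrarily)
from sklearn.tree import DecisionTreeClassifier
for msl in [1, 5, 20]:
    t = DecisionTreeClassifier(min_samples_leaf=msl, random_state=146).fit(X1, y)
    print(f'min_samples_leaf={msl}: depth={t.get_depth()}, leaves={t.get_n_leaves()}')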

In [6]:
# Try playing around with the hyperparameters to see how this changes the results

from sklearn.tree import DecisionTreeClassifier as DTC

dtc = DTC(random_state=146, min_samples_split = 5)
dtc.fit(Xtrain,ytrain)
y_pred = dtc.predict(Xtest)

# Train scores (overfitting?)
print("Train accuracy: ", dtc.score(Xtrain,ytrain))

conf_matrix, accuracy = compare_classes(ytest,y_pred,y_names)
conf_matrix
Train accuracy:  0.9666666666666667
Test accuracy = 0.95
Out[6]:
Predicted setosa versicolor virginica
Actual
setosa 15 0 0
versicolor 0 23 3
virginica 0 0 19

We can also look at the tree itself. Let's check it out for different hyperparameter values.

samples counts how many training instances the node applies to.
value tells you how many training instances of each class the node applies to.
gini measures the node's impurity. The Gini impurity of a node is the probability that a data point within the node would be mislabeled if it were assigned a label at random, with probabilities proportional to the frequencies of the labels within the node.
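To make the gini numbers concrete, here is a small sketch of the formula G = 1 - Σ p_k², applied to a hypothetical node with class counts [0, 23, 3]:

import numpy as np

# Gini impurity: 1 minus the sum of squared class proportions in the node
def gini(counts):
    p = np.array(counts) / np.sum(counts)
    return 1 - np.sum(p**2)

print(gini([0, 23, 3]))  # ~0.204: a fairly pure node dominated by one class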

In [7]:
# Visualize the decision tree

from sklearn import tree
plt.figure(figsize=(12,12))
tree.plot_tree(dtc,filled=True, feature_names = cols)
plt.savefig('IrisDecisionTree_Example.svg', bbox_inches='tight')
plt.show()

Let's look at the classification regions (and perhaps see how they change when we change the max depth)

In [8]:
# Plot the classification regions along with the test data

x_range = np.linspace(X1[:,0].min(), X1[:,0].max(), 60)
y_range = np.linspace(X1[:,1].min(), X1[:,1].max(), 60)

# make_grid (from intro_Data_4_3) generates a grid of points covering the two ranges
points = make_grid(x_range, y_range)

# Predict the class at each grid point
y_regions = dtc.predict(points)

# Need some colors for our plot
front_colors = ['red', 'blue', 'yellow']
back_colors = ['magenta', 'cyan', 'orange']

# plot the classification regions
fig, ax = plot_groups(points, y_regions, back_colors, ec='None', s=50, alpha=0.1)
# plot train data
#plot_groups(Xtrain, ytrain, front_colors, ax=ax, s=60, alpha=0.5)
# plot test data
plot_groups(Xtest, ytest, front_colors, ax=ax, s=60, alpha=1)

plt.xlabel(cols[0])
plt.ylabel(cols[1])
plt.legend(y_names, bbox_to_anchor=[1,0.5], loc='center left')
plt.show()
In [9]:
plt.figure()
plt.title("DTC, min_samples_split=5",fontsize=16)
plot_decision_boundaries(Xtrain,ytrain,DTC,min_samples_split = 5)
plt.xlabel("petal width [cm]",fontsize=15)
plt.ylabel("sepal length [cm]",fontsize=15)
for yi in np.unique(ytest):
    idx = ytest == yi
    plt.scatter(Xtest[idx,0], Xtest[idx,1], color=front_colors[yi], alpha=1, s=60, ec='k')
plt.show()

2. DTC with all features and hyperparameter selection¶

In [10]:
# Get train/test scores/accuracy for a range of max_depth values

depth_range = np.arange(2,10)
k = 10

train = []
test = []

for d in depth_range:
    dtc = DTC(max_depth = d, random_state=146)
    tr,te  = do_Kfold(dtc,X,y,k)
    train.append(np.mean(tr))
    test.append(np.mean(te))
In [11]:
# plot accuracy-max_depth curve

plt.plot(depth_range, train, '-xk', label='Training')
plt.plot(depth_range, test, '-xr', label='Testing')
plt.xlabel('Max Depth')
plt.ylabel('Classification Accuracy')
plt.legend()
plt.show()

In practice, you could probably improve performance overall by modifying multiple hyperparameters (we will talk about an easier way to do that later).
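For instance (a rough sketch, reusing do_Kfold from intro_Data_4_3 along with the depth_range and k defined above; the min_samples_split values are arbitrary), you could loop over two hyperparameters at once and keep the best mean test score:

# Sketch: vary max_depth and min_samples_split together
best_score, best_params = -np.inf, None
for d in depth_range:
    for mss in [2, 5, 10]:
        _, te = do_Kfold(DTC(max_depth=d, min_samples_split=mss, random_state=146), X, y, k)
        if np.mean(te) > best_score:
            best_score, best_params = np.mean(te), (d, mss)
print(best_params, best_score)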

In [12]:
# get optimal max_depth value(s)
idx = np.array(test) == np.max(test)
print('Optimal max_depth: ', depth_range[idx])
print('Optimal test score: ', np.max(test))
Optimal max_depth:  [4 5 6 7 8 9]
Optimal test score:  0.9400000000000001
In [13]:
# get mean train/test scores with optimal max_depth using K fold cross-validation
tr,te = do_Kfold(DTC(random_state = 146, max_depth = 4),X,y,10)
print(np.mean(tr), np.mean(te))
0.9903703703703703 0.9400000000000001
In [14]:
# visualize train/test scores with stripplot 
plt.figure(figsize = (6,6))
sns.stripplot(data = pd.DataFrame(zip(tr,te), columns = ['Training', 'Testing']), size=10, palette = 'winter',
             alpha = 0.5, jitter = True)
plt.show()

3. Random Forests¶

Let's take a look with just two predictors so that we can compare the results from using a Random Forest versus a single decision tree.

In [15]:
# Train a Random Forest classifier

from sklearn.ensemble import RandomForestClassifier as RFC

cols = ['petal width (cm)', 'sepal length (cm)']
X1 = X_df[cols]

Xtrain,Xtest,ytrain,ytest = tts(X1.values,y,test_size=0.4, random_state = 146)

rfc = RFC(random_state=146, n_estimators=100, max_depth = 4)
rfc.fit(Xtrain,ytrain)
y_pred = rfc.predict(Xtest)
In [16]:
# Plot the classification regions along with the test data

x_range = np.linspace(X1.values[:,0].min(), X1.values[:,0].max(), 60)
y_range = np.linspace(X1.values[:,1].min(), X1.values[:,1].max(), 60)

points = make_grid(x_range,y_range)

y_regions = rfc.predict(points)

# Need some colors for our plot
front_colors = ['red', 'blue', 'yellow']
back_colors = ['magenta', 'cyan', 'orange']

fig,ax = plot_groups(points,y_regions, back_colors, ec='None',s=50, alpha = 0.2)
#plot_groups(Xtest,ytest,front_colors,ax=ax, s = 60, alpha = 0.9)
plot_groups(Xtest,y_pred,front_colors,ax=ax, s = 60, alpha = 0.9)

plt.xlabel(cols[0])
plt.ylabel(cols[1])
plt.legend(y_names, bbox_to_anchor=[1,0.5], loc='center left')
plt.show()

What's different about the classification regions?

In [17]:
# Get the confusion matrix
conf_matrix, accuracy = compare_classes(ytest, y_pred, y_names)
conf_matrix
Test accuracy = 0.93
Out[17]:
Predicted setosa versicolor virginica
Actual
setosa 15 0 0
versicolor 0 24 2
virginica 0 2 17
In [18]:
# An alternative way using the confusion_matrix function from sklearn
from sklearn.metrics import confusion_matrix as CM
conf_matrix2 = CM(ytest, y_pred)
conf_matrix2
Out[18]:
array([[15,  0,  0],
       [ 0, 24,  2],
       [ 0,  2, 17]])

The optimal hyperparameters for a decision tree may not be optimal for the Random Forest. Let's use all of the data and try to optimize max_depth for 100 trees.

In [19]:
# Find optimal max_depth for RFC

depth_range = np.arange(2,10)
k = 10

train = []
test = []

for d in depth_range:
    rfc = RFC(n_estimators = 100, max_depth = d, random_state=146)
    tr,te  = do_Kfold(rfc,X,y,k)
    train.append(np.mean(tr))
    test.append(np.mean(te))
In [20]:
# Plot the accuracy-max_depth curve

plt.plot(depth_range, train, '-xk', label='Training')
plt.plot(depth_range, test, '-xr', label='Testing')
plt.xlabel('Max Depth')
plt.ylabel('Classification Accuracy')
plt.legend()
plt.show()

Let's compare DTC with RFC on a single train/test split using all predictors:

In [21]:
Xtrain,Xtest,ytrain,ytest = tts(X,y,test_size=0.4, random_state = 146)
In [22]:
dtc = DTC(random_state = 146, max_depth = 4)
dtc.fit(Xtrain, ytrain)
print("Train accuracy is: ", dtc.score(Xtrain,ytrain))
conf_matrix, accuracy = compare_classes(ytest, dtc.predict(Xtest), y_names)
conf_matrix
Train accuracy is:  1.0
Test accuracy = 0.93
Out[22]:
Predicted setosa versicolor virginica
Actual
setosa 15 0 0
versicolor 0 24 2
virginica 0 2 17
In [23]:
rfc = RFC(random_state = 146, n_estimators = 100, max_depth = 5)
rfc.fit(Xtrain, ytrain)
print("Train accuracy is: ", rfc.score(Xtrain,ytrain))
conf_matrix, accuracy = compare_classes(ytest, rfc.predict(Xtest), y_names)
conf_matrix
Train accuracy is:  1.0
Test accuracy = 0.93
Out[23]:
Predicted setosa versicolor virginica
Actual
setosa 15 0 0
versicolor 0 24 2
virginica 0 2 17

4. Grid search¶

Given a set of hyperparameters, how would you decide which combination is best?
Grid search is a basic way to do this. The key idea is to define a grid of hyperparameter values, use cross-validation to evaluate each combination, and then pick the best combination according to the validation scores (accuracy, in the context of classification).

For other methods of hyperparameter tuning (e.g., randomized search), see the scikit-learn documentation.
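To see what a grid search is really doing under the hood, here is a minimal hand-rolled equivalent (a sketch with illustrative parameter values, using cross_val_score from scikit-learn and the RFC alias imported above):

from itertools import product
from sklearn.model_selection import cross_val_score

# Sketch: evaluate every combination on a tiny grid with 10-fold CV, keep the best
scores = {}
for n_est, depth in product([1, 100], [2, 4]):
    model = RFC(n_estimators=n_est, max_depth=depth, random_state=146)
    scores[(n_est, depth)] = cross_val_score(model, X, y, cv=10, scoring='accuracy').mean()
best = max(scores, key=scores.get)
print(best, scores[best])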

Grid search for RFC¶

In [ ]:
# Grid search for RFC

from sklearn.model_selection import KFold, GridSearchCV

# create a grid of parameter values
param_grid_rfc = dict(n_estimators=[1,100,500],max_depth = [2,3,4,5],
                 min_samples_split = [2,3,4,5]) 
# specify how you want to do the cross-validation
cv = KFold(n_splits=10, random_state=146, shuffle = True)
grid_rfc = GridSearchCV(RFC(random_state = 146), param_grid=param_grid_rfc, cv=cv, 
                    scoring='accuracy')

# perform grid search (what is this really doing?)
grid_rfc.fit(X, y)
In [ ]:
print(
    f'The best parameters are {grid_rfc.best_params_} with a score of {grid_rfc.best_score_:.2f}'
)
In [ ]:
results = pd.DataFrame(grid_rfc.cv_results_)[['param_n_estimators','param_max_depth',
                                'param_min_samples_split','mean_test_score','rank_test_score']]
In [ ]:
results.head()
In [ ]:
results[results['rank_test_score'] == 1]

What about DTC?¶

In [ ]:
# Grid search for DTC
param_grid_dtc = dict(max_depth = [2,3,4,5],
                 min_samples_split = [2,3,4,5]) #params to test
cv = KFold(n_splits=10, random_state=146, shuffle = True)
grid_dtc = GridSearchCV(DTC(random_state = 146), param_grid=param_grid_dtc, cv=cv, 
                    scoring='accuracy')
grid_dtc.fit(X, y)
In [ ]:
print(
    f'The best parameters are {grid_dtc.best_params_} with a score of {grid_dtc.best_score_:.2f}'
)
In [ ]:
results = pd.DataFrame(grid_dtc.cv_results_)[['param_max_depth',
                                'param_min_samples_split','mean_test_score','rank_test_score']]
In [ ]:
results.head()
In [ ]:
results[results['rank_test_score'] == 1]