import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from intro_Data_4_2 import *
from sklearn.datasets import load_iris as li
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN
We will start by looking at a simplification of the Iris data: we will consider two classes, Virginica and not Virginica, and a single predictor.
# Load data, split into feature/target
data = li()
X = data.data
X_names = data.feature_names
y = data.target
y_names = data.target_names
X_df = pd.DataFrame(X,columns=X_names)
X_df.head()
| | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) |
|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 |
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
y_names
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')
# Make this a binary problem (virginica vs not virginica)
y_bin = np.array([1 if value == 2 else 0 for value in y])
y_bin_names = ['Not_virginica', 'Virginica']
y_bin
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
# Use one predictor (petal width)
col = 'petal width (cm)'
X = X_df[col]
X
0      0.2
1      0.2
2      0.2
3      0.2
4      0.2
      ...
145    2.3
146    1.9
147    2.0
148    2.3
149    1.8
Name: petal width (cm), Length: 150, dtype: float64
# make a scatterplot
plt.scatter(X,y_bin,alpha=0.5, ec='k')
plt.xlabel(col)
plt.yticks([0,1], [y_bin_names[i] + ' (y=' + str(i) + ')' for i in [0,1]])
plt.show()
#Create and fit model
log_reg = LR()
log_reg.fit(X.values.reshape(-1,1), y_bin)
LogisticRegression()
#Reminder of hard vs soft classifications
log_reg.predict(X[75:80].values.reshape(-1,1))
array([0, 0, 1, 0, 0])
log_reg.predict_proba(X[75:80].values.reshape(-1,1))
array([[0.75554163, 0.24445837],
       [0.75554163, 0.24445837],
       [0.45722097, 0.54277903],
       [0.66709636, 0.33290364],
       [0.94591643, 0.05408357]])
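For a binary model, the hard labels from `predict` are simply the soft probabilities thresholded at 0.5, i.e. the class with the larger probability wins:
# Hard labels correspond to thresholding P(y=1) at 0.5 (the larger probability wins)
proba = log_reg.predict_proba(X[75:80].values.reshape(-1,1))
print((proba[:,1] > 0.5).astype(int))    # same labels as log_reg.predict above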
#Apply the fitted model to new (synthetic) data: petal widths from 0 to 3 (some code from the Machine Learning book)
X_new = np.linspace(0,3,1000)
y_proba = log_reg.predict_proba(X_new.reshape(-1,1))
# Visualize the probability-feature curve
plt.figure(figsize = [12,4])
plt.plot(X_new, y_proba[:,1], 'r-', label = 'Virginica')
plt.plot(X_new, y_proba[:,0], 'b--', label = 'Not Virginica')
colors = ['blue','red']
plt.scatter(X,y_bin, ec = 'k', c = 'grey', label = 'Original data')
plt.ylabel('Probability', fontsize=14)
plt.xlabel('Petal width (cm)',fontsize=14)
plt.legend()
plt.show()
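The red curve above is just the logistic (sigmoid) function applied to the fitted linear predictor, so we can reconstruct it directly from the model's intercept and coefficient:
# P(y=1|x) = 1 / (1 + exp(-(b0 + b1*x))), using the fitted parameters
b0 = log_reg.intercept_[0]
b1 = log_reg.coef_[0,0]
p_manual = 1/(1 + np.exp(-(b0 + b1*X_new)))
print(np.allclose(p_manual, y_proba[:,1]))    # True: identical to predict_proba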
Note that, by default, sklearn's LogisticRegression applies an L2 penalty (similar to Ridge regression), so we should standardize our features. Standardizing also speeds up convergence of the solver (you may try running it without standardization and see what happens).
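As a sketch of an alternative to scaling by hand (using sklearn's Pipeline utilities, not the approach taken below), the scaler and the regularized model can be chained so they are always applied together:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# penalty='l2' and C=1.0 are sklearn's defaults; smaller C means stronger regularization
pipe = make_pipeline(StandardScaler(), LR(penalty='l2', C=1.0))
# pipe.fit(Xtrain, ytrain) would then standardize and fit in one call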
We will look at a single test-train split.
X_df
| | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) |
|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 |
| ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 |

150 rows × 4 columns
# Logistic regression with all four features
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler as SS
X = np.array(X_df)
Xtrain,Xtest,ytrain,ytest = tts(X,y_bin,test_size=0.4, random_state=146)
# Standardize the data
ss = SS()
Xtrain_s = ss.fit_transform(Xtrain)
Xtest_s = ss.transform(Xtest)
# Fit the model
log_reg.fit(Xtrain_s, ytrain)
# Generate predictions for the test set
y_pred = log_reg.predict(Xtest_s)
# Look at the confusion matrix
#(compare_classes function is imported from intro_Data_4_2.py)
conf_mat, accuracy = compare_classes(ytest, y_pred, y_bin_names)
conf_mat
Accuracy = 0.95
| Predicted | Not_virginica | Virginica |
|---|---|---|
| Actual | | |
| Not_virginica | 38 | 3 |
| Virginica | 0 | 19 |
log_reg.score(Xtest_s, ytest)
0.95
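`compare_classes` comes from the course helper module; a minimal sketch of what such a helper might look like (the actual implementation in intro_Data_4_2 may differ), built on a pandas crosstab:
def compare_classes_sketch(actual, predicted, names=None):
    # Overall accuracy plus a labeled confusion matrix
    accuracy = np.mean(np.array(actual) == np.array(predicted))
    actual = pd.Series(actual, name='Actual')
    predicted = pd.Series(predicted, name='Predicted')
    if names is not None:
        actual = actual.map(dict(enumerate(names)))
        predicted = predicted.map(dict(enumerate(names)))
    conf_mat = pd.crosstab(actual, predicted)
    print('Accuracy =', np.round(accuracy, 2))
    return conf_mat, accuracy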
# Visualize misclassified data points
from sklearn.decomposition import PCA
colors = ['red', 'blue']
pca = PCA(n_components = 2)
Xtrain_pca = pca.fit_transform(Xtrain_s)
Xtest_pca = pca.transform(Xtest_s)
# Show the training data
fig,ax = plot_groups(Xtrain_pca, ytrain, colors, alpha = 0.2)
# Show the testing data
plot_groups(Xtest_pca, y_pred, colors, ax=ax, alpha = 1, s = 50)
# Highlight test points that were incorrectly labeled
idx_wrong = ytest != y_pred
plot_groups(Xtest_pca[idx_wrong], y_pred[idx_wrong], colors, ax=ax, alpha = 1, s = 250,
legend_text = ['Not virginica', 'Virginica'], labels = ['PC1','PC2'])
print(idx_wrong)
plt.show()
[False False False False False False False False False False False False False False False True True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False True False False False False False False]
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
# Logistic regression with all features and multiple classes
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler as SS
X = np.array(X_df)
Xtrain,Xtest,ytrain,ytest = tts(X,y,test_size=0.4, random_state=146)
# Standardize the data
ss = SS()
Xtrain_s = ss.fit_transform(Xtrain)
Xtest_s = ss.transform(Xtest)
# Fit the model
log_reg.fit(Xtrain_s, ytrain)
# Generate predictions for the test set
y_pred = log_reg.predict(Xtest_s)
# Look at the confusion matrix
conf_mat, accuracy = compare_classes(ytest, y_pred, y_names)
conf_mat
Accuracy = 0.95
| Predicted | setosa | versicolor | virginica |
|---|---|---|---|
| Actual | | | |
| setosa | 15 | 0 | 0 |
| versicolor | 0 | 23 | 3 |
| virginica | 0 | 0 | 19 |
log_reg.score(Xtest_s, ytest)
0.95
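With three classes, `predict_proba` returns one probability per class for each sample (in recent scikit-learn versions the default is a multinomial, i.e. softmax, model with the lbfgs solver), and each row sums to 1:
# Each row of predict_proba is a probability distribution over the three classes
proba_multi = log_reg.predict_proba(Xtest_s[:5])
print(proba_multi.shape)               # (5, 3)
print(proba_multi.sum(axis=1))         # each row sums to 1
print(log_reg.predict(Xtest_s[:5]))    # hard labels: the class with the largest probability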
# Visualize misclassified data points
from sklearn.decomposition import PCA
# now there are three classes
colors = ['red', 'blue', 'green']
pca = PCA(n_components = 2)
Xtrain_pca = pca.fit_transform(Xtrain_s)
Xtest_pca = pca.transform(Xtest_s)
fig,ax = plot_groups(Xtrain_pca, ytrain, colors, alpha = 0.2)
plot_groups(Xtest_pca, y_pred, colors, ax=ax, alpha = 1, s = 50)
idx_wrong = ytest != y_pred
plot_groups(Xtest_pca[idx_wrong], y_pred[idx_wrong], colors, ax=ax, alpha = 1, s = 150,
legend_text = list(y_names), labels = ['PC1','PC2'])
plt.show()
plt.figure()
plt.title("LR decision boundary",fontsize=16)
plot_decision_boundaries(Xtrain_pca,ytrain,LR)
plt.xlabel("PCA-1",fontsize=15)
plt.ylabel("PCA-2",fontsize=15)
plt.show()
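`plot_decision_boundaries` is another helper from intro_Data_4_2; a rough sketch of the usual approach (an assumption, not the actual helper) is to fit the given model class on the 2-D data and color a dense grid by its predictions:
def plot_decision_boundaries_sketch(X2d, y, model_class, **model_kwargs):
    # Fit the model on the 2-D (e.g. PCA-projected) data
    model = model_class(**model_kwargs).fit(X2d, y)
    # Predict over a dense grid covering the data range
    x_min, x_max = X2d[:,0].min() - 1, X2d[:,0].max() + 1
    y_min, y_max = X2d[:,1].min() - 1, X2d[:,1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300),
                         np.linspace(y_min, y_max, 300))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    # Shade the predicted regions and overlay the data points
    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.scatter(X2d[:,0], X2d[:,1], c=y, ec='k')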
#Step 1 - get best cross-validated kNN model
neighbor_range = np.array(range(2,20))
train=[]
test=[]
k = 10
ss = SS()
X_ss = ss.fit_transform(X)
for n_neighbors in neighbor_range:
    knn = KNN(n_neighbors=n_neighbors, weights='distance')
    tr, te = do_Kfold(knn, X_ss, y, k)
    train.append(np.mean(tr))
    test.append(np.mean(te))
# All neighbor values that tie for the best mean test score
idx = np.array(test) == np.max(test)
print('Optimal #neighbors: ', neighbor_range[idx])
print('Optimal test score: ', np.max(test))
Optimal #neighbors:  [ 8 10 11 12 13 14 15]
Optimal test score:  0.9666666666666666
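`do_Kfold` also comes from the helper module; a minimal sketch of what it presumably does (k-fold split, fit, and score on each fold), with the random_state default and optional scaler as assumptions:
from sklearn.model_selection import KFold

def do_Kfold_sketch(model, X, y, k, scaler=None, random_state=146):
    # Returns the train and test scores for each of the k folds
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    train_scores, test_scores = [], []
    for idx_train, idx_test in kf.split(X):
        Xtrain, Xtest = X[idx_train], X[idx_test]
        ytrain, ytest = y[idx_train], y[idx_test]
        if scaler is not None:
            Xtrain = scaler.fit_transform(Xtrain)
            Xtest = scaler.transform(Xtest)
        model.fit(Xtrain, ytrain)
        train_scores.append(model.score(Xtrain, ytrain))
        test_scores.append(model.score(Xtest, ytest))
    return train_scores, test_scores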
#Step 2 - let's look at cross-val results for just one optimal n_neighbors value
knn = KNN(n_neighbors = 8, weights = 'distance')
tr_knn, te_knn = do_Kfold(knn, X_ss,y,10)
#Step 3 - cross-validation on LR
tr_lr, te_lr = do_Kfold(LR(),X_ss,y,10)
Which would you pick?
np.mean(te_knn)
0.9666666666666666
np.mean(te_lr)
0.9533333333333334
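The means are close, so it may also be worth looking at the spread of the fold scores before choosing (a quick check, not a formal test):
# Compare the mean and standard deviation of the k-fold test scores
print('kNN:', np.round(np.mean(te_knn), 4), '+/-', np.round(np.std(te_knn), 4))
print('LR: ', np.round(np.mean(te_lr), 4), '+/-', np.round(np.std(te_lr), 4))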
X = np.array(X_df)
Xtrain,Xtest,ytrain,ytest = tts(X,y,test_size=0.4, random_state=146)
# Standardize the data
ss = SS()
Xtrain_s = ss.fit_transform(Xtrain)
Xtest_s = ss.transform(Xtest)
knn = KNN(n_neighbors=8, weights = 'distance')
knn.fit(Xtrain_s,ytrain)
plt.figure()
plt.title("KNN decision boundary with neighbors: 8",fontsize=16)
plot_decision_boundaries(Xtrain_pca,ytrain,KNN,n_neighbors=8)
plt.xlabel("PCA-1",fontsize=15)
plt.ylabel("PCA-2",fontsize=15)
plt.show()
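Having refit the kNN on the training split, we can also check its accuracy on the held-out test set (the exact value will depend on the split):
# Held-out accuracy of the final kNN model
print(knn.score(Xtest_s, ytest))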