import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from intro_Data_4_1 import *
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.preprocessing import StandardScaler as SS
from sklearn.model_selection import train_test_split as tts
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.datasets import make_moons, make_blobs as mb, load_breast_cancer as lbc, load_iris as li
Our topic in this module is classification. Classification, like all of the methods we've seen so far, is a form of supervised learning: during training, the model is provided with the correct answers. In the case of classification, however, the targets we are trying to predict are labels (or classes), rather than a continuous range of outputs as in regression. These labels will often be nominal or ordinal variables (in the case of an ordinal target, a regression approach might also be suitable).
The first technique we'll discuss is called $k$-nearest neighbors (KNN), which classifies a point according to the labels of its $k$ nearest neighbors in the training data ($k$ is a hyperparameter).
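To make the idea concrete, here is a minimal sketch of the nearest-neighbor rule written directly in NumPy. The function knn_predict is made up for illustration only; in everything below we use scikit-learn's implementation instead.
# A bare-bones version of the KNN classification rule (illustration only)
def knn_predict(X_train, y_train, x_new, k=5):
    # Euclidean distance from the new point to every training point
    dists = np.sqrt(((X_train - x_new)**2).sum(axis=1))
    # indices of the k closest training points
    nearest = np.argsort(dists)[:k]
    # majority vote among the neighbors' labels
    labels, counts = np.unique(y_train[nearest], return_counts=True)
    return labels[np.argmax(counts)]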
We'll first try KNN using data generated with the make_blobs function from sklearn:
# Create (isotropic Gaussian) blobs using the make_blobs function from sklearn
# How many blobs, total points, and dimensions (features)
n_groups = 3
n_points = 100
n_feats = 2
np.random.seed(146)
data = mb(n_samples = n_points, n_features=n_feats, centers = n_groups)
X = data[0]
y = data[1]
#print(np.shape(X))
#print(X)
#print(np.unique(y))
print(np.shape(X))
y
(100, 2)
array([2, 1, 1, 0, 0, 2, 0, 2, 0, 1, 2, 0, 2, 1, 0, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 0, 0, 1, 0, 1, 2, 2, 0, 1, 1, 2, 2, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 2, 2, 0, 0, 0, 2, 0, 1, 0, 0, 2, 2, 2, 2, 0, 2, 2, 1, 0, 1, 1, 0, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 0, 1, 0, 0, 0, 0, 2, 0, 1, 1, 1, 0, 1, 2, 2, 2, 0, 1, 2, 0])
# Make a scatterplot of this data, color each group separately
# get the list of colors
colors = get_colors(n_groups)
#print(colors)
plt.figure(figsize = [6,6])
for yi in np.unique(y):
    # Plot the rows of data with the current value of yi
    idx = (y==yi)
    plt.scatter(X[idx,0], X[idx,1],color=colors[yi][0:5], ec='k',s=100, label = 'Group ' + str(yi))
# Create a new data point
random_point = np.random.random(size=n_feats)
#random_point[0] = 50
# Add this point to the plot
plt.scatter(random_point[0], random_point[1], color='grey', s=250, ec='r')
plt.legend()
plt.title('How should the gray point be classified?')
plt.axis('equal')
plt.show()
# KNN fit (what does this really do? for KNN, fitting mostly just stores the training data in a fast neighbor-lookup structure)
knn = KNN(n_neighbors=5)
knn.fit(X,y)
KNeighborsClassifier()
# KNN predict label
knn.predict(random_point.reshape(1,-1))
array([0])
# predict the probability of each class (with uniform weights, the fraction of the k neighbors belonging to each class)
knn.predict_proba(random_point.reshape(1,-1))
array([[1., 0., 0.]])
# get the neighbors of the new point;
# first array: distances to the k nearest training points; second array: their row indices in X
knn.kneighbors([random_point])
(array([[0.60837488, 0.68834077, 0.78450033, 1.52789423, 1.71988257]]), array([[91, 80, 51, 28, 96]]))
# get only indices of k-neighbors
neighbors = knn.kneighbors([random_point], return_distance = False)
neighbors
array([[91, 80, 51, 28, 96]])
# Make a scatterplot of the train/test data with colors, and highlight the neighbors of the test data
# scatterplot of the train data with colors
fig, ax = plot_groups(X,y,colors,s = 60)
# plot neighbors of the test data with larger sizes
plot_groups(X[neighbors],y[neighbors],colors, s = 200, ax = ax)
# add the test data
ax.scatter(random_point[0], random_point[1], color='grey', s=150, ec='k')
plt.axis('equal')
plt.show()
Example using the breast cancer data
# Load data, split into features/target
data = lbc()
X = data.data
X_names = data.feature_names
y = data.target
y_names = data.target_names
X.shape
(569, 30)
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])
# Make a stripplot of the raw features (why do we do this? to see that the features are on very different scales)
import seaborn as sns
plt.figure(figsize = [8,8])
sns.stripplot(data = pd.DataFrame(X,columns = X_names), orient = 'h')
plt.show()
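The strip plot shows that the features live on very different numeric scales, which is why we standardize them before using KNN. As a quick numeric check (just a sketch using pandas directly):
# smallest and largest value of each feature
pd.DataFrame(X, columns=X_names).describe().loc[['min','max']]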
# Find the optimal K
# range of K
neighbor_range = np.array(range(2,20))
print(neighbor_range)
train=[]
test=[]
# k is the number of folds for cross-validation (not the K in KNN)
k = 10
# Get train/test scores using K-fold cross validation
# Looks familiar?
# What really is the score?
for n_neighbors in neighbor_range:
    knn = KNN(n_neighbors=n_neighbors)
    # be sure to include SS (why? KNN is distance-based, so the features need to be on comparable scales)
    tr,te = do_Kfold(knn,X,y,k, SS())
    train.append(np.mean(tr))
    test.append(np.mean(te))
[ 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19]
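The do_Kfold helper comes from the course module intro_Data_4_1. As a rough sketch of what it is doing, here is an approximately equivalent version using scikit-learn's Pipeline and cross_validate (details such as shuffling and the exact fold splits may differ from the helper):
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
# scale inside each fold, then fit KNN; the pipeline keeps the scaler from seeing the test fold
pipe = Pipeline([('scale', SS()), ('knn', KNN(n_neighbors=5))])
cv_results = cross_validate(pipe, X, y, cv=k, return_train_score=True)
print(cv_results['train_score'].mean(), cv_results['test_score'].mean())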
# Plot the score-K curve
plt.figure(figsize=(6,6))
plt.plot(neighbor_range, train, ':xk', label='Training')
plt.plot(neighbor_range, test, ':xr', label='Testing')
plt.ylabel('Mean accuracy', fontsize=14)
plt.xlabel('$k$',fontsize=14)
plt.xticks(neighbor_range)
plt.legend()
plt.show()
# obtain the index of the best K, and use it to get the optimal K
idx = test == np.max(test)
neighbor_range[idx]
array([ 9, 10])
Why two values? Because $k=9$ and $k=10$ tie for the highest mean test accuracy.
# Choose either of the two
neighbors = 9
knn = KNN(n_neighbors=neighbors)
# Reminder that k is the number of folds
k = 10
tr,te = do_Kfold(knn,X,y,k, SS())
# Visualize the train/test scores with a stripplot
plt.figure(figsize = (6,6))
sns.stripplot(data = pd.DataFrame(zip(tr,te), columns = ['Training', 'Testing']), size=10, palette = 'winter',
alpha = 0.5, jitter = True)
plt.show()
Can we visualize which points are being misclassified?
# Do a train/test split; take 20% of data as test
Xtrain,Xtest,ytrain,ytest = tts(X,y,test_size=0.2,random_state=146)
# re-scale
ss = SS()
Xtrain = ss.fit_transform(Xtrain)
Xtest = ss.transform(Xtest)
# refit the KNN model with the optimal number of neighbors; predict labels for the test data
knn = KNN(n_neighbors=neighbors)
knn.fit(Xtrain,ytrain)
y_pred = knn.predict(Xtest)
# get the indices of all test rows where prediction does not equal ground truth
idx_wrong = y_pred != ytest
idx_wrong
array([False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False])
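A quick count of how many test points were misclassified; this should agree with the confusion matrix and accuracy computed below.
# number of misclassified test points (4 of the 114 test points here)
idx_wrong.sum()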
# Project the standardized data onto its first 2 principal components so we can visualize it
pca = PCA(n_components=2)
Xtrain_pca = pca.fit_transform(Xtrain)
Xtest_pca = pca.transform(Xtest)
# Get a list of colors for the labels
colors = get_colors(len(np.unique(y)))
# Show the training data
fig,ax = plot_groups(Xtrain_pca, ytrain, colors, alpha = 0.1)
# Show the testing data
plot_groups(Xtest_pca, y_pred, colors, ax=ax, alpha=1, s=50)
# Highlight test points that were incorrectly labeled
plot_groups(Xtest_pca[idx_wrong], y_pred[idx_wrong], colors, ax=ax, alpha=1, s=200, legend_text = ['malignant','benign'])
ax.set_xlabel('PC1',fontsize=14)
ax.set_ylabel('PC2',fontsize=14)
#ax.get_legend().remove()
plt.show()
We might also be interested in checking the accuracy in the individual classes, e.g. comparing things like false positives and false negatives. To do this, we can look at a confusion matrix.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest,y_pred)
cm
array([[41,  3],
       [ 1, 69]])
How do we calculate accuracy using this confusion matrix?
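Accuracy is the number of correct predictions, which sit on the diagonal of the confusion matrix, divided by the total number of test points:
# (41 + 69) / (41 + 3 + 1 + 69)
np.trace(cm) / np.sum(cm)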
# We could also do it more conveniently
accuracy = knn.score(Xtest, ytest)
accuracy
0.9649122807017544
In all of the previous examples, we considered each of a data point's $k$ nearest neighbors equally. However, what if some of those neighbors are farther away than others? Should they all count equally when classifying the point?
One way to modify this algorithm is to give each data point a weight, based on the inverse of its distance to the point we are trying to classify. For example, if one point was 2 units away, then we would give it a weight of 1/2, and if another point was 100 units away, we would give it a much smaller weight of 1/100.
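Here is a minimal sketch of that weighted vote, again in plain NumPy for illustration; the function name is made up, and scikit-learn's weights='distance' option (used below) does this for us.
def weighted_knn_predict(X_train, y_train, x_new, k=5):
    # distances to every training point, and the k closest ones
    dists = np.sqrt(((X_train - x_new)**2).sum(axis=1))
    nearest = np.argsort(dists)[:k]
    # each neighbor votes with weight 1/distance
    weights = 1.0 / dists[nearest]
    votes = {}
    for label, w in zip(y_train[nearest], weights):
        votes[label] = votes.get(label, 0.0) + w
    # return the label with the largest total weight
    return max(votes, key=votes.get)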
# Get another example of blobs data using the make_blobs function
# How many blobs, total points, and dimensions (features)
n_groups = 3
n_points = 100
n_feats = 2
# a different seed
np.random.seed(32)
data = mb(n_samples = n_points, n_features=n_feats, centers = n_groups)
X = data[0]
y = data[1]
colors = get_colors(n_groups)
# Make a plot of this data, color each group separately
fig, ax = plot_groups(X, y, colors,alpha=0.9,s=50, legend_text = ['Group 0','Group 1', 'Group 2'])
# Create a new data point
random_point = [2.67,7]
# Add this point to the plot
ax.scatter(random_point[0], random_point[1], color='grey', s=150, ec='k')
#plt.legend()
plt.title('How should the gray point be classified?')
plt.axis('equal')
plt.show()
# KNN without weights
knn = KNN(n_neighbors=5)
knn.fit(X,y)
knn.predict(np.array(random_point).reshape(1,-1))
array([2])
knn.predict_proba(np.array(random_point).reshape(1,-1))
array([[0. , 0.4, 0.6]])
# KNN with weights defined on distance
knn = KNN(n_neighbors=5, weights='distance')
knn.fit(X,y)
knn.predict(np.array(random_point).reshape(1,-1))
array([1])
knn.predict_proba(np.array(random_point).reshape(1,-1))
array([[0. , 0.51046418, 0.48953582]])
#https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
weights : {'uniform', 'distance'}, callable or None, default='uniform'. Weight function used in prediction. Possible values:
'uniform' : uniform weights. All points in each neighborhood are weighted equally.
'distance' : weight points by the inverse of their distance. In this case, closer neighbors of a query point will have a greater influence than neighbors which are further away.
[callable] : a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights.
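As a quick sketch of the callable option (the Gaussian kernel below is just an illustrative choice, not something used earlier in this module):
# a user-defined weight function: it receives an array of distances
# and must return an array of weights with the same shape
def gaussian_weights(distances):
    return np.exp(-distances**2)
knn = KNN(n_neighbors=5, weights=gaussian_weights)
knn.fit(X,y)
knn.predict(np.array(random_point).reshape(1,-1))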