import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from intro_Data_5 import *
Characterizing environmental microbial communities often involves large-scale DNA sequencing. One task in the analysis pipeline is binning sequences or contigs (longer sequences assembled from shorter ones) prior to further analysis. Binning groups sequences that likely come from the same organism so they can be analyzed separately, and it can also reveal contaminants (like human DNA!). We may not know what the sequences are, i.e. they have no labels, but we can group them based on the patterns of letters they contain. In this example, we look at 4-letter patterns, otherwise known as tetranucleotides.
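Each row of the table below is one contig described by its 256 tetranucleotide frequencies. As a purely illustrative sketch (not the pipeline that actually produced `data/DNA_data.csv`), such frequencies can be computed from a sequence with a sliding window:
from itertools import product
from collections import Counter

def tetranucleotide_freqs(seq):
    # Illustrative sketch: fraction of 4-letter windows matching each of the 256 possible 4-mers.
    kmers = [''.join(p) for p in product('ACGT', repeat=4)]
    counts = Counter(seq[i:i+4] for i in range(len(seq) - 3))
    total = max(sum(counts[k] for k in kmers), 1)   # count only clean A/C/G/T windows
    return {k: counts[k] / total for k in kmers}

tetranucleotide_freqs('ACGTACGTTTGACCAGTACGT')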
data = pd.read_csv('data/DNA_data.csv')
data.head() # what is the dimension of the data?
 | AAAA | AAAC | AAAT | AAAG | AACA | AACC | AACT | AACG | AATA | AATC | ... | GGCT | GGCG | GGTA | GGTC | GGTT | GGTG | GGGA | GGGC | GGGT | GGGG |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.003590 | 0.003675 | 0.004523 | 0.002541 | 0.005437 | 0.002962 | 0.003883 | 0.004101 | 0.005245 | 0.005914 | ... | 0.002811 | 0.003309 | 0.002535 | 0.003914 | 0.002962 | 0.004354 | 0.002910 | 0.001840 | 0.002707 | 0.001785 |
1 | 0.003195 | 0.003714 | 0.003834 | 0.002509 | 0.005379 | 0.003127 | 0.003652 | 0.004227 | 0.004291 | 0.006100 | ... | 0.002825 | 0.003506 | 0.002371 | 0.004111 | 0.003127 | 0.004668 | 0.003233 | 0.001972 | 0.002921 | 0.001995 |
2 | 0.003269 | 0.004386 | 0.005131 | 0.002317 | 0.006207 | 0.003393 | 0.003807 | 0.003434 | 0.006910 | 0.007365 | ... | 0.001945 | 0.002441 | 0.002317 | 0.003393 | 0.003393 | 0.004345 | 0.002897 | 0.001200 | 0.002938 | 0.002441 |
3 | 0.003142 | 0.004164 | 0.003300 | 0.001964 | 0.006442 | 0.002907 | 0.004085 | 0.004557 | 0.005264 | 0.006285 | ... | 0.002121 | 0.003614 | 0.001493 | 0.004399 | 0.002907 | 0.003849 | 0.002514 | 0.002043 | 0.002593 | 0.002121 |
4 | 0.004763 | 0.003297 | 0.006137 | 0.002473 | 0.005404 | 0.003114 | 0.003297 | 0.003252 | 0.007007 | 0.006183 | ... | 0.002473 | 0.002977 | 0.002107 | 0.003114 | 0.003114 | 0.004488 | 0.002244 | 0.002107 | 0.002610 | 0.002244 |
5 rows × 256 columns
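The comment on the head() call above asks about the dimension of the data; .shape answers it directly:
data.shape   # (number of contigs, 256) -- one row per contig, one column per tetranucleotide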
from sklearn.preprocessing import StandardScaler as SS
X = SS().fit_transform(data.values)
X[:5,]
array([[-0.25025671, -0.02105438, 0.00543117, ..., -0.45244946, -0.36103047, -0.43115167], [-0.31300997, -0.00417599, -0.20408516, ..., -0.43093757, -0.26991905, -0.40852686], [-0.30133192, 0.29095164, 0.19010173, ..., -0.55657083, -0.26265326, -0.36064623], [-0.32144882, 0.1933219 , -0.36636496, ..., -0.41942503, -0.40989057, -0.39502326], [-0.06371466, -0.18690822, 0.49576222, ..., -0.40898695, -0.40222468, -0.38182204]])
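As a quick sanity check (not part of the original analysis), each column of X should now have mean ≈ 0 and standard deviation ≈ 1, assuming no column is constant:
print(np.allclose(X.mean(axis=0), 0))   # column means should be ~0 after standardization
print(np.allclose(X.std(axis=0), 1))    # column standard deviations should be ~1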
from sklearn.decomposition import PCA
pca = PCA()
pca_props = pca.fit_transform(X)
plt.figure(figsize= [8,8])
plt.scatter(pca_props[:,0], pca_props[:,1], alpha = 0.5, ec = 'k')
plt.xlabel('$PC_1$', fontsize = 12)
plt.ylabel('$PC_2$', fontsize = 12)
plt.show()
pca.explained_variance_ratio_[:25]
array([0.32331585, 0.12804683, 0.05403234, 0.02978684, 0.0272901 , 0.02565754, 0.02115854, 0.01913828, 0.01638842, 0.01430024, 0.01374585, 0.01369477, 0.01253893, 0.01193362, 0.01156239, 0.01066115, 0.00983257, 0.00940089, 0.00897706, 0.00894302, 0.00806395, 0.00750297, 0.00708545, 0.00695983, 0.00665264])
np.cumsum(pca.explained_variance_ratio_[:24])
array([0.32331585, 0.45136269, 0.50539503, 0.53518187, 0.56247197, 0.58812951, 0.60928805, 0.62842633, 0.64481475, 0.65911499, 0.67286085, 0.68655562, 0.69909455, 0.71102817, 0.72259057, 0.73325171, 0.74308428, 0.75248518, 0.76146223, 0.77040525, 0.7784692 , 0.78597217, 0.79305762, 0.80001746])
evr = pca.explained_variance_ratio_[:20]
plt.figure(figsize=(8,6))
plt.bar(range(1, len(evr)+1), evr)
plt.xticks(range(1, len(evr)+1))
plt.grid(axis = 'y', linestyle = '--')
plt.xlabel('Principal components', fontsize=14)
plt.ylabel('Explained variance ratio', fontsize=14)
plt.show()
How many PCs would we need to keep to maintain at least 75% of the explained variance?
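One way to answer this programmatically, as a small check using the pca object fit above:
cum_var = np.cumsum(pca.explained_variance_ratio_)
n_pcs_75 = np.argmax(cum_var >= 0.75) + 1   # first position where the cumulative sum reaches 75%
print(n_pcs_75)                             # 18, per the cumulative sums printed above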
We will use these PCs as input to tSNE. Why?
from sklearn.manifold import TSNE
tsne = TSNE(random_state = 146)
tsne_props = tsne.fit_transform(pca_props[:,:24])
plt.figure(figsize = [8,8])
plt.scatter(tsne_props[:,0], tsne_props[:,1], alpha = 0.5, ec = 'k')
plt.xticks([])
plt.yticks([])
plt.show()
QUESTION
How does the tSNE plot differ from the PCA?
QUESTION
Let's use the first 18 PCs for clustering. Why might we just use 18 PCs for clustering instead of the original data?
from sklearn.cluster import KMeans
X = pca_props[:,:18] # keep the first 18 PCs, as discussed above
k_range = np.arange(1,11)
scores = []
for k in k_range:
    km = KMeans(n_clusters = k, random_state = 146, n_init = 10)
    km.fit(X)
    scores.append(km.score(X))  # score() returns the negative within-cluster sum of squares
plt.plot(k_range, scores, '-xk')
plt.xlabel('k')
plt.xticks(k_range)
plt.ylabel('-Within Cluster Variance')
plt.ylim(-600000,0)
plt.show()
# To estimate the uncertainty in the chosen k (sketched below):
# 1. repeat the experiment N times (e.g., by changing random_state), giving a choice K_i each time
# 2. average the K_i
# 3. the uncertainty is then std(K_i) / sqrt(N_experiments)
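A minimal sketch of that idea, applied here to the within-cluster scores across repeated runs (picking the elbow k in each run would additionally require an elbow criterion; the seeds and N below are arbitrary):
n_experiments = 10
all_scores = []
for seed in range(n_experiments):   # repeat the experiment with different random states
    run = [KMeans(n_clusters=k, random_state=seed, n_init=10).fit(X).score(X) for k in k_range]
    all_scores.append(run)
all_scores = np.array(all_scores)                              # shape: (n_experiments, len(k_range))
mean_scores = all_scores.mean(axis=0)                          # average curve over the repeats
sem_scores = all_scores.std(axis=0) / np.sqrt(n_experiments)   # uncertainty, std / sqrt(N)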
QUESTION
Do the results from this plot agree with what you found for PCA/tSNE? Where is the 'elbow' of this curve?
km = KMeans(n_clusters = 3, random_state = 146, n_init = 10)
groups = km.fit_predict(X)
print(groups)
[0 0 0 ... 2 2 2]
plot_groups(pca_props[:,:2],groups,colors = get_colors(3), labels = ['PC1','PC2'])
plt.show()
plot_groups(tsne_props,groups,colors = get_colors(3), labels = ['TSNE1','TSNE2'])
plt.show()
QUESTION
You should see something that looks strange to you when we plot the cluster membership on the tSNE plot. Why might this be occurring? What could it indicate about the original sequences in the data?
Now we will use the clustering information as a target to build a classifier for new sequences.
Let's try both a kNN classifier and a Random Forest.
# Grid search for KNN; Question: what did we do before?
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier as KNN
param_grid = dict(n_neighbors = list(range(2,10)), weights = ['distance','uniform'])
cv = KFold(n_splits = 5,random_state = 146, shuffle = True)
grid = GridSearchCV(KNN(), param_grid = param_grid, cv = cv, scoring='accuracy')
grid.fit(X,groups)
# extract specific columns
results = pd.DataFrame(grid.cv_results_)[['params','mean_test_score','rank_test_score']]
results
QUESTION
Take a look at the results of the GridSearch. What do you notice in terms of hyperparameters - which tend to work best? Overall, how different are they from each other?
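The winning combination can also be read directly from the fitted grid object (standard GridSearchCV attributes):
print(grid.best_params_)   # hyperparameter setting with the highest mean CV accuracy
print(grid.best_score_)    # its mean cross-validated accuracy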
knn = KNN(n_neighbors = 2, weights = 'distance')
knn.fit(X,groups)
knn.score(X,groups) #returns the mean accuracy
# Use the fitted KNN to predict on new data
test_data = pd.read_csv('data/DNA_test.csv')
test_data.head()
# Q: why these steps?
knn_test_groups = knn.predict(pca.transform(SS().fit_transform(test_data.values))[:,:18])
knn_test_groups
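Note that SS().fit_transform above refits the scaler on the test data itself. A more careful variant, shown as a sketch here (the scaler name is ours), reuses the scaler fit on the training data before applying the training-set PCA:
scaler = SS().fit(data.values)                     # fit the scaler on the training data only
test_scaled = scaler.transform(test_data.values)   # apply the same centering/scaling to the test data
knn.predict(pca.transform(test_scaled)[:,:18])     # same prediction step as above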
fig,ax = plot_groups(X[:,:2],groups,colors = get_colors(3), alpha = 0.25)
plot_groups(pca.transform(SS().fit_transform(test_data.values))[:,:2], knn_test_groups, colors = get_colors(3),
s = 100, ec = 'k', ax = ax, alpha = 1, labels = ['PC1','PC2'])
plt.show()
# Grid search for RFC
from sklearn.ensemble import RandomForestClassifier as RFC
param_grid = dict(max_depth = [4, 6, 8], min_samples_split = [5, 10, 15])
cv = KFold(n_splits = 5,random_state = 146, shuffle = True)
grid = GridSearchCV(RFC(random_state = 146), param_grid = param_grid, cv = cv, scoring='accuracy')
grid.fit(X, groups)
results = pd.DataFrame(grid.cv_results_)[['param_max_depth','param_min_samples_split','mean_test_score','rank_test_score']]
# Get the best results
results[results['rank_test_score'] == 1]
#results
QUESTION
What do you make of the GridSearch results? Why might there be so many with the same score? Why might we choose the parameters used to make the classifier below?
rfc = RFC(random_state = 146, n_estimators = 100, max_depth = 8, min_samples_split = 15)
rfc.fit(X, groups)
rfc_test_groups = rfc.predict(pca.transform(SS().fit_transform(test_data.values))[:,:18])
rfc_test_groups
fig,ax = plot_groups(X[:,:2],groups,colors = get_colors(3), alpha = 0.25)
plot_groups(pca.transform(SS().fit_transform(test_data.values))[:,:2], rfc_test_groups, colors = get_colors(3),
s = 100, ec = 'k', ax = ax, alpha = 1, labels = ['PC1','PC2'])
plt.show()
# What does this mean?
conf_matrix, accuracy = compare_classes(knn_test_groups, rfc_test_groups)
conf_matrix
help(compare_classes)
QUESTION
What is the confusion matrix ACTUALLY showing? How would you re-label the rows and columns so they are more accurate?
#This is the actual full data set
#Let's cluster it and then compare with known classes
full_data = pd.read_csv('data/DNA_full_data.csv')
full_data.head()
pca_Full = pca.fit_transform(SS().fit_transform(full_data.values))
plt.scatter(pca_Full[:,0], pca_Full[:,1])
plt.show()
Let's go ahead and use 3 clusters.
km = KMeans(n_clusters = 3, random_state = 146, n_init=10)
km.fit(full_data.values)
full_predictions = km.predict(full_data.values)
full_predictions
#Let's just look at the data labels
labels = pd.read_csv('data/DNA_labels.csv', index_col = 0)
labels
conf_matrix, accuracy = compare_classes(labels['Sequence_ID'].values, full_predictions)
conf_matrix
Why does this comparison not work as we might expect?
labels['Sequence_ID'].values
#This tells us which sequences should actually go together
mapping = {0:'Halo', 1:'Helico', 2:'Taq'}
new_full = pd.Series(full_predictions).apply(lambda x: mapping[x] ).values
new_full
conf_matrix, accuracy = compare_classes(labels['Sequence_ID'].values, new_full)
conf_matrix