import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from intro_Data_5 import *
Characterizing environmental microbial communities often involves large-scale DNA sequencing. One task in the analysis pipeline is binning sequences or contigs (longer sequences assembled from shorter ones) prior to further analysis. Binning groups sequences that likely come from the same organism so they can be analyzed separately, and it can also reveal contaminants (like human DNA!). We may not know what the sequences are, i.e. they have no labels, but we can group them based on the patterns of letters they contain. In this example, we look at 4-letter patterns, otherwise known as tetranucleotides.
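Each row of the table below is one contig described by its 256 tetranucleotide frequencies. As a purely illustrative sketch (not the pipeline that actually produced `data/DNA_data.csv`), such frequencies can be computed from a sequence with a sliding window:
from itertools import product
from collections import Counter

def tetranucleotide_freqs(seq):
    # Illustrative sketch: fraction of 4-letter windows matching each of the 256 possible 4-mers.
    kmers = [''.join(p) for p in product('ACGT', repeat=4)]
    counts = Counter(seq[i:i+4] for i in range(len(seq) - 3))
    total = max(sum(counts[k] for k in kmers), 1)   # count only clean A/C/G/T windows
    return {k: counts[k] / total for k in kmers}

tetranucleotide_freqs('ACGTACGTTTGACCAGTACGT')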
data = pd.read_csv('data/DNA_data.csv')
data.head() # what is the dimension of the data?
 | AAAA | AAAC | AAAT | AAAG | AACA | AACC | AACT | AACG | AATA | AATC | ... | GGCT | GGCG | GGTA | GGTC | GGTT | GGTG | GGGA | GGGC | GGGT | GGGG |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.003590 | 0.003675 | 0.004523 | 0.002541 | 0.005437 | 0.002962 | 0.003883 | 0.004101 | 0.005245 | 0.005914 | ... | 0.002811 | 0.003309 | 0.002535 | 0.003914 | 0.002962 | 0.004354 | 0.002910 | 0.001840 | 0.002707 | 0.001785 |
1 | 0.003195 | 0.003714 | 0.003834 | 0.002509 | 0.005379 | 0.003127 | 0.003652 | 0.004227 | 0.004291 | 0.006100 | ... | 0.002825 | 0.003506 | 0.002371 | 0.004111 | 0.003127 | 0.004668 | 0.003233 | 0.001972 | 0.002921 | 0.001995 |
2 | 0.003269 | 0.004386 | 0.005131 | 0.002317 | 0.006207 | 0.003393 | 0.003807 | 0.003434 | 0.006910 | 0.007365 | ... | 0.001945 | 0.002441 | 0.002317 | 0.003393 | 0.003393 | 0.004345 | 0.002897 | 0.001200 | 0.002938 | 0.002441 |
3 | 0.003142 | 0.004164 | 0.003300 | 0.001964 | 0.006442 | 0.002907 | 0.004085 | 0.004557 | 0.005264 | 0.006285 | ... | 0.002121 | 0.003614 | 0.001493 | 0.004399 | 0.002907 | 0.003849 | 0.002514 | 0.002043 | 0.002593 | 0.002121 |
4 | 0.004763 | 0.003297 | 0.006137 | 0.002473 | 0.005404 | 0.003114 | 0.003297 | 0.003252 | 0.007007 | 0.006183 | ... | 0.002473 | 0.002977 | 0.002107 | 0.003114 | 0.003114 | 0.004488 | 0.002244 | 0.002107 | 0.002610 | 0.002244 |
5 rows × 256 columns
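The comment on the head() call above asks about the dimension of the data; .shape answers it directly:
data.shape   # (number of contigs, 256) -- one row per contig, one column per tetranucleotide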
from sklearn.preprocessing import StandardScaler as SS
X = SS().fit_transform(data.values)
X[:5,]
array([[-0.25025671, -0.02105438, 0.00543117, ..., -0.45244946, -0.36103047, -0.43115167], [-0.31300997, -0.00417599, -0.20408516, ..., -0.43093757, -0.26991905, -0.40852686], [-0.30133192, 0.29095164, 0.19010173, ..., -0.55657083, -0.26265326, -0.36064623], [-0.32144882, 0.1933219 , -0.36636496, ..., -0.41942503, -0.40989057, -0.39502326], [-0.06371466, -0.18690822, 0.49576222, ..., -0.40898695, -0.40222468, -0.38182204]])
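As a quick sanity check (not part of the original analysis), each column of X should now have mean ≈ 0 and standard deviation ≈ 1, assuming no column is constant:
print(np.allclose(X.mean(axis=0), 0))   # column means should be ~0 after standardization
print(np.allclose(X.std(axis=0), 1))    # column standard deviations should be ~1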
from sklearn.decomposition import PCA
pca = PCA()
pca_props = pca.fit_transform(X)
plt.figure(figsize= [8,8])
plt.scatter(pca_props[:,0], pca_props[:,1], alpha = 0.5, ec = 'k')
plt.xlabel('$PC_1$', fontsize = 12)
plt.ylabel('$PC_2$', fontsize = 12)
plt.show()
pca.explained_variance_ratio_[:25]
array([0.32331585, 0.12804683, 0.05403234, 0.02978684, 0.0272901 , 0.02565754, 0.02115854, 0.01913828, 0.01638842, 0.01430024, 0.01374585, 0.01369477, 0.01253893, 0.01193362, 0.01156239, 0.01066115, 0.00983257, 0.00940089, 0.00897706, 0.00894302, 0.00806395, 0.00750297, 0.00708545, 0.00695983, 0.00665264])
np.cumsum(pca.explained_variance_ratio_[:24])
array([0.32331585, 0.45136269, 0.50539503, 0.53518187, 0.56247197, 0.58812951, 0.60928805, 0.62842633, 0.64481475, 0.65911499, 0.67286085, 0.68655562, 0.69909455, 0.71102817, 0.72259057, 0.73325171, 0.74308428, 0.75248518, 0.76146223, 0.77040525, 0.7784692 , 0.78597217, 0.79305762, 0.80001746])
evr = pca.explained_variance_ratio_[:20]
plt.figure(figsize=(8,6))
plt.bar(range(1, len(evr)+1), evr)
plt.xticks(range(1, len(evr)+1))
plt.grid(axis = 'y', linestyle = '--')
plt.xlabel('Principal components', fontsize=14)
plt.ylabel('Explained variance ratio', fontsize=14)
plt.show()
How many PCs would we need to keep to maintain at least 75% of the explained variance?
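One way to answer this programmatically, as a small check using the pca object fit above:
cum_var = np.cumsum(pca.explained_variance_ratio_)
n_pcs_75 = np.argmax(cum_var >= 0.75) + 1   # first position where the cumulative sum reaches 75%
print(n_pcs_75)                             # 18, per the cumulative sums printed above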
We will use these PCs as input to tSNE. Why?
from sklearn.manifold import TSNE
tsne = TSNE(random_state = 146)
tsne_props = tsne.fit_transform(pca_props[:,:24])
plt.figure(figsize = [8,8])
plt.scatter(tsne_props[:,0], tsne_props[:,1], alpha = 0.5, ec = 'k')
plt.xticks([])
plt.yticks([])
plt.show()
QUESTION
How does the tSNE plot differ from the PCA?
QUESTION
Let's use the first 18 PCs for clustering. Why might we just use 18 PCs for clustering instead of the original data?
from sklearn.cluster import KMeans
X = pca_props[:,:18] # keep the first 18 PCs, as discussed above
k_range = np.arange(1,11)
scores = []
for k in k_range:
    km = KMeans(n_clusters = k, random_state = 146, n_init = 10)
    km.fit(X)
    scores.append(km.score(X))  # score() returns the negative within-cluster sum of squares
plt.plot(k_range, scores, '-xk')
plt.xlabel('k')
plt.xticks(k_range)
plt.ylabel('-Within Cluster Variance')
plt.ylim(-600000,0)
plt.show()
# To estimate the uncertainty in the chosen k (sketched below):
# 1. repeat the experiment N times (e.g., by changing random_state), giving a choice K_i each time
# 2. average the K_i
# 3. the uncertainty is then std(K_i) / sqrt(N_experiments)
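A minimal sketch of that idea, applied here to the within-cluster scores across repeated runs (picking the elbow k in each run would additionally require an elbow criterion; the seeds and N below are arbitrary):
n_experiments = 10
all_scores = []
for seed in range(n_experiments):   # repeat the experiment with different random states
    run = [KMeans(n_clusters=k, random_state=seed, n_init=10).fit(X).score(X) for k in k_range]
    all_scores.append(run)
all_scores = np.array(all_scores)                              # shape: (n_experiments, len(k_range))
mean_scores = all_scores.mean(axis=0)                          # average curve over the repeats
sem_scores = all_scores.std(axis=0) / np.sqrt(n_experiments)   # uncertainty, std / sqrt(N)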
QUESTION
Do the results from this plot agree with what you found for PCA/tSNE? Where is the 'elbow' of this curve?
km = KMeans(n_clusters = 3, random_state = 146, n_init = 10)
groups = km.fit_predict(X)
print(groups)
[0 0 0 ... 2 2 2]
plot_groups(pca_props[:,:2],groups,colors = get_colors(3), labels = ['PC1','PC2'])
plt.show()
plot_groups(tsne_props,groups,colors = get_colors(3), labels = ['TSNE1','TSNE2'])
plt.show()
QUESTION
You should see something that looks strange to you when we plot the cluster membership on the tSNE plot. Why might this be occurring? What could it indicate about the original sequences in the data?
Now we will use the clustering information as a target to build a classifier for new sequences.
Let's try both a kNN classifier and a Random Forest.
# Grid search for KNN; Question: what did we do before?
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier as KNN
param_grid = dict(n_neighbors = list(range(2,10)), weights = ['distance','uniform'])
cv = KFold(n_splits = 5,random_state = 146, shuffle = True)
grid = GridSearchCV(KNN(), param_grid = param_grid, cv = cv, scoring='accuracy')
grid.fit(X,groups)
# extract specific columns
results = pd.DataFrame(grid.cv_results_)[['params','mean_test_score','rank_test_score']]
results
QUESTION
Take a look at the results of the GridSearch. What do you notice in terms of hyperparameters - which tend to work best? Overall, how different are they from each other?
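The winning combination can also be read directly from the fitted grid object (standard GridSearchCV attributes):
print(grid.best_params_)   # hyperparameter setting with the highest mean CV accuracy
print(grid.best_score_)    # its mean cross-validated accuracy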
knn = KNN(n_neighbors = 2, weights = 'distance')
knn.fit(X,groups)
knn.score(X,groups) #returns the mean accuracy
# Use the fitted KNN to predict on new data
test_data = pd.read_csv('data/DNA_test.csv')
test_data.head()
# Q: why these steps?
knn_test_groups = knn.predict(pca.transform(SS().fit_transform(test_data.values))[:,:18])
knn_test_groups
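Note that SS().fit_transform above refits the scaler on the test data itself. A more careful variant, shown as a sketch here (the scaler name is ours), reuses the scaler fit on the training data before applying the training-set PCA:
scaler = SS().fit(data.values)                     # fit the scaler on the training data only
test_scaled = scaler.transform(test_data.values)   # apply the same centering/scaling to the test data
knn.predict(pca.transform(test_scaled)[:,:18])     # same prediction step as above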
fig,ax = plot_groups(X[:,:2],groups,colors = get_colors(3), alpha = 0.25)
plot_groups(pca.transform(SS().fit_transform(test_data.values))[:,:2], knn_test_groups, colors = get_colors(3),
s = 100, ec = 'k', ax = ax, alpha = 1, labels = ['PC1','PC2'])
plt.show()
# Grid search for RFC
from sklearn.ensemble import RandomForestClassifier as RFC
param_grid = dict(max_depth = [4, 6, 8], min_samples_split = [5, 10, 15])
cv = KFold(n_splits = 5,random_state = 146, shuffle = True)
grid = GridSearchCV(RFC(random_state = 146), param_grid = param_grid, cv = cv, scoring='accuracy')
grid.fit(X, groups)
results = pd.DataFrame(grid.cv_results_)[['param_max_depth','param_min_samples_split','mean_test_score','rank_test_score']]
# Get the best results
results[results['rank_test_score'] == 1]
#results
QUESTION
What do you make of the GridSearch results? Why might there be so many with the same score? Why might we choose the parameters used to make the classifier below?
rfc = RFC(random_state = 146, n_estimators = 100, max_depth = 8, min_samples_split = 15)
rfc.fit(X, groups)
rfc_test_groups = rfc.predict(pca.transform(SS().fit_transform(test_data.values))[:,:18])
rfc_test_groups
fig,ax = plot_groups(X[:,:2],groups,colors = get_colors(3), alpha = 0.25)
plot_groups(pca.transform(SS().fit_transform(test_data.values))[:,:2], rfc_test_groups, colors = get_colors(3),
s = 100, ec = 'k', ax = ax, alpha = 1, labels = ['PC1','PC2'])
plt.show()
# What does this mean?
conf_matrix, accuracy = compare_classes(knn_test_groups, rfc_test_groups)
conf_matrix
help(compare_classes)
QUESTION
What is the confusion matrix ACTUALLY showing? How would you re-label the rows and columns so they are more accurate?
#This is the actual full data set
#Let's cluster it and then compare with known classes
full_data = pd.read_csv('data/DNA_full_data.csv')
full_data.head()
pca_Full = pca.fit_transform(SS().fit_transform(full_data.values))
plt.scatter(pca_Full[:,0], pca_Full[:,1])
plt.show()
Let's go ahead and use 3 clusters.
km = KMeans(n_clusters = 3, random_state = 146, n_init=10)
km.fit(full_data.values)
full_predictions = km.predict(full_data.values)
full_predictions
#Let's just look at the data labels
labels = pd.read_csv('data/DNA_labels.csv', index_col = 0)
labels
conf_matrix, accuracy = compare_classes(labels['Sequence_ID'].values, full_predictions)
conf_matrix
Why does this comparison not work as we might expect?
labels['Sequence_ID'].values
#This tells us which sequences should actually go together
mapping = {0:'Halo', 1:'Helico', 2:'Taq'}
new_full = pd.Series(full_predictions).apply(lambda x: mapping[x] ).values
new_full
conf_matrix, accuracy = compare_classes(labels['Sequence_ID'].values, new_full)
conf_matrix