This is the idea of Principal Component Analysis: to preserve as much of the spread of the data as possible by creating new variables that are linear combinations of the old variables. These new variables are called principal components (PCs). What is a linear combination? (We've actually already done this!)
For example, with our 3 variables, the equation for the first principal component would look like:
$$PC_1 = c_1x + c_2y + c_3z$$

The coefficients $c_1$, $c_2$, and $c_3$ are calculated via the magic of linear algebra.
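To make "linear combination" concrete, here is a tiny sketch with made-up values and made-up coefficients (these numbers are arbitrary, not ones PCA would actually find):

import numpy as np

# three variables, five made-up observations each
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([2.0, 1.0, 4.0, 3.0, 6.0])
z = np.array([5.0, 4.0, 3.0, 2.0, 1.0])

# a linear combination is just a weighted sum of the variables
c1, c2, c3 = 0.5, -0.2, 0.8
new_variable = c1*x + c2*y + c3*z
print(new_variable)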
PCA actually produces one new variable (one PC) for each old variable, but they decrease in importance - by importance we mean the amount of variation they capture. The first principal component will always capture the most variation. We reduce dimensionality by choosing only the first two (or three) of these to represent the original data.
The principal components generally aren't something you can directly interpret. You've basically just rotated your data around (with some stretching and shrinking) and are looking at it in a new coordinate system, but the new coordinates themselves aren't readily interpretable.
Code-wise, doing the PCA is very straightforward:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
tiny = pd.read_csv('./data/small_data.csv')
tiny.head()
| | a | b | c |
|---|---|---|---|
| 0 | 119 | 135 | -62 |
| 1 | 81 | 88 | -82 |
| 2 | 90 | 102 | -48 |
| 3 | 54 | 63 | -29 |
| 4 | 62 | 17 | -32 |
from sklearn.decomposition import PCA
pca = PCA()
tiny_pca = pca.fit_transform(tiny)
tiny_pca[0:10]
array([[ 6.14032156e+01,  3.16423093e+01, -4.43025543e-01],
       [ 2.01293002e+01, -1.68266900e+01,  2.60236216e-02],
       [ 1.65982819e+01,  2.08508471e+01,  9.93031235e-01],
       [-3.86598136e+01,  9.71022839e+00,  1.64838510e+00],
       [-6.02561592e+01, -1.11236474e+01,  3.75447287e+01],
       [-1.15706866e+02, -6.12707652e+00,  2.58627446e-01],
       [-2.25385121e+01, -1.46023649e+01,  1.61037712e+00],
       [ 2.00461034e+01, -1.76164975e+01, -3.63037932e+00],
       [-1.04829674e+02, -7.75746944e+00,  8.98138273e-02],
       [ 1.19699002e+02, -2.46314353e+01, -1.37415842e+00]])
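The coefficients of those linear combinations are stored in pca.components_ (one row per PC). As a quick sanity check (a sketch), the first column of tiny_pca should be exactly the mean-centered data multiplied by the first row of coefficients:

print(pca.components_)  # the coefficients c1, c2, c3 for each PC

# PC1 scores are a linear combination of the (mean-centered) original columns
manual_pc1 = (tiny - pca.mean_) @ pca.components_[0]
print(np.allclose(manual_pc1, tiny_pca[:, 0]))  # True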
plt.scatter(tiny_pca[:,0], tiny_pca[:,1])
plt.xlabel('$PC_1$', fontsize = 14)
plt.ylabel('$PC_2$', fontsize = 14)
plt.show()
Remember that the goal was to capture the original variation in fewer variables. Here's the variance explained by our 3 original variables:
tiny.var().sum()
5184.609191919192
tiny.cov()
| | a | b | c |
|---|---|---|---|
| a | 1806.250909 | 1732.884646 | -1371.780606 |
| b | 1732.884646 | 1859.835455 | -1340.478384 |
| c | -1371.780606 | -1340.478384 | 1518.522828 |
tiny.corr()
| | a | b | c |
|---|---|---|---|
| a | 1.000000 | 0.945460 | -0.828295 |
| b | 0.945460 | 1.000000 | -0.797649 |
| c | -0.828295 | -0.797649 | 1.000000 |
sum(tiny.var())
5184.609191919192
Variance explained by our PCs:
pca.explained_variance_
array([4716.223442 , 372.2014575 , 96.18429243])
sum(pca.explained_variance_)
5184.609191919192
The total variance is the same, but it is divided up between the variables differently.
tiny_evr = pca.explained_variance_ratio_
print(tiny_evr)
[0.90965843 0.07178968 0.01855189]
plt.bar(range(1, len(tiny_evr)+1), tiny_evr*100, fill = True)
plt.ylabel('Percentage of total \nvariance explained', fontsize=14)
plt.xticks([1,2,3], ['PC1','PC2','PC3'])
plt.tick_params(labelsize = 12)
plt.grid(axis = 'y', linestyle = '--')
ax = plt.gca()
[ax.spines[i].set_visible(False) for i in ['top','left','right']]
plt.show()
How many PCs do we need in order to explain 90% of the variation?
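For the tiny data above, one way to answer that (a sketch): take a cumulative sum of the explained variance ratios and see where it first crosses 0.90. Here the first PC alone already gets us past 90%:

print(np.cumsum(tiny_evr))
# number of PCs needed to reach 90% of the variance
print(np.argmax(np.cumsum(tiny_evr) >= 0.90) + 1)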
work = pd.read_csv('./data/work_retention.csv')
work.head()
| | age | children | commute | salary | years worked | retain |
|---|---|---|---|---|---|---|
| 0 | 63 | 0 | 0.095167 | 54233 | 2.18 | 1 |
| 1 | 64 | 0 | 0.044500 | 64739 | 4.53 | 1 |
| 2 | 40 | 1 | 0.159167 | 95953 | 4.57 | 1 |
| 3 | 58 | 5 | 0.091167 | 59038 | 2.99 | 1 |
| 4 | 63 | 0 | 0.016000 | 7958 | 4.36 | 1 |
target = work['retain'] #y the target
labels = ['quit', 'stay'] #y_labels
work_numeric = work.drop(columns = ['retain']) #remove the target
work_numeric.head()
| | age | children | commute | salary | years worked |
|---|---|---|---|---|---|
| 0 | 63 | 0 | 0.095167 | 54233 | 2.18 |
| 1 | 64 | 0 | 0.044500 | 64739 | 4.53 |
| 2 | 40 | 1 | 0.159167 | 95953 | 4.57 |
| 3 | 58 | 5 | 0.091167 | 59038 | 2.99 |
| 4 | 63 | 0 | 0.016000 | 7958 | 4.36 |
Last time, we got stuck on how to visualize all 5 variables together. Now we know how to do that using PCA. Aside from helping us visualize our data, principal components can also be used as the features of a model (in place of the original variables), and in some cases you may even get more accurate results by doing PCA (or some other form of dimensionality reduction) before fitting your model.
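As a sketch of that idea (the classifier here is an arbitrary choice, and we won't pursue model fitting further in this notebook), you could chain PCA and a model together so that the model only ever sees the first two PCs:

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# reduce to 2 PCs, then fit a classifier on those 2 new features
model = Pipeline([('pca', PCA(n_components=2)),
                  ('clf', DecisionTreeClassifier(random_state=146))])
model.fit(work_numeric, target)
print(model.score(work_numeric, target))  # accuracy on the training data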
THE CURSE OF DIMENSIONALITY
The more variables you have, the more dimensions and inputs there are, and the more computing power you may need. In addition, with many input variables you can end up with wonky (overfit - we'll talk about this later) models, and you run the risk that all of the different variables make every observation look distinct, i.e., there are no patterns we can use for predictive models.
pca = PCA()
pca_work = pca.fit_transform(work_numeric)
pca_work[: 3, :]
array([[ 6.41016906e+03,  1.88117756e+01,  5.54637520e-01, -1.02340699e+00,
        -3.91375479e-01],
       [ 1.69161690e+04,  2.03414958e+01, -1.75288388e+00, -9.23623236e-01,
         4.51683742e-02],
       [ 4.81301702e+04, -2.07291669e+00, -1.68803189e+00,  1.47127633e-01,
         1.58619801e-01]])
plt.scatter(pca_work[:,0], pca_work[:,1],alpha = 0.4)
plt.xlabel('$PC_1$', fontsize = 14)
plt.ylabel('$PC_2$', fontsize = 14)
plt.show()
Like we did before, we really want to know if there are any differences in the data that relate to whether someone stays or goes. We already know how to write plotting code that will shade in the points by our groups. Let's start with the code we already had and modify it:
y = work['retain']
labels = ['quit','stay']
colors = ['cornflowerblue', 'slategrey']
for yi in [0,1]:
    idx = y == yi
    plt.scatter(work.loc[idx,'years worked'], work.loc[idx,'commute'],
                color = colors[yi], label = labels[yi])
plt.legend()
plt.xlabel('years worked')
plt.ylabel('commute')
plt.show()
#What has changed?
y = work['retain']
labels = ['quit','stay']
colors = ['cornflowerblue', 'slategrey']
for yi in [0,1]:
    idx = y == yi
    plt.scatter(pca_work[idx,0], pca_work[idx,1],
                color = colors[yi], label = labels[yi])
plt.legend()
plt.show()
When we looked at the two original variables, we DID see a pattern. Why is that pattern gone when we use two PCs?
We forgot something in our analysis. We should have looked at the univariate distributions, i.e., each individual variable. Let's make a stripchart:
sns.stripplot(data = work_numeric)
plt.show()
Let's also take a look at the variance and the evr from the PCA:
for col in work_numeric.columns:
    print(col, work_numeric[col].var())
print('Total variance:', sum(work_numeric.var()))
age 144.64298994974874
children 1.0902261306532668
commute 0.12752271135608598
salary 656172323.7297488
years worked 1.8814741608040193
Total variance: 656172471.4719617
print(pca.explained_variance_ratio_)
[9.99999777e-01 2.17854776e-07 2.98936692e-09 1.65526575e-09 7.25054818e-11]
evr = pca.explained_variance_ratio_
plt.figure(figsize=(8,6))
plt.bar(range(1, len(evr)+1), evr, fill = True)
plt.grid(axis = 'y', linestyle = '--')
plt.xlabel('Principal Components', fontsize=14)
plt.ylabel('Fraction of total variance explained', fontsize=14)
plt.show()
What's going on?
from sklearn.preprocessing import StandardScaler as SS
ss = SS()
work_scaled = pd.DataFrame(ss.fit_transform(work_numeric), columns = work_numeric.columns)
work_scaled.head()
| | age | children | commute | salary | years worked |
|---|---|---|---|---|---|
| 0 | 1.540846 | -0.974532 | -0.792575 | 0.250870 | -0.501449 |
| 1 | 1.624202 | -0.974532 | -0.934813 | 0.662036 | 1.216092 |
| 2 | -0.376355 | -0.014402 | -0.612905 | 1.883636 | 1.245327 |
| 3 | 1.124063 | 3.826119 | -0.803804 | 0.438920 | 0.090555 |
| 4 | 1.540846 | -0.974532 | -1.014822 | -1.560161 | 1.091845 |
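A quick check (a sketch) of what the scaling accomplished: every column now has a variance of about 1, so no single variable can dominate the way salary did.

# after scaling, each column has a variance of approximately 1
print(work_scaled.var())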
pca_work_scaled = pca.fit_transform(work_scaled)
y = work['retain']
labels = ['quit','stay']
colors = ['cornflowerblue', 'slategrey']
for yi in [0,1]:
    idx = y == yi
    plt.scatter(pca_work_scaled[idx,0], pca_work_scaled[idx,1],
                color = colors[yi], label = labels[yi])
plt.legend()
plt.show()
We would have come to an incorrect conclusion based on the first plot.
print(pca.explained_variance_ratio_)
[0.35675069 0.22437386 0.19870812 0.17651962 0.04364771]
evr = pca.explained_variance_ratio_
plt.figure(figsize=(8,6))
plt.bar(range(1, len(evr)+1), evr, fill = True)
plt.grid(axis = 'y', linestyle = '--')
plt.xlabel('Principal Components', fontsize=14)
plt.ylabel('Fraction of total variance explained', fontsize=14)
plt.show()
It looks like, to get to 90% of total variance explained, we're going to need 4 dimensions, which isn't so much of a reduction. What's important to note here is that our original result simply wasn't correct. That is an example of what can happen if you fail to thoroughly inspect your data before diving into some type of modeling.
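You can confirm that with a cumulative sum of the explained variance ratios (same trick as before): the running total only crosses 0.90 at the fourth PC.

print(np.cumsum(evr))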
tSNE is primarily used for visualization purposes. In terms of mathematical sophistication, tSNE is much more complicated than PCA. tSNE stands for "t-distributed stochastic neighbor embedding".
The plain-English explanation of tSNE is this: tSNE looks at your data and gets a sense of the distribution of distances between points. It then manipulates the data with the goal of coming up with a lower-dimensional representation that preserves, as closely as possible, the local distances in the original data. Although that is a bit of a mouthful, it's not that bad: tSNE tries to project your data into fewer dimensions while preserving the "closeness" of "neighbors" (i.e., it tries to keep close points close).
Before continuing, you should explore this web page, which contains a very good high-level explanation of tSNE, and has some interactive examples you can play around with. Once you've read that and explored some of the examples, we'll see how to do tSNE below.
from sklearn.manifold import TSNE
tsne = TSNE(random_state = 146)
#https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
tsne_work = tsne.fit_transform(work_numeric)
y = work['retain']
labels = ['quit','stay']
colors = ['cornflowerblue', 'slategrey']
for yi in [0,1]:
    idx = y == yi
    plt.scatter(tsne_work[idx,0], tsne_work[idx,1],
                color = colors[yi], label = labels[yi])
plt.legend()
plt.show()
I guess tSNE doesn't work???
What do you think?
tsne_work = tsne.fit_transform(work_scaled)
y = work['retain']
labels = ['quit','stay']
colors = ['cornflowerblue', 'slategrey']
for yi in [0,1]:
    idx = y == yi
    plt.scatter(tsne_work[idx,0], tsne_work[idx,1],
                color = colors[yi], label = labels[yi])
plt.legend()
plt.show()
Why bother?
This data is linearly separable, so we can still see the groups with PCA. However, tSNE can sometimes reveal further underlying substructure that we might miss with PCA, e.g. particular subgroups of points.
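As a sketch of that point (using sklearn's built-in digits dataset rather than our work data, purely for illustration), compare the same data projected to 2D by PCA and by tSNE; tSNE typically pulls the ten digit classes into much tighter, better-separated clusters:

from sklearn.datasets import load_digits

digits = load_digits()
X_digits, y_digits = digits.data, digits.target

# project the same 64-dimensional data down to 2D, two different ways
digits_pca = PCA(n_components=2).fit_transform(X_digits)
digits_tsne = TSNE(random_state=146).fit_transform(X_digits)

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
axes[0].scatter(digits_pca[:,0], digits_pca[:,1], c=y_digits, s=5, cmap='tab10')
axes[0].set_title('PCA')
axes[1].scatter(digits_tsne[:,0], digits_tsne[:,1], c=y_digits, s=5, cmap='tab10')
axes[1].set_title('tSNE')
plt.show()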