import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


#fabricated data to make a point
x = np.random.normal(size = 100)
y = x*2+5


np.corrcoef(x,y)

array([[1., 1.],
       [1., 1.]])


plt.scatter(x,y)
plt.show()


z = x**2 + 5


np.corrcoef(x,z)

array([[ 1.        , -0.02110416],
       [-0.02110416,  1.        ]])


plt.scatter(x,z)
plt.show()


pd.options.display.max_rows=10 #sets the maximum number of rows and columns displayed when a frame is pretty-printed.
df = pd.read_csv('./data/gapminder.tsv', sep = '\t')
df.head(4)


data_2007 = df[df['year'] == 2007].dropna()


plt.figure(figsize=(4,4))
plt.scatter(data_2007['gdpPercap'],data_2007['lifeExp'])
plt.ylabel('Life Expectancy')
plt.xlabel('GDP per capita')
plt.show()


np.corrcoef(data_2007['lifeExp'], data_2007['gdpPercap'])

array([[1.       , 0.6786624],
       [0.6786624, 1.       ]])


data_2007['log_lifeExp']  = np.log10(data_2007['lifeExp'])
data_2007['log_gdpPercap'] = np.log10(data_2007['gdpPercap'])


# notice that this adds two columns
data_2007


plt.figure(figsize=(4,4))
plt.scatter(data_2007['log_lifeExp'], data_2007['log_gdpPercap'])
plt.xlabel('$\log_{10}$ Life Expectancy', fontsize=14)
plt.ylabel('$\log_{10}$ GDP per capita', fontsize=14)
plt.show()


np.corrcoef(data_2007['log_lifeExp'], data_2007['log_gdpPercap'])

array([[1.        , 0.78007058],
       [0.78007058, 1.        ]])


import seaborn as sns


sns.pairplot(data_2007[['lifeExp','pop','gdpPercap']])
plt.show()


cols = ['lifeExp', 'pop', 'gdpPercap', 'log_lifeExp', 'log_gdpPercap']
corr_matrix = np.corrcoef(data_2007[cols], rowvar=False) #indicate that each col is a variable


corr_matrix

array([[ 1.        ,  0.04755312,  0.6786624 ,  0.99650235,  0.80898025],
       [ 0.04755312,  1.        , -0.0556756 ,  0.05623144, -0.02353029],
       [ 0.6786624 , -0.0556756 ,  1.        ,  0.63856949,  0.87510881],
       [ 0.99650235,  0.05623144,  0.63856949,  1.        ,  0.78007058],
       [ 0.80898025, -0.02353029,  0.87510881,  0.78007058,  1.        ]])


#We could also do it this way - what's difference???
corr_matrix = data_2007[cols].corr()


corr_matrix


type(corr_matrix)

pandas.core.frame.DataFrame


sns.heatmap(corr_matrix, cmap='jet', vmin=-1, vmax=1,
            annot=True, fmt='.2f', xticklabels=cols, yticklabels=cols)
plt.show()


# upper triangle 
mask = np.triu(corr_matrix)


sns.heatmap(corr_matrix, cmap='jet', vmin=-1, vmax=1,
            annot=True, fmt='.2f', xticklabels=cols[:-1], yticklabels=['']+cols[1:],
           mask = mask)
plt.tick_params(size = 0, labelsize = 12)
plt.xticks(rotation = 90)
plt.show()


from sklearn.datasets import load_breast_cancer as lbc
data = lbc()


print(type(data))

<class 'sklearn.utils._bunch.Bunch'>


print(data.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


X = data.data
X_names = data.feature_names
X_df = pd.DataFrame(X, columns=X_names)


X_df.head()


corr_matrix = X_df.corr()


mask = np.triu(corr_matrix)


plt.figure(figsize = [12,8])
sns.heatmap(corr_matrix, cmap='jet', vmin=-1, vmax=1, mask = mask)
plt.tick_params(size = 0, labelsize = 10)
plt.xticks(rotation = 90)
plt.show()

	country	continent	year	lifeExp	pop	gdpPercap
0	Afghanistan	Asia	1952	28.801	8425333	779.445314
1	Afghanistan	Asia	1957	30.332	9240934	820.853030
2	Afghanistan	Asia	1962	31.997	10267083	853.100710
3	Afghanistan	Asia	1967	34.020	11537966	836.197138

	country	continent	year	lifeExp	pop	gdpPercap	log_lifeExp	log_gdpPercap
11	Afghanistan	Asia	2007	43.828	31889923	974.580338	1.641752	2.988818
23	Albania	Europe	2007	76.423	3600523	5937.029526	1.883224	3.773569
35	Algeria	Africa	2007	72.301	33333216	6223.367465	1.859144	3.794025
47	Angola	Africa	2007	42.731	12420476	4797.231267	1.630743	3.680991
59	Argentina	Americas	2007	75.320	40301927	12779.379640	1.876910	4.106510
...	...	...	...	...	...	...	...	...
1655	Vietnam	Asia	2007	74.249	85262356	2441.576404	1.870691	3.387670
1667	West Bank and Gaza	Asia	2007	73.422	4018332	3025.349798	1.865826	3.480776
1679	Yemen, Rep.	Asia	2007	62.698	22211743	2280.769906	1.797254	3.358081
1691	Zambia	Africa	2007	42.384	11746035	1271.211593	1.627202	3.104218
1703	Zimbabwe	Africa	2007	43.487	12311143	469.709298	1.638359	2.671829

	mean radius	mean texture	mean perimeter	mean area	mean smoothness	mean compactness	mean concavity	mean concave points	mean symmetry	mean fractal dimension	...	worst radius	worst texture	worst perimeter	worst area	worst smoothness	worst compactness	worst concavity	worst concave points	worst symmetry	worst fractal dimension
0	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.3001	0.14710	0.2419	0.07871	...	25.38	17.33	184.60	2019.0	0.1622	0.6656	0.7119	0.2654	0.4601	0.11890
1	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.0869	0.07017	0.1812	0.05667	...	24.99	23.41	158.80	1956.0	0.1238	0.1866	0.2416	0.1860	0.2750	0.08902
2	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.1974	0.12790	0.2069	0.05999	...	23.57	25.53	152.50	1709.0	0.1444	0.4245	0.4504	0.2430	0.3613	0.08758
3	11.42	20.38	77.58	386.1	0.14250	0.28390	0.2414	0.10520	0.2597	0.09744	...	14.91	26.50	98.87	567.7	0.2098	0.8663	0.6869	0.2575	0.6638	0.17300
4	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.1980	0.10430	0.1809	0.05883	...	22.54	16.67	152.20	1575.0	0.1374	0.2050	0.4000	0.1625	0.2364	0.07678

DATA 201¶

Exploring bivariate relationships¶

Linear correlation coefficients tell you about linear relationships¶

Correlations and scatterplots with real data¶

Matrices of scatterplots and correlations¶

A LOT more variables¶

	lifeExp	pop	gdpPercap	log_lifeExp	log_gdpPercap
lifeExp	1.000000	0.047553	0.678662	0.996502	0.808980
pop	0.047553	1.000000	-0.055676	0.056231	-0.023530
gdpPercap	0.678662	-0.055676	1.000000	0.638569	0.875109
log_lifeExp	0.996502	0.056231	0.638569	1.000000	0.780071
log_gdpPercap	0.808980	-0.023530	0.875109	0.780071	1.000000