As you may recall, when doing OLS we are minimizing the sum of squared errors between the model and the data. The idea of minimizing some quantity is common to many model fitting and optimization problems, and the quantity we are minimizing is referred to as a cost function.
For OLS, the cost function is:
$$\sum_{i=1}^N (y_i - \hat{y}_i)^2$$

where $y_i$ is the observed value of the target for observation $i$, and $\hat{y}_i$ denotes the value predicted by the model. The techniques we will talk about in this notebook are based on modifications to this cost function.
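For example, with a handful of observed values and model predictions (made-up numbers, not data from this notebook), the cost would be computed like this:
# Illustration only: OLS cost = sum of squared residuals (hypothetical numbers)
import numpy as np
y_obs = np.array([1.0, 2.0, 3.0, 4.0])   # observed values
y_hat = np.array([1.1, 1.9, 3.2, 3.7])   # model predictions
ols_cost = np.sum((y_obs - y_hat)**2)
print(ols_cost)                          # approximately 0.15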
# Generate simulated data
import numpy as np
import matplotlib.pyplot as plt
n = 10
noise_strength = 1
np.random.seed(147)
noise = np.random.normal(scale=noise_strength, size=(n,1))
x = np.random.normal(size=(n,1))
y = 1 + 2*x + noise
# Put simulated x and y into a dataframe
import pandas as pd
df = pd.DataFrame(columns = ['x','y'])
df['x'] = x.ravel()
df['y'] = y.ravel()
df
# assign a value of 0.2 to df.loc[4,'y']
df.loc[4,'y'] = 0.2
df
# Generating "the" four items: Xtrain, ytrain, Xtest, ytest
#-- use the visualization trick in class to understand what they are
idx_test = [1,7]
Xtest = df.loc[idx_test,'x']
ytest = df.loc[idx_test,'y']
Xtrain = df.drop(idx_test)['x']
ytrain = df.drop(idx_test)['y']
Xtrain, ytrain
Xtest, ytest
# plot the train and test data with scatterplot
plt.scatter(Xtrain,ytrain, label='Training data', color='k')
plt.scatter(Xtest, ytest, label = 'Test data', color = 'blue')
#plt.title('Our randomly generated data')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
# Fit with a normal linear regression model
from sklearn.linear_model import LinearRegression as LR, Ridge, Lasso
Xtrain = np.array(Xtrain).reshape(-1,1)
Xtest = np.array(Xtest).reshape(-1,1)
lin_reg = LR()
lin_reg.fit(Xtrain, ytrain)
# print train and test R^2 values
print(lin_reg.score(Xtrain, ytrain))
print(lin_reg.score(Xtest, ytest))
print(lin_reg.intercept_, lin_reg.coef_)
Ridge regression step!
# Ridge regression
rid_reg = Ridge(alpha = 2)
rid_reg.fit(Xtrain, ytrain)
print(rid_reg.score(Xtrain, ytrain))
print(rid_reg.score(Xtest,ytest))
print(rid_reg.intercept_, rid_reg.coef_)
# Ridge regression with a larger alpha
rid_reg2 = Ridge(alpha = 100)
rid_reg2.fit(Xtrain, ytrain)
print(rid_reg2.score(Xtrain, ytrain))
print(rid_reg2.score(Xtest,ytest))
print(rid_reg2.intercept_, rid_reg2.coef_)
# Visualize the regular linear regression model and the two Ridge regression models
x_range = np.array([x.min(), x.max()]).reshape(-1,1)
# three y_ranges, one for each model
y_pred = lin_reg.predict(x_range)
y_ridge = rid_reg.predict(x_range)
y_ridge2 = rid_reg2.predict(x_range)
plt.figure(figsize = (10,6))
# plot Train and test data
plt.scatter(Xtrain.ravel(),ytrain, label='Train data', color='k')
plt.scatter(Xtest.ravel(), ytest, label = 'Test data', color = 'green', s = 100)
# plot the models
plt.plot(x_range, y_pred, label='Linear_regression', color='r')
plt.plot(x_range, y_ridge, label = 'Ridge_alpha_2', color = 'blue', linestyle = '--')
plt.plot(x_range, y_ridge2, label = 'Ridge_alpha_100', color = 'grey', linestyle = '-.')
#plt.title('')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
Lasso regression step!
# Lasso regression
las_reg = Lasso(alpha = 100)
las_reg.fit(Xtrain, ytrain)
print(las_reg.score(Xtrain, ytrain))
print(las_reg.score(Xtest,ytest))
#Use pockets data
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split as tts
# be sure to upload the intro_Data.py file so that it can be imported
from intro_Data import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
help(do_Kfold)
Help on function do_Kfold in module intro_Data:

do_Kfold(model, X, y, k, scaler=None, random_state=146)
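The real implementation lives in intro_Data.py, so the sketch below is only a rough approximation of what a helper with this signature might do (KFold splitting, optional scaling fit on the training fold only, and one $R^2$ score per fold for train and test). The name do_Kfold_sketch is hypothetical, used here to avoid shadowing the imported function.
# Rough, hypothetical sketch of a KFold helper with the signature shown above.
# The actual do_Kfold used below is the one defined in intro_Data.py.
from sklearn.model_selection import KFold

def do_Kfold_sketch(model, X, y, k, scaler=None, random_state=146):
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    train_scores, test_scores = [], []
    for idx_train, idx_test in kf.split(X):
        Xtrain, Xtest = X[idx_train], X[idx_test]
        ytrain, ytest = y[idx_train], y[idx_test]
        if scaler is not None:
            Xtrain = scaler.fit_transform(Xtrain)  # scaler fit on the training fold only
            Xtest = scaler.transform(Xtest)
        model.fit(Xtrain, ytrain)
        train_scores.append(model.score(Xtrain, ytrain))  # R^2 on the training fold
        test_scores.append(model.score(Xtest, ytest))     # R^2 on the test fold
    return np.array(train_scores), np.array(test_scores)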
pockets = pd.read_csv('./data/pockets.csv')
pockets.head()
| | brand | style | menWomen | name | fabric | price | maxHeightFront | minHeightFront | rivetHeightFront | maxWidthFront | minWidthFront | maxHeightBack | minHeightBack | maxWidthBack | minWidthBack | cutout |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Arizona | skinny | women | Fave Super Skinny | 78% cotton, 20% polyester, 2% spandex | 42.0 | 14.5 | 15.0 | 6.5 | 16.5 | 13.0 | 15.0 | 12.2 | 13.7 | 12.0 | False |
| 1 | Arizona | straight | women | Perfect Boot | 78% cotton, 20% polyester, 2% spandex | 42.0 | 14.5 | 14.0 | 6.5 | 16.0 | 12.5 | 15.5 | 12.2 | 13.0 | 11.2 | False |
| 2 | Ralph Lauren | skinny | women | Modern Skinny Slimming Fit | 92% cotton, 7% poly, 1% elastane | 89.5 | 13.0 | 13.5 | 6.5 | 14.5 | 12.0 | 15.5 | 13.0 | 13.5 | 12.5 | False |
| 3 | Ralph Lauren | straight | women | Premier Straight Slimming Fit | 92% cotton, 7% poly, 1% elastane | 89.5 | 13.0 | 13.5 | 6.5 | 14.5 | 12.0 | 14.5 | 13.0 | 13.0 | 12.2 | False |
| 4 | Uniqlo | skinny | women | Skinny Fit | 87% cotton, 9% polyester, 4% spandex | 39.9 | 13.0 | 13.0 | 5.5 | 14.0 | 11.5 | 14.0 | 12.0 | 14.2 | 11.7 | False |
# Use maxHeightFront as the target and the other numeric variables as features
metadata = pockets[['brand','style','menWomen', 'cutout']]
X_df = pockets.drop(columns = list(metadata.columns)
+ ['name', 'fabric', 'maxHeightFront'])
X = np.array(X_df.values)
y = np.array(pockets['maxHeightFront'].values).reshape(-1,1)
#Does this data need to be scaled? Not for OLS, but will need later
sns.stripplot(data = X_df, orient = 'h')
# use do_Kfold function to get train/test scores
# try without rescaling first
lin_reg = LR()
train_res, test_res = do_Kfold(lin_reg, X,y,10)
np.shape(test_res)
(10,)
# visualize train/test scores by stripplot
import seaborn as sns
df = pd.DataFrame(zip(train_res,test_res), columns = ['Train','Test'])
sns.stripplot(data = df, size = 10, alpha = 0.8, palette = 'BuPu')
plt.axhline(np.mean(test_res), c = 'slategrey', linestyle = '--')
plt.show()
print(np.mean(train_res), np.median(train_res))
print(np.mean(test_res), np.median(test_res))
0.6481278808366254 0.6380743980948804
0.42150969601454563 0.618056921843455
So, we have a slightly overfit model. As we've discussed, this can be caused by a model that is too complex (too many parameters), so one solution might be to reduce the complexity of the model.
This brings us to regularization techniques. The goal of regularization is to reduce the complexity of the model and possibly prevent (or reduce) overfitting.
The first technique we will look at is called Ridge regression. In addition to minimizing the sum of squared errors, Ridge regression also penalizes a model for having more parameters and/or larger parameters. This is accomplished by modifying the cost function:
$$\sum_{i=1}^N (y_i - \hat{y}_i)^2 + \alpha \sum_{i=1}^p \beta_i^2 $$

The new term in this equation, $\alpha \sum_{i=1}^p \beta_i^2$, is called the regularization term. In words, it just says to square all of the model coefficients, add them together, multiply that sum by some number ($\alpha$), and then add the result to the cost function. The consequence is that the model will simultaneously try to minimize both the sum of the squared errors and the number/magnitude of the model's parameters.
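In code, the Ridge cost for a given set of residuals and coefficients could be computed like this (illustrative, made-up numbers; note that the intercept $\beta_0$ is not included in the penalty):
# Illustration only: Ridge cost with hypothetical residuals and coefficients
alpha = 1.0
residuals = np.array([0.5, -0.3, 0.1])   # y_i - y_hat_i
betas = np.array([2.0, -1.0, 0.5])       # coefficients (intercept excluded)
ridge_cost = np.sum(residuals**2) + alpha*np.sum(betas**2)
print(ridge_cost)                        # 0.35 + 1.0*5.25 = 5.6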
As a reminder, the equation for our model looks like:
$$y = \beta_0 + \beta_1x_1 + \beta_2x_2 + \cdots + \beta_p x_p$$

and nothing about this has changed - we have only modified the cost function that is going to be minimized.
This modification also adds an additional layer of complexity to the fitting and validation procedures. The number $\alpha$ is referred to as a hyperparameter: a number that is not determined during the fitting/training procedure. Different values of $\alpha$ will lead to different results, and we will have to find the optimal value ourselves.
Since there will be a few more steps involved here, it will be useful to think about what we need to do before we start writing code. Carefully planning out your approach and listing out what will need to be done can save you a lot of time in the long run.
Here's what we will need to do:

- Choose a range of $\alpha$ values to try.
- For each value of $\alpha$, run KFold validation and record the average training and testing scores.
- Compare the results and pick the $\alpha$ that gives the best testing performance.
It's also going to be important to standardize our features for these models, since the relative size of each of the variables will impact the coefficient estimates, which are now built into the cost function. If we do not standardize our data, then some coefficients might end up being minimized simply due to the units they were measured in, rather than their actual predictive power in the model.
Since the testing data is meant to simulate new data that we did not have when training the model, we should compute the mean and standard deviation from the training data only when standardizing. We will then apply the same transformation to the testing data - but we will not include the testing data when we compute the mean and standard deviation.
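With scikit-learn's StandardScaler, that looks like the sketch below, where Xtrain and Xtest stand for whichever training/testing split is being used; in this notebook, the do_Kfold helper handles this for us through its scaler argument.
# Sketch: standardize using statistics computed from the training data only
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
Xtrain_s = scaler.fit_transform(Xtrain)   # mean/std learned from the training data
Xtest_s = scaler.transform(Xtest)         # same transformation applied to the testing data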
In order to determine what value of $\alpha$ gives the best performance, we're just going to scan through a range of values. There is usually going to be some trial and error involved in things like this, and there's no guarantee that we'll start anywhere close to the best answer!
Let's pause to think about our next steps a little more:
We'll make a plot to examine our results and use it to guide our next steps throughout this process. We can plot the $\alpha$ values on the x-axis, and the average training/testing scores on the y-axis.
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler as SS
# search the best alpha value
a_range = np.linspace(10,20,100)
k = 10
ss = SS()
avg_tr_score=[]
avg_te_score=[]
for a in a_range:
rid_reg = Ridge(alpha=a)
train_scores, test_scores = do_Kfold(rid_reg, X, y, k, ss)
avg_tr_score.append(np.mean(train_scores))
avg_te_score.append(np.mean(test_scores))
#print(a, np.mean(test_scores))
# plot the score-alpha curve
plt.figure(figsize=(8,8))
#plt.plot(a_range, avg_tr_score, color='k', label='Training')
plt.plot(a_range, avg_te_score, color='r', label='Testing')
plt.xlabel('$\\alpha$', fontsize=14)
plt.ylabel('Avg. $R^2$', fontsize=14)
#plt.legend()
plt.show()
# Get the optimal hyperparameter alpha and its corresponding scores
idx_max = np.argmax(avg_te_score)
print('Optimal alpha in the range tested: ', a_range[idx_max])
print('Avg. training score at this value: ', avg_tr_score[idx_max])
print('Avg. testing score at this value: ', avg_te_score[idx_max])
Optimal alpha in the range tested: 12.02020202020202
Avg. training score at this value: 0.6344576593800286
Avg. testing score at this value: 0.44258643208744763
After a bit of searching, we've managed to improve the model's performance. The training score is still similar to the OLS solution, but the testing score has increased. It's a small improvement, but an improvement nonetheless!
To reinforce what exactly is going on here, let's take a look at the model coefficients compared to the OLS solution. I'll make a plot showing how the coefficients change as we adjust $\alpha$. We will fit both types of models to the full data set - this is what you would do in practice after KFold validation...though we did make one mistake - we should have set aside some data to test that model on!
# fit an OLS model and get coefficients
Xs = ss.fit_transform(X)
lin_reg.fit(Xs,y)
lin_coefs = lin_reg.coef_[0]
lin_coefs
array([-0.35161376, 1.92356316, 1.4028077 , -0.36014987, 1.37946072, 1.66568514, 0.17595748, -0.36056614, -1.21507708])
# fit a Ridge model and get coefficients
rid_reg = Ridge(alpha = a_range[idx_max])
rid_reg.fit(Xs, y)
rid_coefs = rid_reg.coef_[0]
rid_coefs
array([-0.27768022, 1.72217699, 1.14585851, -0.05468923, 1.04316768, 1.15328581, 0.49688341, -0.36724293, -0.75164045])
# get the side-by-side comparison of coefficients
compare = pd.DataFrame(data=zip(lin_coefs,rid_coefs),index=X_df.columns,
columns = ['OLS','Ridge'])
compare
| | OLS | Ridge |
|---|---|---|
| price | -0.351614 | -0.277680 |
| minHeightFront | 1.923563 | 1.722177 |
| rivetHeightFront | 1.402808 | 1.145859 |
| maxWidthFront | -0.360150 | -0.054689 |
| minWidthFront | 1.379461 | 1.043168 |
| maxHeightBack | 1.665685 | 1.153286 |
| minHeightBack | 0.175957 | 0.496883 |
| maxWidthBack | -0.360566 | -0.367243 |
| minWidthBack | -1.215077 | -0.751640 |
# get coefficients for a range of alpha values
a_range = np.linspace(0,1000,100)
rid_coefs = []
for a in a_range:
rid_reg = Ridge(alpha=a)
rid_reg.fit(Xs,y)
rid_coefs.append(rid_reg.coef_[0])
# visualize coefficients-alpha relation (one line for each coefficient beta_i)
plt.figure(figsize=(12,6))
plt.plot(a_range, rid_coefs, linewidth = 2)
plt.axhline(0, linewidth = 4, c = 'black', linestyle = '--', alpha = 0.5)
plt.scatter([0]*len(lin_coefs), lin_coefs)
plt.xlabel('$\\alpha$', fontsize=14)
plt.ylabel('Coefficients', fontsize=14)
plt.legend(X_df.columns, bbox_to_anchor=[1,0.5], loc='center left')
plt.show()
Lasso regression is very similar to Ridge regression, although the results can be quite a bit different! The only difference between the two methods is the regularization term. For Lasso, the cost function is:
$$\sum_{i=1}^N (y_i - \hat{y}_i)^2 + \alpha \sum_{i=1}^p |\beta_i| $$

That is, instead of squaring the coefficients, we are taking their absolute values. Although this might not seem like a big difference, it has a very interesting consequence: Lasso can shrink some coefficients all the way to exactly zero, effectively removing those features from the model. In other words, Lasso can perform feature selection.
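Using the same made-up numbers as the Ridge illustration above, only the penalty term changes:
# Illustration only: Lasso cost with hypothetical residuals and coefficients
alpha = 1.0
residuals = np.array([0.5, -0.3, 0.1])   # y_i - y_hat_i
betas = np.array([2.0, -1.0, 0.5])       # coefficients (intercept excluded)
lasso_cost = np.sum(residuals**2) + alpha*np.sum(np.abs(betas))
print(lasso_cost)                        # 0.35 + 1.0*3.5 = 3.85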
The implementation, however, is identical:
from sklearn.linear_model import Lasso
# optimal alpha value search
a_range = np.linspace(1e-5,0.2,100)
k = 10
ss = SS()
avg_tr_score=[]
avg_te_score=[]
for a in a_range:
las_reg = Lasso(alpha=a)
train_scores, test_scores = do_Kfold(las_reg, X, y, k, ss)
avg_tr_score.append(np.mean(train_scores))
avg_te_score.append(np.mean(test_scores))
# plot score-alpha value curve
plt.figure(figsize=(8,8))
#plt.plot(a_range, avg_tr_score, color='k', label='Training')
plt.plot(a_range, avg_te_score, color='r', label='Testing')
plt.xlabel('$\\alpha$', fontsize=14)
plt.ylabel('Avg. $R^2$', fontsize=14)
plt.legend()
plt.show()
# find the optimal alpha and its corresponding scores
idx_max = np.argmax(avg_te_score)
print('Optimal alpha in the range tested: ', a_range[idx_max])
print('Avg. training score at this value: ', avg_tr_score[idx_max])
print('Avg. testing score at this value: ', avg_te_score[idx_max])
# find the coefficients of the Lasso model with the optimal alpha
las_reg = Lasso(alpha = a_range[idx_max])
las_reg.fit(Xs, y)
las_coefs = las_reg.coef_
las_coefs
# add las_coefs as an additional column to compare
compare['Lasso'] = las_coefs
compare
# Get coefficents of Lasso models for a range of alpha values
a_range = np.linspace(1e-6,10,100)
las_coefs = []
for a in a_range:
las_reg = Lasso(alpha=a)
las_reg.fit(Xs,y)
las_coefs.append(las_reg.coef_)
# visualize the coefficients-alpha relation
plt.figure(figsize=(12,6))
plt.plot(a_range, las_coefs)
plt.scatter([0]*len(lin_coefs), lin_coefs)
plt.xlabel('$\\alpha$', fontsize=14)
plt.ylabel('Coefficients', fontsize=14)
plt.legend(X_df.columns, bbox_to_anchor=[1,0.5], loc='center left')
plt.show()
# what's the relation between coefficients and correlation?
X_numeric = X_df.copy()
X_numeric['target'] = y
X_numeric.corr()