# Generate data

import numpy as np
import matplotlib.pyplot as plt

n = 10
np.random.seed(146)

# Create a list of values for our independent variable (or feature), x
x = np.random.normal(size=(n,1))

# Create our dependent variable (or target), using y=1+2x
y = 1 + 2*x

# Get a scatterplot of x and y
plt.scatter(x,y)
plt.title('Our randomly generated data')
plt.xlabel('x')
plt.ylabel('y')
plt.show()


# Fit the model

from sklearn.linear_model import LinearRegression as LR
lin_reg = LR()
lin_reg.fit(x,y)

LinearRegression()

LinearRegression()


# get coefficients 
lin_reg.coef_

array([[2.]])


# get intercept
lin_reg.intercept_

array([1.])


# predict y_pred from x

y_pred = lin_reg.predict(x)


# compare ground truth y and predicted y_pred
# What do you notice?
for i in range(len(y)):
    print(y[i], y_pred[i])

[5.79295713] [5.79295713]
[0.92379424] [0.92379424]
[-1.10704509] [-1.10704509]
[-0.76260764] [-0.76260764]
[2.37613321] [2.37613321]
[0.4317582] [0.4317582]
[-2.04866708] [-2.04866708]
[2.39370196] [2.39370196]
[0.93701067] [0.93701067]
[1.41461373] [1.41461373]


# Use the fitted model to make predictions for new x values

new_x = np.random.normal(size=(20,1))

y_pred = lin_reg.predict(new_x)

# Visualize original data
plt.scatter(x,y, label='Original data', color='k')
# Visualize new data with predicted y
plt.scatter(new_x, y_pred, label='Predicted values', color='r')
plt.title('Our randomly generated data')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

n

10


# Generate noisy data

# generate noise with normal distribution
noise_strength = 0.5
np.random.seed(147)
noise = np.random.normal(scale=noise_strength, size=(n,1))

# add noise to y
y = 1 + 2*x + noise

# visualize x and y with scatterplot
plt.scatter(x,y, label='Original data', color='k')
#plt.title('Our randomly generated data')
plt.xlabel('x')
plt.ylabel('y')
plt.show()


# Fit the model

lin_reg = LR()
lin_reg.fit(x,y)

print('Model coefficient: ', lin_reg.coef_[0][0])
print('Model intercept: ', lin_reg.intercept_[0])

Model coefficient:  2.2013300386209123
Model intercept:  1.1712556297781649


# compare ground truth y and predicted y_pred

y_pred = lin_reg.predict(x)

for i in range(len(y)):
    print(y[i], y_pred[i])

[6.49089735] [6.44669588]
[0.22415845] [1.08737861]
[-1.45024263] [-1.14789519]
[-0.85443967] [-0.76878495]
[3.74219601] [2.68591731]
[0.15072735] [0.54581176]
[-1.26707095] [-2.18430558]
[2.83504885] [2.70525462]
[0.93547602] [1.10192548]
[1.29285364] [1.62760646]


# Visualize the prediction

# generate x and y range (for the model)
x_range = [min(x), max(x)]
y_pred = lin_reg.predict(x_range)



plt.figure(figsize = (10,6))
plt.scatter(x,y, label='Original data', color='k')
#plot the model (red line)
plt.plot(x_range, y_pred, label='Model', color='r')
plt.title('Our randomly generated data')
plt.xlabel('x')
plt.ylabel('y')
plt.show()


# Visualize the errors with the squares -- the area of the small boxes represents squared error 

def VizSquaredErrors(x,y,model):
    # Function will plot x and y, show the best fit line as well as the squared errors, and return the raw error terms
    
    # Fit the model and plot the data
    model.fit(x,y)
    yp = model.predict(x)
    errors = abs(y - yp)
    plt.scatter(x,y,color='black',label='Actual Data')
    
    # Compute a range of x values to plot the model as a continuous line
    x_rng = np.linspace(min(x),max(x),20)
    y_pred = model.predict(x_rng)
    plt.plot(x_rng,y_pred,color='red',label='Fitted Model')
    
    # Draw squares at each data point indicating the squared error
    ax = plt.gca()
    for i,xi in enumerate(x):
        r = plt.Rectangle((xi, min(y[i],yp[i])),width=errors[i],height=errors[i],facecolor='blue',fill=True,alpha=0.1)
        ax.add_patch(r)
    plt.axis('equal')
    plt.xlabel('$x$')
    plt.ylabel('$y$')
    plt.legend()
    plt.show()
    return errors


VizSquaredErrors(x,y,lin_reg)

array([[0.04420147],
       [0.86322017],
       [0.30234744],
       [0.08565473],
       [1.0562787 ],
       [0.39508441],
       [0.91723463],
       [0.12979423],
       [0.16644946],
       [0.33475283]])


# Manually calculate MSE (less efficient way but show you what is the math behind)

#We recreate the model just in case we ran other code in between
lin_reg.fit(x, y)

print(np.shape(x), np.shape(y))

# calculate MSE based on the definition

y_pred = lin_reg.predict(x)

errors = y-y_pred

mse = np.mean(errors**2)
mse

(10, 1) (10, 1)

0.31156006909071987


# Calculate MSE using sklearn

from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y, y_pred)
print(mse)

0.31156006909071987


# Get score

print('Model score: ', lin_reg.score(x,y))
print('Model MSE: ', mean_squared_error(lin_reg.predict(x),y))

Model score:  0.9449458126811453
Model MSE:  0.31156006909071987


# The coefficient of determination is the correlation coefficient squared

r = np.corrcoef(x,y,rowvar=False)[0][1]
print(r)
print(r**2)

0.9720832334122141
0.9449458126811452


# Generate data 

# uniformally generate 100 x values from [0,15]
n = 100
x = np.linspace(0,15,n)


# generate noise
noise_strength=0.5
np.random.seed(146)
noise = np.random.normal(scale=noise_strength,size=n)

# generate y
y = 1 + 2*np.sin(x) + noise

# scatterplot of x and y
plt.scatter(x,y)
plt.xlabel('x')
plt.ylabel('y')
plt.show()


# Fit a linear model
#What do you think of this?

lin_reg = LR()
lin_reg.fit(x.reshape(-1,1),y)

print(lin_reg.score(x.reshape(-1,1), y))

y_pred = lin_reg.predict(x.reshape(-1,1))

# scatterplot of data
plt.scatter(x,y, label='Data', color='k')
# scatterplot of the model
plt.scatter(x, y_pred, label='Model', color='r')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

0.0027690820574749475


# Transform data with np.sin()

z = np.sin(x)

# Visualize the transformed data z and y; they look linearly correlated now! 
plt.scatter(z,y)
plt.xlabel('$z=\sin(x)$')
plt.ylabel('y')
plt.show()


# Fit it again with lin_reg; See what is the input/feature now?

lin_reg.fit(z.reshape(-1,1),y)
lin_reg.score(z.reshape(-1,1),y)

0.8914793482862062


import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as LR

import pandas as pd
concrete = pd.read_excel('./data/Concrete_Data.xls')
concrete.head()


# Simplify column names

concrete.columns = [item.split('(')[0].rstrip().replace(' ','_') for item in concrete.columns]
concrete.head()


# Separate X and y into two dataframes

X = concrete.drop(columns = ['Concrete_compressive_strength'])
y = concrete['Concrete_compressive_strength'].values


# fit the model, get R^2

lin_reg = LR()
lin_reg.fit(X,y)
lin_reg.score(X,y)

0.6154647342687214


# get MSE

from sklearn.metrics import mean_squared_error
mean_squared_error(lin_reg.predict(X),y)

107.21180273450533


# showing you the math behind lin_reg.predict()

obs = np.array(X.iloc[0])
print('Observation:', obs, '\n')
print('Coefficients:', lin_reg.coef_, '\n')
print('Intercept:', lin_reg.intercept_, '\n')
sum(obs * lin_reg.coef_) + lin_reg.intercept_

Observation: [ 540.     0.     0.   162.     2.5 1040.   676.    28. ] 

Coefficients: [ 0.11978526  0.10384725  0.08794308 -0.1502979   0.29068694  0.01803018
  0.02015446  0.11422562] 

Intercept: -23.163755811078822

53.472859099719024


# same value as the above! why?
lin_reg.predict([obs])

/opt/conda/lib/python3.10/site-packages/sklearn/base.py:420: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(

array([53.4728591])


# Visualize prediction -- perfect prediction means all dots are on the dashed diagonal line (y=x)

y_pred = lin_reg.predict(X)

plt.figure(figsize=(8,8))
plt.scatter(y, y_pred, alpha=0.5, ec='k')
# plotting the dashed diagnal
plt.plot([min(y), max(y)],[min(y),max(y)], '--k')
plt.axis('equal')
plt.xlabel('Actual Compressive Strength', fontsize=14)
plt.ylabel('Predicted Compressive Strength', fontsize=14)
plt.show()


from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler as SS


#Do we need to scale the data???
#Go to slide?
import seaborn as sns
sns.stripplot(data = concrete, orient = 'h')

<Axes: >


# do standard scaling before PCA

ss = SS()
X_scaled = ss.fit_transform(X)

sns.stripplot(data = X_scaled, orient = 'h')

<Axes: >


# PCA transformation

pca = PCA()
pca_X = pca.fit_transform(X_scaled)


# think: what does each number mean? how is each number calculated? Do the same for the following 2 as well
pca.explained_variance_

array([2.28210403, 1.41758764, 1.3415455 , 1.01513995, 0.95252658,
       0.79089808, 0.17790128, 0.03007148])


pca.explained_variance_ratio_

array([0.28498605, 0.17702642, 0.16753038, 0.1267693 , 0.11895022,
       0.09876628, 0.02221607, 0.00375529])


np.cumsum(pca.explained_variance_ratio_)

array([0.28498605, 0.46201247, 0.62954284, 0.75631214, 0.87526237,
       0.97402864, 0.99624471, 1.        ])


# Fit the model after SS and PCA

lin_reg = LR()
X = pca_X[:,:6]

lin_reg.fit(X,y)
lin_reg.score(X,y)

0.5651274964358977


# Trend of R^2 score by taking different number of PCs
# what are the 8 printed values? Which one is the same as the above and why?

for i in range(1,len(pca.explained_variance_ratio_)+1):
    lin_reg = LR()
    X = pca_X[:,:i]
    lin_reg.fit(X,y)
    #print(mean_squared_error(lin_reg.predict(X),y))
    print(lin_reg.score(X,y))

0.00229144068487952
0.03236818339746361
0.3333775331909373
0.33337758099668935
0.4407687937872554
0.5651274964358977
0.6005314246794745
0.6154647342687214

	Cement (component 1)(kg in a m^3 mixture)	Blast Furnace Slag (component 2)(kg in a m^3 mixture)	Water (component 4)(kg in a m^3 mixture)	Superplasticizer (component 5)(kg in a m^3 mixture)	Coarse Aggregate (component 6)(kg in a m^3 mixture)	Fine Aggregate (component 7)(kg in a m^3 mixture)	Age (day)	Concrete compressive strength(MPa, megapascals)
0	540.0	0.0	162.0	2.5	1040.0	676.0	28	79.986111
1	540.0	0.0	162.0	2.5	1055.0	676.0	28	61.887366
2	332.5	142.5	228.0	0.0	932.0	594.0	270	40.269535
3	332.5	142.5	228.0	0.0	932.0	594.0	365	41.052780
4	198.6	132.4	192.0	0.0	978.4	825.5	360	44.296075

	Cement	Blast_Furnace_Slag	Water	Superplasticizer	Coarse_Aggregate	Fine_Aggregate	Age	Concrete_compressive_strength
0	540.0	0.0	162.0	2.5	1040.0	676.0	28	79.986111
1	540.0	0.0	162.0	2.5	1055.0	676.0	28	61.887366
2	332.5	142.5	228.0	0.0	932.0	594.0	270	40.269535
3	332.5	142.5	228.0	0.0	932.0	594.0	365	41.052780
4	198.6	132.4	192.0	0.0	978.4	825.5	360	44.296075

Linear regression¶

1. Simple Example with Simulated Data¶

1.1 Generate data¶

1.2 Fit the model¶

1.3 Observe the model¶

2. An example with some random noise¶

3. Scoring the model¶

4. Non-linear models¶

5. OLS on real data (Multiple linear regression)¶

5.1 Fit & score the model¶

5.2 The effect of pre-processing¶