Reference: Raschka et al.
https://cfteach.github.io/NNDL_DATA621/referencesmc.html
9. Code examples for Data Preprocessing#
import numpy as np
from IPython.display import Image
%matplotlib inline
9.1. Dealing with missing data#
Identifying missing values in tabular data
import pandas as pd
from io import StringIO
import sys
csv_data = \
'''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''
# If you are using Python 2.7, you need
# to convert the string to unicode:
if (sys.version_info < (3, 0)):
    csv_data = unicode(csv_data)
# StringIO simulates a file-like object in memory,
# as if we were reading a regular CSV file from disk
df = pd.read_csv(StringIO(csv_data))
df
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | 1.0 | 2.0 | 3.0 | 4.0 |
| 1 | 5.0 | 6.0 | NaN | 8.0 |
| 2 | 10.0 | 11.0 | 12.0 | NaN |
df.isnull()
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | False | False | False | False |
| 1 | False | False | True | False |
| 2 | False | False | False | True |
df.isnull().sum(axis=0)
|   | 0 |
|---|---|
| A | 0 |
| B | 0 |
| C | 1 |
| D | 1 |
data = pd.Series([1, np.nan, 3, None, 5])
print(data.isnull())
0 False
1 True
2 False
3 True
4 False
dtype: bool
# access the underlying NumPy array
# via the `values` attribute
df.values
array([[ 1., 2., 3., 4.],
[ 5., 6., nan, 8.],
[10., 11., 12., nan]])
data = np.array([1, np.nan, 3, None], dtype=float)
data
print(np.isnan(data))
[False True False True]
Eliminating training examples or features with missing values
df
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | 1.0 | 2.0 | 3.0 | 4.0 |
| 1 | 5.0 | 6.0 | NaN | 8.0 |
| 2 | 10.0 | 11.0 | 12.0 | NaN |
# remove rows that contain missing values
df.dropna(axis=0)
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | 1.0 | 2.0 | 3.0 | 4.0 |
# remove columns that contain missing values
df.dropna(axis=1)
|   | A | B |
|---|---|---|
| 0 | 1.0 | 2.0 |
| 1 | 5.0 | 6.0 |
| 2 | 10.0 | 11.0 |
# only drop rows where all columns are NaN
df.dropna(how='all')
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | 1.0 | 2.0 | 3.0 | 4.0 |
| 1 | 5.0 | 6.0 | NaN | 8.0 |
| 2 | 10.0 | 11.0 | 12.0 | NaN |
# only drop rows that have fewer than 4 non-NaN values
df.dropna(thresh=4)
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | 1.0 | 2.0 | 3.0 | 4.0 |
# only drop rows where NaN appear in specific columns (here: 'C')
df.dropna(subset=['C'])
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | 1.0 | 2.0 | 3.0 | 4.0 |
| 2 | 10.0 | 11.0 | 12.0 | NaN |
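Before dropping anything, it can be useful to inspect exactly which rows are incomplete. A small sketch (not part of the original notebook) using boolean masking on the same df:
# rows that contain at least one missing value
print(df[df.isnull().any(axis=1)])
# fraction of missing entries per column
print(df.isnull().mean())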
9.2. Imputing missing values#
# again: our original array
df.values
array([[ 1., 2., 3., 4.],
[ 5., 6., nan, 8.],
[10., 11., 12., nan]])
# impute missing values via the column mean
from sklearn.impute import SimpleImputer
import numpy as np
imr = SimpleImputer(missing_values=np.nan, strategy='mean') #other methods: 'median', 'most_frequent'
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data
array([[ 1. , 2. , 3. , 4. ],
[ 5. , 6. , 7.5, 8. ],
[10. , 11. , 12. , 6. ]])
df.fillna(df.mean())
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | 1.0 | 2.0 | 3.0 | 4.0 |
| 1 | 5.0 | 6.0 | 7.5 | 8.0 |
| 2 | 10.0 | 11.0 | 12.0 | 6.0 |
df
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | 1.0 | 2.0 | 3.0 | 4.0 |
| 1 | 5.0 | 6.0 | NaN | 8.0 |
| 2 | 10.0 | 11.0 | 12.0 | NaN |
dg = df.copy()
# locate the (row, column) position of the value 5.0
row, col = (dg == 5.0).stack().idxmax()
print(row, col)
1 A
# manually introduce a missing value at row 0, column 'A'
dg.loc[0,'A'] = None
dg
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | NaN | 2.0 | 3.0 | 4.0 |
| 1 | 5.0 | 6.0 | NaN | 8.0 |
| 2 | 10.0 | 11.0 | 12.0 | NaN |
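Mean imputation is only one option. As an alternative sketch (assuming the same df as above, and an arbitrary choice of n_neighbors=2), scikit-learn's KNNImputer fills each missing entry from the nearest complete rows:
from sklearn.impute import KNNImputer
# each NaN is replaced by the average of that feature over the 2 nearest rows
# (nan-aware Euclidean distance computed on the observed features)
knn_imr = KNNImputer(n_neighbors=2)
print(knn_imr.fit_transform(df.values))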
9.3. Handling categorical data#
Nominal and ordinal features
import pandas as pd
df = pd.DataFrame([['green', 'M', 10.1, 'class2'],
['red', 'L', 13.5, 'class1'],
['blue', 'XL', 15.3, 'class2']])
df.columns = ['color', 'size', 'price', 'classlabel']
df
|   | color | size | price | classlabel |
|---|---|---|---|---|
| 0 | green | M | 10.1 | class2 |
| 1 | red | L | 13.5 | class1 |
| 2 | blue | XL | 15.3 | class2 |
dg = pd.DataFrame([['green','XL','class1'],['yellow','M','class1'],['red','L','class2'],['blue','S','class2']])
dg.columns = ['colors','size','label']
dg
|   | colors | size | label |
|---|---|---|---|
| 0 | green | XL | class1 |
| 1 | yellow | M | class1 |
| 2 | red | L | class2 |
| 3 | blue | S | class2 |
mapping = {'XL':4,'L':3,'M':2,'S':1}
dg['size'] = dg['size'].map(mapping)
dg
|   | colors | size | label |
|---|---|---|---|
| 0 | green | 4 | class1 |
| 1 | yellow | 2 | class1 |
| 2 | red | 3 | class2 |
| 3 | blue | 1 | class2 |
for k, s in mapping.items():
    print(k, s)
XL 4
L 3
M 2
S 1
inverse_mapping = {v: k for k, v in mapping.items()}
inverse_mapping
{4: 'XL', 3: 'L', 2: 'M', 1: 'S'}
dg['size'] = dg['size'].map(inverse_mapping)
dg
|   | colors | size | label |
|---|---|---|---|
| 0 | green | XL | class1 |
| 1 | yellow | M | class1 |
| 2 | red | L | class2 |
| 3 | blue | S | class2 |
Mapping ordinal features
size_mapping = {'XL': 3,
'L': 2,
'M': 1}
df['size'] = df['size'].map(size_mapping)
df
|   | color | size | price | classlabel |
|---|---|---|---|---|
| 0 | green | 1 | 10.1 | class2 |
| 1 | red | 2 | 13.5 | class1 |
| 2 | blue | 3 | 15.3 | class2 |
inv_size_mapping = {v: k for k, v in size_mapping.items()}
df['size'].map(inv_size_mapping)
|   | size |
|---|---|
| 0 | M |
| 1 | L |
| 2 | XL |
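An alternative to a hand-written mapping dictionary, sketched here under the assumption that the category order is known up front, is pandas' ordered Categorical dtype, which keeps the string labels but makes the ordering explicit:
# encode 'size' as an ordered categorical; .codes gives the integer codes
sizes = pd.Categorical(['M', 'L', 'XL'],
                       categories=['M', 'L', 'XL'], ordered=True)
print(sizes.codes)    # [0 1 2]
print(sizes < 'XL')   # elementwise comparison respects the declared order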
9.4. Encoding class labels#
import numpy as np
# create a mapping dict
# to convert class labels from strings to integers
class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}
class_mapping
{'class1': 0, 'class2': 1}
# to convert class labels from strings to integers
df['classlabel'] = df['classlabel'].map(class_mapping)
df
|   | color | size | price | classlabel |
|---|---|---|---|---|
| 0 | green | 1 | 10.1 | 1 |
| 1 | red | 2 | 13.5 | 0 |
| 2 | blue | 3 | 15.3 | 1 |
# reverse the class label mapping
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df
|   | color | size | price | classlabel |
|---|---|---|---|---|
| 0 | green | 1 | 10.1 | class2 |
| 1 | red | 2 | 13.5 | class1 |
| 2 | blue | 3 | 15.3 | class2 |
df['classlabel']
|   | classlabel |
|---|---|
| 0 | class2 |
| 1 | class1 |
| 2 | class2 |
df['classlabel'].values
array(['class2', 'class1', 'class2'], dtype=object)
df
|   | color | size | price | classlabel |
|---|---|---|---|---|
| 0 | green | 1 | 10.1 | class2 |
| 1 | red | 2 | 13.5 | class1 |
| 2 | blue | 3 | 15.3 | class2 |
from sklearn.preprocessing import LabelEncoder
# Label encoding with sklearn's LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y
array([1, 0, 1])
# reverse mapping
class_le.inverse_transform(y)
array(['class2', 'class1', 'class2'], dtype=object)
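The fitted encoder also stores the label-to-integer correspondence, which is handy for sanity checks (a small aside, not in the original notebook):
# classes_ lists the original labels in the order of their integer codes
print(class_le.classes_)   # ['class1' 'class2']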
9.5. Performing one-hot encoding on nominal features#
df
|   | color | size | price | classlabel |
|---|---|---|---|---|
| 0 | green | 1 | 10.1 | class2 |
| 1 | red | 2 | 13.5 | class1 |
| 2 | blue | 3 | 15.3 | class2 |
X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X
array([[1, 1, 10.1],
[2, 2, 13.5],
[0, 3, 15.3]], dtype=object)
np.shape(X[:, 0])
(3,)
np.shape(X[:, 0].reshape(-1, 1))
(3, 1)
from sklearn.preprocessing import OneHotEncoder
X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()
color_ohe.fit_transform(X[:, 0].reshape(-1, 1)).toarray()
array([[0., 1., 0.],
[0., 0., 1.],
[1., 0., 0.]])
np.shape(color_ohe.fit_transform(X[:, 0].reshape(-1, 1)))
(3, 3)
np.shape(color_ohe.fit_transform(X[:, 0].reshape(-1, 1)).toarray())
(3, 3)
from sklearn.compose import ColumnTransformer
X = df[['color', 'size', 'price']].values
c_transf = ColumnTransformer([ ('onehot', OneHotEncoder(), [0]),
('nothing', 'passthrough', [1, 2])])
c_transf.fit_transform(X).astype(float)
array([[ 0. , 1. , 0. , 1. , 10.1],
[ 0. , 0. , 1. , 2. , 13.5],
[ 1. , 0. , 0. , 3. , 15.3]])
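One practical caveat: if the fitted encoder later sees a color that was not present during fitting, the default OneHotEncoder raises an error. A hedged sketch of the usual guard, handle_unknown='ignore' (an assumption about how you want unseen categories treated, not part of the notebook above):
# unseen categories are encoded as an all-zeros row instead of raising an error
safe_ohe = OneHotEncoder(handle_unknown='ignore')
safe_ohe.fit(X[:, 0].reshape(-1, 1))
print(safe_ohe.transform([['purple']]).toarray())   # [[0. 0. 0.]]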
dh = pd.DataFrame([['green',10.2, 1],['blue',8.2, 1],['yellow',9.8, 0]])
dh.columns = ['colors','price','label']
dh
|   | colors | price | label |
|---|---|---|---|
| 0 | green | 10.2 | 1 |
| 1 | blue | 8.2 | 1 |
| 2 | yellow | 9.8 | 0 |
dh['colors']
|   | colors |
|---|---|
| 0 | green |
| 1 | blue |
| 2 | yellow |
# get_dummies one-hot encodes only the string column 'colors' and leaves the numeric columns unchanged
new_var = pd.get_dummies(dh[['colors','price','label']])
df
|   | color | size | price | classlabel |
|---|---|---|---|---|
| 0 | green | 1 | 10.1 | class2 |
| 1 | red | 2 | 13.5 | class1 |
| 2 | blue | 3 | 15.3 | class2 |
# one-hot encoding via pandas
pd.get_dummies(df[['price', 'color', 'size']]).astype(int)
|   | price | size | color_blue | color_green | color_red |
|---|---|---|---|---|---|
| 0 | 10 | 1 | 0 | 1 | 0 |
| 1 | 13 | 2 | 0 | 0 | 1 |
| 2 | 15 | 3 | 1 | 0 | 0 |
# multicollinearity guard in get_dummies: drop_first removes one redundant dummy column
pd.get_dummies(df[['price', 'color', 'size']], drop_first=True)
|   | price | size | color_green | color_red |
|---|---|---|---|---|
| 0 | 10.1 | 1 | True | False |
| 1 | 13.5 | 2 | False | True |
| 2 | 15.3 | 3 | False | False |
# multicollinearity guard for the OneHotEncoder
color_ohe = OneHotEncoder(categories='auto', drop='first')
c_transf = ColumnTransformer([ ('onehot', color_ohe, [0]),
('nothing', 'passthrough', [1, 2])])
c_transf.fit_transform(X).astype(float)
array([[ 1. , 0. , 1. , 10.1],
[ 0. , 1. , 2. , 13.5],
[ 0. , 0. , 3. , 15.3]])
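In practice, these encoding steps are usually chained with an estimator so that exactly the same preprocessing is applied at training and prediction time. A minimal sketch, assuming the c_transf defined above and an arbitrary choice of LogisticRegression as the classifier:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
# the pipeline refits the column transformer and the classifier together
pipe = make_pipeline(c_transf, LogisticRegression())
pipe.fit(X, df['classlabel'].values)
print(pipe.predict(X))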
9.6. Partitioning a dataset into separate training and test sets#
df_wine = pd.read_csv('https://archive.ics.uci.edu/'
'ml/machine-learning-databases/wine/wine.data',
header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
'Alcalinity of ash', 'Magnesium', 'Total phenols',
'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
'Proline']
print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()
Class labels [1 2 3]
|   | Class label | Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 |
| 1 | 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 |
| 2 | 1 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 |
| 3 | 1 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 |
| 4 | 1 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 |
from sklearn.model_selection import train_test_split
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test =\
train_test_split(X, y,
test_size=0.3,
random_state=0,
stratify=y)
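A quick way to verify that stratify=y preserved the class proportions is to compare the label counts in the full dataset and in the two splits (a small check, not part of the original code):
# class counts before and after the split; the proportions should match closely
print('Label counts in y:', np.bincount(y))
print('Label counts in y_train:', np.bincount(y_train))
print('Label counts in y_test:', np.bincount(y_test))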
9.7. Bringing features onto the same scale#
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)
A visual example:
ex = np.array([0, 1, 2, 3, 4, 5])
print('standardized:', (ex - ex.mean()) / ex.std())
# Please note that pandas uses ddof=1 (sample standard deviation)
# by default, whereas NumPy's std method and the StandardScaler
# use ddof=0 (population standard deviation)
# normalize
print('normalized:', (ex - ex.min()) / (ex.max() - ex.min()))
standardized: [-1.46385011 -0.87831007 -0.29277002 0.29277002 0.87831007 1.46385011]
normalized: [0. 0.2 0.4 0.6 0.8 1. ]
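Both min-max scaling and standardization are sensitive to extreme values. If the data contain pronounced outliers, scikit-learn's RobustScaler, which centers on the median and scales by the interquartile range, is a common alternative; a minimal sketch on the same Wine split:
from sklearn.preprocessing import RobustScaler
# fit on the training data only, then reuse the same statistics for the test data
rbs = RobustScaler()
X_train_robust = rbs.fit_transform(X_train)
X_test_robust = rbs.transform(X_test)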
9.8. Selecting meaningful features#
Sparse solutions with L1 regularization
For regularized models in scikit-learn that support L1 regularization, we can simply set the penalty parameter to 'l1' to obtain a sparse solution:
from sklearn.linear_model import LogisticRegression
LogisticRegression(penalty='l1')
LogisticRegression(penalty='l1')
Applied to the standardized Wine data …
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', multi_class='ovr')
# Note that C=1.0 is the default. You can increase
# or decrease it to make the regularization effect
# stronger or weaker, respectively.
lr.fit(X_train_std, y_train)
print('Training accuracy:', lr.score(X_train_std, y_train))
print('Test accuracy:', lr.score(X_test_std, y_test))
Training accuracy: 1.0
Test accuracy: 1.0
lr.intercept_
array([-1.26352457, -1.21576785, -2.3711671 ])
np.set_printoptions(8)
lr.coef_[lr.coef_!=0].shape
(23,)
lr.coef_
array([[ 1.24574039, 0.18046984, 0.74484363, -1.16251576, 0. ,
0. , 1.16495206, 0. , 0. , 0. ,
0. , 0.55194572, 2.50968308],
[-1.53727681, -0.38718475, -0.99524098, 0.36476702, -0.05956256,
0. , 0.6679675 , 0. , 0. , -1.93383967,
1.23411639, 0. , -2.23183041],
[ 0.13539093, 0.16972595, 0.35779691, 0. , 0. ,
0. , -2.43329298, 0. , 0. , 1.56172158,
-0.81756997, -0.49748177, 0. ]])
np.shape(lr.coef_)
(3, 13)
lr.coef_[1]
array([-1.53727681, -0.38718475, -0.99524098, 0.36476702, -0.05956256,
0. , 0.6679675 , 0. , 0. , -1.93383967,
1.23411639, 0. , -2.23183041])
import matplotlib.pyplot as plt
fig = plt.figure()
ax = plt.subplot(111)
colors = ['blue', 'green', 'red', 'cyan',
'magenta', 'yellow', 'black',
'pink', 'lightgreen', 'lightblue',
'gray', 'indigo', 'orange']
weights, params = [], []
for c in np.arange(-4., 6.):
    lr = LogisticRegression(penalty='l1', C=10.**c, solver='liblinear',
                            multi_class='ovr', random_state=0)
    lr.fit(X_train_std, y_train)
    weights.append(lr.coef_[1])
    params.append(10**c)
weights = np.array(weights)
print(np.shape(weights)) #dims of weights (10,13)
# plot how each weight (one per feature) changes with the inverse regularization strength C
for column, color in zip(range(weights.shape[1]), colors):
    print(column, color)
    plt.plot(params, weights[:, column],
             label=df_wine.columns[column + 1],
             color=color)
plt.axhline(0, color='black', linestyle='--', linewidth=3)
plt.xlim([10**(-5), 10**5])
plt.ylabel('Weight coefficient')
plt.xlabel('C (inverse regularization strength)')
plt.xscale('log')
plt.legend(loc='upper left')
ax.legend(loc='upper center',
bbox_to_anchor=(1.38, 1.03),
ncol=1, fancybox=True)
#plt.savefig('figures/04_08.png', dpi=300,
# bbox_inches='tight', pad_inches=0.2)
plt.show()
(10, 13)
0 blue
1 green
2 red
3 cyan
4 magenta
5 yellow
6 black
7 pink
8 lightgreen
9 lightblue
10 gray
11 indigo
12 orange
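Since the L1 penalty drives many weights exactly to zero, the surviving features can be read off from the coefficient matrix. A small sketch, added here for illustration, that refits the C=1.0 model and lists the feature names with nonzero weights for the second class:
lr = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', multi_class='ovr')
lr.fit(X_train_std, y_train)
# feature names whose weight for class index 1 is nonzero
nonzero = np.abs(lr.coef_[1]) > 0
print(df_wine.columns[1:][nonzero])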
9.9. Assessing feature importance with Random Forests#
from sklearn.ensemble import RandomForestClassifier
feat_labels = df_wine.columns[1:]
forest = RandomForestClassifier(n_estimators=500,
random_state=1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))
plt.title('Feature importance')
plt.bar(range(X_train.shape[1]),
importances[indices],
align='center')
plt.xticks(range(X_train.shape[1]),
feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
# plt.savefig('figures/04_10.png', dpi=300)
plt.show()
1) Proline 0.185453
2) Flavanoids 0.174751
3) Color intensity 0.143920
4) OD280/OD315 of diluted wines 0.136162
5) Alcohol 0.118529
6) Hue 0.058739
7) Total phenols 0.050872
8) Magnesium 0.031357
9) Malic acid 0.025648
10) Proanthocyanins 0.025570
11) Alcalinity of ash 0.022366
12) Nonflavanoid phenols 0.013354
13) Ash 0.013279
from sklearn.feature_selection import SelectFromModel
sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
X_selected = sfm.transform(X_train)
print('Number of features that meet this threshold criterion:',
X_selected.shape[1])
Number of features that meet this threshold criterion: 5
Now, let’s print the five features that met the threshold criterion for feature selection that we set earlier (note that this code snippet does not appear in the actual book but was added to this notebook for illustrative purposes):
for f in range(X_selected.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))
1) Proline 0.185453
2) Flavanoids 0.174751
3) Color intensity 0.143920
4) OD280/OD315 of diluted wines 0.136162
5) Alcohol 0.118529
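The same selection can also be expressed through the selector's boolean mask, which is convenient when the column names are needed downstream (a small addition for illustration):
# boolean mask of the columns that meet the 0.1 importance threshold
mask = sfm.get_support()
print(feat_labels[mask])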
Example: feature importance for regression problems
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import fetch_california_housing
# Load sample data
data = fetch_california_housing()
X, y = data.data, data.target
# Fit Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)
# Print feature importance
feature_importance = model.feature_importances_
for i, imp in enumerate(feature_importance):
    print(f"Feature {data.feature_names[i]}: {imp}")
Feature MedInc: 0.5200367196529164
Feature HouseAge: 0.05296357881747684
Feature AveRooms: 0.04451309296326938
Feature AveBedrms: 0.029298856378707893
Feature Population: 0.03123174949895071
Feature AveOccup: 0.13640641507927073
Feature Latitude: 0.09285575343954347
Feature Longitude: 0.09269383416986465
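Sorting the importances makes the ranking easier to read; a short optional addition to the snippet above:
# rank the California housing features by importance, highest first
order = np.argsort(feature_importance)[::-1]
for i in order:
    print(f"{data.feature_names[i]:<12s} {feature_importance[i]:.4f}")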