import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
Linear Regression
Load library
Define function
def train_linear_regression(X, y):
    """Fit ordinary least squares via the normal equation.

    Prepends a bias column of ones to X and solves (X^T X) w = X^T y.

    Parameters
    ----------
    X : 2-D array of shape (n_samples, n_features)
    y : 1-D array of shape (n_samples,)

    Returns
    -------
    (w0, w) : the intercept and the 1-D array of feature weights.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    # np.linalg.solve is faster and numerically more stable than
    # explicitly inverting XTX and multiplying.
    w_full = np.linalg.solve(XTX, X.T.dot(y))

    return w_full[0], w_full[1:]
def train_linear_regression_reg(X, y, r=0.001):
    """Fit ridge-regularized linear regression via the normal equation.

    Prepends a bias column of ones to X and solves
    (X^T X + r I) w = X^T y.

    Parameters
    ----------
    X : 2-D array of shape (n_samples, n_features)
    y : 1-D array of shape (n_samples,)
    r : regularization strength; r=0 reduces to plain OLS.
        NOTE: the identity term also regularizes the bias weight,
        matching the original behavior.

    Returns
    -------
    (w0, w) : the intercept and the 1-D array of feature weights.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    # Solve the regularized system directly instead of forming the
    # explicit inverse — more stable and cheaper.
    w_full = np.linalg.solve(XTX, X.T.dot(y))

    return w_full[0], w_full[1:]
def rmse(y, y_pred):
    """Root-mean-squared error between targets y and predictions y_pred."""
    squared_errors = (y - y_pred) ** 2
    return np.sqrt(squared_errors.mean())
Data preparation
# Load the car fuel-efficiency dataset directly from GitHub.
car = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')
car.head()
engine_displacement | num_cylinders | horsepower | vehicle_weight | acceleration | model_year | origin | fuel_type | drivetrain | num_doors | fuel_efficiency_mpg | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 170 | 3.0 | 159.0 | 3413.433759 | 17.7 | 2003 | Europe | Gasoline | All-wheel drive | 0.0 | 13.231729 |
1 | 130 | 5.0 | 97.0 | 3149.664934 | 17.8 | 2007 | USA | Gasoline | Front-wheel drive | 0.0 | 13.688217 |
2 | 170 | NaN | 78.0 | 3079.038997 | 15.1 | 2018 | Europe | Gasoline | Front-wheel drive | 0.0 | 14.246341 |
3 | 220 | 4.0 | NaN | 2542.392402 | 20.2 | 2009 | USA | Diesel | All-wheel drive | 2.0 | 16.912736 |
4 | 210 | 1.0 | 140.0 | 3460.870990 | 14.4 | 2009 | Europe | Gasoline | All-wheel drive | 2.0 | 12.488369 |
# Keep only the base feature columns plus the target.
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
df = car[base].copy()
df.head()
engine_displacement | horsepower | vehicle_weight | model_year | fuel_efficiency_mpg | |
---|---|---|---|---|---|
0 | 170 | 159.0 | 3413.433759 | 2003 | 13.231729 |
1 | 130 | 97.0 | 3149.664934 | 2007 | 13.688217 |
2 | 170 | 78.0 | 3079.038997 | 2018 | 14.246341 |
3 | 220 | NaN | 2542.392402 | 2009 | 16.912736 |
4 | 210 | 140.0 | 3460.870990 | 2009 | 12.488369 |
Distribution of fuel mpg
# Histogram of the target variable with 50 bins.
sns.histplot(df.fuel_efficiency_mpg, bins=50)
The distribution of the target variable fuel_efficiency_mpg
looks approximately normal, so no log transformation is needed.
Handling missing values
# Count missing values per column.
df.isnull().sum()
engine_displacement 0
horsepower 708
vehicle_weight 0
model_year 0
fuel_efficiency_mpg 0
dtype: int64
Column horsepower
has missing values.
Median of horsepower
# Median of horsepower (NaNs are ignored by pandas).
median_horsepower = df.horsepower.median()
median_horsepower
np.float64(149.0)
Setup the framework
# Compute 60%/20%/20% train/validation/test split sizes.
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
# Assign the remainder to train so the three parts always sum to n.
n_train = n - n_val - n_test
n, n_val + n_test + n_train
(9704, 9704)
Data partitioning
Split the data in train/val/test sets, with 60%/20%/20% distribution.
# Inspect the individual split sizes (val, test, train).
n_val, n_test, n_train
(1940, 1940, 5824)
Data shuffling
Shuffle the dataset using seed 42.
# Shuffle the row indices with a fixed seed so the split is reproducible.
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

# Slice the shuffled index into train / validation / test partitions.
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train + n_val]]
df_test = df.iloc[idx[n_train + n_val:]]

df_train.head()
engine_displacement | horsepower | vehicle_weight | model_year | fuel_efficiency_mpg | |
---|---|---|---|---|---|
483 | 220 | 144.0 | 2535.887591 | 2009 | 16.642943 |
7506 | 160 | 141.0 | 2741.170484 | 2019 | 16.298377 |
8795 | 230 | 155.0 | 2471.880237 | 2017 | 18.591822 |
1688 | 150 | 206.0 | 3748.164469 | 2015 | 11.818843 |
6217 | 300 | 111.0 | 2135.716359 | 2006 | 19.402209 |
# Sanity-check the partition lengths.
len(df_train), len(df_val), len(df_test)
(5824, 1940, 1940)
Restating the index
# Reset indices after shuffling so positional indexing is clean.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Pull the target out of each frame...
y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

# ...and drop it so it can never leak in as a feature.
del df_train["fuel_efficiency_mpg"]
del df_val["fuel_efficiency_mpg"]
del df_test["fuel_efficiency_mpg"]

y_train
array([16.64294342, 16.29837715, 18.59182197, ..., 17.59323665,
18.92574819, 17.96528447], shape=(5824,))
Baseline model to predict fuel_efficiency_mpg
# Feature columns used by the models (target excluded).
cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']


def prepare_X_0(df):
    """Build the feature matrix from `cols`, filling missing values with 0."""
    df = df.copy()
    df_num = df[cols].fillna(0)
    return df_num.values
def prepare_X_mean(df, fill_value=None):
    """Build the feature matrix, filling missing horsepower values.

    Parameters
    ----------
    df : DataFrame containing at least the columns listed in `cols`.
    fill_value : optional number used to fill missing horsepower.
        Defaults to the mean of *this* frame's horsepower column,
        which matches the original behavior.
        NOTE(review): when preparing validation/test data, pass the
        TRAINING-set mean here to avoid leaking statistics from the
        evaluation split.

    Returns
    -------
    2-D numpy array of shape (len(df), len(cols)).
    """
    df = df.copy()

    if fill_value is None:
        fill_value = df['horsepower'].mean()
    df['horsepower'] = df['horsepower'].fillna(fill_value).values

    df_num = df[cols]
    return df_num.values
With 0
for missing values
# Baseline model (missing values -> 0), scored on the training set.
X_train = prepare_X_0(df_train)
w0, w = train_linear_regression(X_train, y_train)
y_pred = w0 + X_train.dot(w)
score_0 = rmse(y_train, y_pred)
score_0
np.float64(0.5202614265099076)
With mean
for missing values
# Same model with missing values filled by the column mean,
# scored on the training set.
X_train = prepare_X_mean(df_train)
w0, w = train_linear_regression(X_train, y_train)
y_pred = w0 + X_train.dot(w)
score_mean = rmse(y_train, y_pred)
score_mean
np.float64(0.46244121379599645)
Which option gives a better RMSE?
Improve model with validation data set
# Fill-with-0 model evaluated on the validation set.
X_train = prepare_X_0(df_train)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X_0(df_val)
y_pred = w0 + X_val.dot(w)
score_0 = rmse(y_val, y_pred).round(2)
score_0
np.float64(0.52)
# Fill-with-mean model evaluated on the validation set.
X_train = prepare_X_mean(df_train)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X_mean(df_val)
y_pred = w0 + X_val.dot(w)
score_mean = rmse(y_val, y_pred).round(2)
score_mean
np.float64(0.46)
Filling missing values with the mean gives a better RMSE.
Best regularization parameter
# Sweep the regularization strength and report validation RMSE for each.
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    X_train = prepare_X_0(df_train)
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    X_val = prepare_X_0(df_val)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred).round(2)

    print(r, w0, score)
0 28.827365474598718 0.52
0.01 24.926838421084422 0.52
0.1 11.239661085047366 0.52
1 1.7315979412398264 0.52
5 0.36380750207153073 0.52
10 0.18306246622476907 0.52
100 0.01841801730804525 0.52
The best regularization parameter is 0.01.
RMSE Standard Deviation
# Measure how sensitive the validation RMSE is to the random split
# by repeating the whole split/train/evaluate pipeline over 10 seeds.
all_scores = []
for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)

    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train + n_val]]
    df_test = df.iloc[idx[n_train + n_val:]]

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    y_train = df_train.fuel_efficiency_mpg.values
    y_val = df_val.fuel_efficiency_mpg.values
    y_test = df_test.fuel_efficiency_mpg.values

    del df_train["fuel_efficiency_mpg"]
    del df_val["fuel_efficiency_mpg"]
    del df_test["fuel_efficiency_mpg"]

    X_train = prepare_X_0(df_train)
    w0, w = train_linear_regression(X_train, y_train)

    X_val = prepare_X_0(df_val)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)

    all_scores.append(score)
    print(seed, w0, score)
0 27.376388746129475 0.5206531296297207
1 29.022468555840756 0.5213388912866506
2 25.904357611918527 0.5228069974913666
3 28.015450068187377 0.5159516741255491
4 25.93400040714799 0.5109129460116937
5 25.89950696140074 0.5283406460212935
6 25.91723093394924 0.5313910658190373
7 28.414324731359304 0.509067038739038
8 26.47841782776404 0.5147399129482789
9 27.513852582351465 0.513186590829269
# Standard deviation of the validation RMSE across the 10 seeds.
np.std(all_scores).round(3)
np.float64(0.007)
Evaluation on test
# Final evaluation: re-split with seed 9, retrain on train+val,
# and score on the held-out test set.
np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train + n_val]]
df_test = df.iloc[idx[n_train + n_val:]]

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

del df_train["fuel_efficiency_mpg"]
del df_val["fuel_efficiency_mpg"]
del df_test["fuel_efficiency_mpg"]

# Combine train and validation for the final fit.
df_full_train = pd.concat([df_train, df_val])
df_full_train = df_full_train.reset_index(drop=True)

X_full_train = prepare_X_0(df_full_train)
y_full_train = np.concatenate([y_train, y_val])

w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

X_test = prepare_X_0(df_test)
y_pred = w0 + X_test.dot(w)
rmse(y_test, y_pred)
np.float64(0.5156261299185628)