import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
Linear Regression
Load library
Define function
def train_linear_regression(X, y):
    """Fit ordinary least squares via the normal equation.

    A bias column of ones is prepended to X, then the weights are
    computed as w = (X^T X)^{-1} X^T y.

    Args:
        X: 2-D feature matrix, shape (n_samples, n_features).
        y: target vector, shape (n_samples,).

    Returns:
        Tuple (w0, w): intercept and weight vector for the original features.
    """
    bias = np.ones(X.shape[0])
    design = np.column_stack([bias, X])
    gram = design.T.dot(design)
    gram_inv = np.linalg.inv(gram)
    weights = gram_inv.dot(design.T).dot(y)
    return weights[0], weights[1:]
def train_linear_regression_reg(X, y, r=0.001):
    """Fit ridge (L2-regularized) linear regression via the normal equation.

    A bias column of ones is prepended to X, then the weights are
    computed as w = (X^T X + r*I)^{-1} X^T y.
    NOTE: r*I is added to the full Gram matrix, so the bias term is
    regularized as well (matches the original implementation).

    Args:
        X: 2-D feature matrix, shape (n_samples, n_features).
        y: target vector, shape (n_samples,).
        r: regularization strength (default 0.001).

    Returns:
        Tuple (w0, w): intercept and weight vector for the original features.
    """
    bias = np.ones(X.shape[0])
    design = np.column_stack([bias, X])
    gram = design.T.dot(design) + r * np.eye(design.shape[1])
    weights = np.linalg.inv(gram).dot(design.T).dot(y)
    return weights[0], weights[1:]
def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions."""
    residuals = y - y_pred
    return np.sqrt(np.mean(residuals ** 2))
# Data preparation
car = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')
car.head()
| | engine_displacement | num_cylinders | horsepower | vehicle_weight | acceleration | model_year | origin | fuel_type | drivetrain | num_doors | fuel_efficiency_mpg |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 170 | 3.0 | 159.0 | 3413.433759 | 17.7 | 2003 | Europe | Gasoline | All-wheel drive | 0.0 | 13.231729 |
| 1 | 130 | 5.0 | 97.0 | 3149.664934 | 17.8 | 2007 | USA | Gasoline | Front-wheel drive | 0.0 | 13.688217 |
| 2 | 170 | NaN | 78.0 | 3079.038997 | 15.1 | 2018 | Europe | Gasoline | Front-wheel drive | 0.0 | 14.246341 |
| 3 | 220 | 4.0 | NaN | 2542.392402 | 20.2 | 2009 | USA | Diesel | All-wheel drive | 2.0 | 16.912736 |
| 4 | 210 | 1.0 | 140.0 | 3460.870990 | 14.4 | 2009 | Europe | Gasoline | All-wheel drive | 2.0 | 12.488369 |
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
df = car[base].copy()
df.head()| engine_displacement | horsepower | vehicle_weight | model_year | fuel_efficiency_mpg | |
|---|---|---|---|---|---|
| 0 | 170 | 159.0 | 3413.433759 | 2003 | 13.231729 |
| 1 | 130 | 97.0 | 3149.664934 | 2007 | 13.688217 |
| 2 | 170 | 78.0 | 3079.038997 | 2018 | 14.246341 |
| 3 | 220 | NaN | 2542.392402 | 2009 | 16.912736 |
| 4 | 210 | 140.0 | 3460.870990 | 2009 | 12.488369 |
Distribution of fuel mpg
sns.histplot(df.fuel_efficiency_mpg, bins=50)
The distribution of the target variable fuel_efficiency_mpg looks approximately normal, so no log transformation is needed.
Handling missing values
df.isnull().sum()
engine_displacement 0
horsepower 708
vehicle_weight 0
model_year 0
fuel_efficiency_mpg 0
dtype: int64
Column horsepower has missing values.
Median of horsepower
median_horsepower = df.horsepower.median()
median_horsepower
np.float64(149.0)
Setup the framework
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test
n, n_val + n_test + n_train(9704, 9704)
Data partitioning
Split the data in train/val/test sets, with 60%/20%/20% distribution.
n_val, n_test, n_train(1940, 1940, 5824)
Data shuffling
Shuffle the dataset, I use seed 42.
# let's shuffle the records with index
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]
df_train.head()| engine_displacement | horsepower | vehicle_weight | model_year | fuel_efficiency_mpg | |
|---|---|---|---|---|---|
| 483 | 220 | 144.0 | 2535.887591 | 2009 | 16.642943 |
| 7506 | 160 | 141.0 | 2741.170484 | 2019 | 16.298377 |
| 8795 | 230 | 155.0 | 2471.880237 | 2017 | 18.591822 |
| 1688 | 150 | 206.0 | 3748.164469 | 2015 | 11.818843 |
| 6217 | 300 | 111.0 | 2135.716359 | 2006 | 19.402209 |
len(df_train), len(df_val), len(df_test)(5824, 1940, 1940)
Restating the index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values
del df_train["fuel_efficiency_mpg"]
del df_val["fuel_efficiency_mpg"]
del df_test["fuel_efficiency_mpg"]
y_trainarray([16.64294342, 16.29837715, 18.59182197, ..., 17.59323665,
18.92574819, 17.96528447], shape=(5824,))
Baseline model to predict fuel_efficiency_mpg
cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
def prepare_X_0(df, columns=None):
    """Build a numeric feature matrix, imputing missing values with 0.

    Args:
        df: input DataFrame; not modified.
        columns: feature columns to select. Defaults to the module-level
            `cols` list, preserving the original behavior.

    Returns:
        NumPy array of shape (len(df), len(columns)).
    """
    features = cols if columns is None else columns
    # Selection returns a new frame and fillna returns a copy,
    # so the caller's df is never mutated.
    df_num = df[features].fillna(0)
    return df_num.values
def prepare_X_mean(df, mean=None, columns=None):
    """Build a numeric feature matrix, imputing missing horsepower with a mean.

    Args:
        df: input DataFrame; not modified.
        mean: value used to fill missing horsepower. Defaults to the mean
            of THIS frame's horsepower column (original behavior).
            NOTE(review): when preparing validation/test data, pass the
            TRAINING mean here — computing it from the frame being
            transformed (as at the call on df_val) leaks information.
        columns: feature columns to select. Defaults to the module-level
            `cols` list, preserving the original behavior.

    Returns:
        NumPy array of shape (len(df), len(columns)).
    """
    df = df.copy()
    fill_value = df['horsepower'].mean() if mean is None else mean
    df['horsepower'] = df['horsepower'].fillna(fill_value)
    features = cols if columns is None else columns
    return df[features].values
# With 0 for missing values
X_train = prepare_X_0(df_train)
w0, w = train_linear_regression(X_train, y_train)
y_pred = w0 + X_train.dot(w)
score_0 = rmse(y_train, y_pred)
score_0np.float64(0.5202614265099076)
With mean for missing values
X_train = prepare_X_mean(df_train)
w0, w = train_linear_regression(X_train, y_train)
y_pred = w0 + X_train.dot(w)
score_mean = rmse(y_train, y_pred)
score_meannp.float64(0.46244121379599645)
Which options gives better RMSE?
Improve model with validation data set
X_train = prepare_X_0(df_train)
w0, w = train_linear_regression(X_train, y_train)
X_val = prepare_X_0(df_val)
y_pred = w0 + X_val.dot(w)
score_0 = rmse(y_val, y_pred).round(2)
score_0np.float64(0.52)
X_train = prepare_X_mean(df_train)
w0, w = train_linear_regression(X_train, y_train)
X_val = prepare_X_mean(df_val)
y_pred = w0 + X_val.dot(w)
score_mean = rmse(y_val, y_pred).round(2)
score_meannp.float64(0.46)
The option with mean for missing values gives better RMSE.
Best regularization parameter
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
X_train = prepare_X_0(df_train)
w0, w = train_linear_regression_reg(X_train, y_train, r=r)
X_val = prepare_X_0(df_val)
y_pred = w0 + X_val.dot(w)
score = rmse(y_val, y_pred).round(2)
print(r, w0, score)0 28.827365474598718 0.52
0.01 24.926838421084422 0.52
0.1 11.239661085047366 0.52
1 1.7315979412398264 0.52
5 0.36380750207153073 0.52
10 0.18306246622476907 0.52
100 0.01841801730804525 0.52
All tested values of r give the same rounded RMSE (0.52); when the scores tie, the smallest regularization parameter is preferred, so the best r is 0.
RMSE Standard Deviation
all_scores = []
for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
np.random.seed(seed)
idx = np.arange(n)
np.random.shuffle(idx)
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values
del df_train["fuel_efficiency_mpg"]
del df_val["fuel_efficiency_mpg"]
del df_test["fuel_efficiency_mpg"]
X_train = prepare_X_0(df_train)
w0, w = train_linear_regression(X_train, y_train)
X_val = prepare_X_0(df_val)
y_pred = w0 + X_val.dot(w)
score = rmse(y_val, y_pred)
all_scores.append(score)
print(seed, w0, score)0 27.376388746129475 0.5206531296297207
1 29.022468555840756 0.5213388912866506
2 25.904357611918527 0.5228069974913666
3 28.015450068187377 0.5159516741255491
4 25.93400040714799 0.5109129460116937
5 25.89950696140074 0.5283406460212935
6 25.91723093394924 0.5313910658190373
7 28.414324731359304 0.509067038739038
8 26.47841782776404 0.5147399129482789
9 27.513852582351465 0.513186590829269
np.std(all_scores).round(3)np.float64(0.007)
Evaluation on test
np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values
del df_train["fuel_efficiency_mpg"]
del df_val["fuel_efficiency_mpg"]
del df_test["fuel_efficiency_mpg"]
df_full_train = pd.concat([df_train, df_val])
df_full_train = df_full_train.reset_index(drop=True)
X_full_train = prepare_X_0(df_full_train)
y_full_train = np.concatenate([y_train, y_val])
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)
X_test = prepare_X_0(df_test)
y_pred = w0 + X_test.dot(w)
rmse(y_test, y_pred)np.float64(0.5156261299185628)