Linear Regression

Author

Alfa Pradana

Load library

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

Define function

def train_linear_regression(X, y):
    """Fit ordinary least squares via the normal equation.

    Parameters
    ----------
    X : array of shape (n_samples, n_features) — feature matrix without a bias column.
    y : array of shape (n_samples,) — target values.

    Returns
    -------
    (w0, w) : the bias term and the array of feature weights.
    """
    # Prepend a column of ones so the bias is learned as the first weight.
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    # Solve XTX @ w = X^T y directly instead of forming an explicit inverse:
    # np.linalg.solve is numerically more stable and cheaper than np.linalg.inv.
    w_full = np.linalg.solve(XTX, X.T.dot(y))

    return w_full[0], w_full[1:]

def train_linear_regression_reg(X, y, r=0.001):
    """Fit L2-regularized (ridge) least squares via the normal equation.

    Parameters
    ----------
    X : array of shape (n_samples, n_features) — feature matrix without a bias column.
    y : array of shape (n_samples,) — target values.
    r : float — regularization strength added to the diagonal of X^T X.
        Note that the bias term is regularized as well.

    Returns
    -------
    (w0, w) : the bias term and the array of feature weights.
    """
    # Prepend a column of ones so the bias is learned as the first weight.
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    # Ridge term: also guards against a singular XTX (e.g. collinear features).
    XTX = XTX + r * np.eye(XTX.shape[0])

    # Solve the regularized normal equation directly; more stable than inv().
    w_full = np.linalg.solve(XTX, X.T.dot(y))

    return w_full[0], w_full[1:]

def rmse(y, y_pred):
    """Root mean squared error between targets and predictions."""
    return np.sqrt(np.mean((y - y_pred) ** 2))

Data preparation

# Load the car fuel-efficiency dataset straight from the public GitHub raw URL.
car = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')
car.head()
engine_displacement num_cylinders horsepower vehicle_weight acceleration model_year origin fuel_type drivetrain num_doors fuel_efficiency_mpg
0 170 3.0 159.0 3413.433759 17.7 2003 Europe Gasoline All-wheel drive 0.0 13.231729
1 130 5.0 97.0 3149.664934 17.8 2007 USA Gasoline Front-wheel drive 0.0 13.688217
2 170 NaN 78.0 3079.038997 15.1 2018 Europe Gasoline Front-wheel drive 0.0 14.246341
3 220 4.0 NaN 2542.392402 20.2 2009 USA Diesel All-wheel drive 2.0 16.912736
4 210 1.0 140.0 3460.870990 14.4 2009 Europe Gasoline All-wheel drive 2.0 12.488369
# Keep only the four numeric features plus the target column.
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
df = car[base].copy()
df.head()
engine_displacement horsepower vehicle_weight model_year fuel_efficiency_mpg
0 170 159.0 3413.433759 2003 13.231729
1 130 97.0 3149.664934 2007 13.688217
2 170 78.0 3079.038997 2018 14.246341
3 220 NaN 2542.392402 2009 16.912736
4 210 140.0 3460.870990 2009 12.488369

Distribution of fuel mpg

# Histogram of the target — used to decide whether a log transform is needed.
sns.histplot(df.fuel_efficiency_mpg, bins=50)

The distribution of the target variable fuel_efficiency_mpg looks approximately normal, so no log transformation is needed.

Handling missing values

# Count missing values per column.
df.isnull().sum()
engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

Column horsepower has missing values.

Median of horsepower

# Median of horsepower over the full dataset (a candidate imputation value).
median_horsepower = df.horsepower.median()
median_horsepower
np.float64(149.0)

Setup the framework

# 60/20/20 split sizes: validation and test each take 20%;
# train gets the remainder so rounding leftovers end up in train.
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test
n, n_val + n_test + n_train  # sanity check: partition sizes add back up to n
(9704, 9704)

Data partitioning

Split the data in train/val/test sets, with 60%/20%/20% distribution.

n_val, n_test, n_train
(1940, 1940, 5824)

Data shuffling

Shuffle the dataset using seed 42.

# let's shuffle the records with index
idx = np.arange(n)
np.random.seed(42)  # fixed seed so the split is reproducible
np.random.shuffle(idx)

# slice the shuffled index into train / validation / test partitions
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

df_train.head()
engine_displacement horsepower vehicle_weight model_year fuel_efficiency_mpg
483 220 144.0 2535.887591 2009 16.642943
7506 160 141.0 2741.170484 2019 16.298377
8795 230 155.0 2471.880237 2017 18.591822
1688 150 206.0 3748.164469 2015 11.818843
6217 300 111.0 2135.716359 2006 19.402209
len(df_train), len(df_val), len(df_test)
(5824, 1940, 1940)

Restating the index

# Drop the shuffled row labels so each split is indexed 0..len-1.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Extract the target variable as plain numpy arrays.
y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

# Remove the target from the feature frames so it cannot leak into X.
del df_train["fuel_efficiency_mpg"]
del df_val["fuel_efficiency_mpg"]
del df_test["fuel_efficiency_mpg"]

y_train
array([16.64294342, 16.29837715, 18.59182197, ..., 17.59323665,
       18.92574819, 17.96528447], shape=(5824,))

Baseline model to predict fuel_efficiency_mpg

cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
def prepare_X_0(df):
    """Build the feature matrix, replacing missing values with 0."""
    # fillna returns a new frame, so the caller's df is never mutated.
    return df[cols].fillna(0).values

def prepare_X_mean(df, horsepower_fill=None):
    """Build the feature matrix, imputing missing horsepower values.

    Parameters
    ----------
    df : DataFrame containing at least the columns listed in `cols`.
    horsepower_fill : optional float — value used for missing horsepower.
        Defaults to the mean computed on `df` itself (the original behavior).
        NOTE(review): when preparing validation/test data, pass the TRAIN-set
        mean here instead — computing the mean on val/test leaks information.

    Returns
    -------
    numpy array of shape (len(df), len(cols)).
    """
    df = df.copy()  # avoid mutating the caller's frame

    if horsepower_fill is None:
        horsepower_fill = df['horsepower'].mean()
    df['horsepower'] = df['horsepower'].fillna(horsepower_fill)

    df_num = df[cols]
    X = df_num.values
    return X

With 0 for missing values

# Train the 0-fill model and measure RMSE on the training set itself.
X_train = prepare_X_0(df_train)
w0, w = train_linear_regression(X_train, y_train)
y_pred = w0 + X_train.dot(w)
score_0 = rmse(y_train, y_pred)
score_0
np.float64(0.5202614265099076)

With mean for missing values

# Train the mean-fill model and measure RMSE on the training set itself.
X_train = prepare_X_mean(df_train)
w0, w = train_linear_regression(X_train, y_train)
y_pred = w0 + X_train.dot(w)
score_mean = rmse(y_train, y_pred)
score_mean
np.float64(0.46244121379599645)

Which option gives the better RMSE?

Improve model with validation data set

# 0-fill model: train on the training set, evaluate on the validation set.
X_train = prepare_X_0(df_train)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X_0(df_val)
y_pred = w0 + X_val.dot(w)
score_0 = rmse(y_val, y_pred).round(2)
score_0
np.float64(0.52)
# Mean-fill model: train on the training set, evaluate on the validation set.
# NOTE(review): prepare_X_mean computes the fill mean on each frame it is
# given, so df_val is filled with the validation mean — a mild data leak.
X_train = prepare_X_mean(df_train)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X_mean(df_val)
y_pred = w0 + X_val.dot(w)
score_mean = rmse(y_val, y_pred).round(2)
score_mean
np.float64(0.46)

The option with mean for missing values gives better RMSE.

Best regularization parameter

# Grid-search the ridge parameter r, scoring each candidate on the
# validation set (0-fill imputation, RMSE rounded to 2 decimals).
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    X_train = prepare_X_0(df_train)
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    X_val = prepare_X_0(df_val)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred).round(2)

    print(r, w0, score)
0 28.827365474598718 0.52
0.01 24.926838421084422 0.52
0.1 11.239661085047366 0.52
1 1.7315979412398264 0.52
5 0.36380750207153073 0.52
10 0.18306246622476907 0.52
100 0.01841801730804525 0.52

The best regularization parameter is 0.01.

RMSE Standard Deviation

# Measure how sensitive the validation RMSE is to the particular split:
# repeat the full split + train + evaluate pipeline for 10 different seeds.
all_scores = []
for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    np.random.seed(seed)

    idx = np.arange(n)
    np.random.shuffle(idx)

    # 60/20/20 split using this seed's shuffle
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    # separate targets from features
    y_train = df_train.fuel_efficiency_mpg.values
    y_val = df_val.fuel_efficiency_mpg.values
    y_test = df_test.fuel_efficiency_mpg.values

    del df_train["fuel_efficiency_mpg"]
    del df_val["fuel_efficiency_mpg"]
    del df_test["fuel_efficiency_mpg"]

    # train the (unregularized) 0-fill model, score on this seed's validation set
    X_train = prepare_X_0(df_train)
    w0, w = train_linear_regression(X_train, y_train)

    X_val = prepare_X_0(df_val)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    all_scores.append(score)

    print(seed, w0, score)
0 27.376388746129475 0.5206531296297207
1 29.022468555840756 0.5213388912866506
2 25.904357611918527 0.5228069974913666
3 28.015450068187377 0.5159516741255491
4 25.93400040714799 0.5109129460116937
5 25.89950696140074 0.5283406460212935
6 25.91723093394924 0.5313910658190373
7 28.414324731359304 0.509067038739038
8 26.47841782776404 0.5147399129482789
9 27.513852582351465 0.513186590829269
np.std(all_scores).round(3)
np.float64(0.007)

Evaluation on test

# Final evaluation: re-split with seed 9, train on train+val combined
# with ridge r=0.001, and report RMSE on the held-out test set.
np.random.seed(9)

idx = np.arange(n)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# separate targets from features
y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

del df_train["fuel_efficiency_mpg"]
del df_val["fuel_efficiency_mpg"]
del df_test["fuel_efficiency_mpg"]

# merge train and validation into one training set for the final model
df_full_train = pd.concat([df_train, df_val])
df_full_train = df_full_train.reset_index(drop=True)

X_full_train = prepare_X_0(df_full_train)

y_full_train = np.concatenate([y_train, y_val])
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

# score the final model on the test set
X_test = prepare_X_0(df_test)
y_pred = w0 + X_test.dot(w)
rmse(y_test, y_pred)
np.float64(0.5156261299185628)