Trees

Author

Alfa Pradana

Import libraries

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, export_text, plot_tree
from sklearn.feature_extraction import DictVectorizer

from sklearn.ensemble import RandomForestRegressor

def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Both inputs are converted to float NumPy arrays first, so plain lists,
    tuples and integer arrays are accepted as well as float arrays.

    Args:
        y: Array-like of true target values.
        y_pred: Array-like of predicted values, broadcastable against ``y``.

    Returns:
        The square root of the mean squared difference, as a NumPy float.
    """
    # Casting to float lets Python sequences be subtracted element-wise and
    # keeps unsigned-integer inputs from wrapping around on subtraction.
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y - y_pred) ** 2))

Data preparation

# Location of the car fuel-efficiency CSV loaded into `df`.
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"

df = pd.read_csv(DATA_URL)
df.head()
engine_displacement num_cylinders horsepower vehicle_weight acceleration model_year origin fuel_type drivetrain num_doors fuel_efficiency_mpg
0 170 3.0 159.0 3413.433759 17.7 2003 Europe Gasoline All-wheel drive 0.0 13.231729
1 130 5.0 97.0 3149.664934 17.8 2007 USA Gasoline Front-wheel drive 0.0 13.688217
2 170 NaN 78.0 3079.038997 15.1 2018 Europe Gasoline Front-wheel drive 0.0 14.246341
3 220 4.0 NaN 2542.392402 20.2 2009 USA Diesel All-wheel drive 2.0 16.912736
4 210 1.0 140.0 3460.870990 14.4 2009 Europe Gasoline All-wheel drive 2.0 12.488369
# Fill every missing value with 0 in a new frame; `df` itself is left
# untouched, so the per-column counts below still describe the raw data.
df_car = df.fillna(value=0)
df.isna().sum()
engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

Partitioning data

# 60% / 20% / 20% train / validation / test: 20% is held out for testing,
# then 25% of the remaining 80% (20% overall) becomes the validation set.
df_full_train, df_test = train_test_split(df_car, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# `pop` returns the target column and removes it from the feature frame
# in one step, so the target cannot end up among the features.
TARGET = "fuel_efficiency_mpg"
y_train = df_train.pop(TARGET).values
y_val = df_val.pop(TARGET).values
y_test = df_test.pop(TARGET).values

One-hot encoding

dv = DictVectorizer(sparse=False)

# The vectorizer is fitted on the training records only; the validation
# records are transformed with that same fitted mapping.
train_dicts = df_train.to_dict(orient="records")
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val.to_dict(orient="records")
X_val = dv.transform(val_dicts)

Decision tree regressor

# A depth-1 tree makes exactly one split, which shows the single feature
# the regressor picks first.
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

# Hand export_text a plain list of names (as the XGBoost section already
# does with `.tolist()`): a list is accepted by every scikit-learn release,
# whereas an ndarray has tripped a truthiness check in some older ones.
print(export_text(dt, feature_names=dv.get_feature_names_out().tolist()))
|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]

The depth-1 tree splits the data on `vehicle_weight`.

Random forest regressor

# Baseline forest: 10 trees with a fixed seed, scored on the validation set.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)

y_pred = rf.predict(X_val)
baseline_rmse = rmse(y_val, y_pred)
print('RMSE on the validation dataset = %.5f' % baseline_rmse)
RMSE on the validation dataset = 0.45998

Hyperparameter tuning

Number of estimators

# Validation RMSE for forests of 10, 20, ..., 200 trees, same seed each time.
scores = []
for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1).fit(X_train, y_train)
    y_pred = rf.predict(X_val)

    rmse_val = rmse(y_val, y_pred)
    scores.append((n, rmse_val))
    print('n_estimators=%d RMSE=%.3f' % (n, rmse_val))
n_estimators=10 RMSE=0.460
n_estimators=20 RMSE=0.454
n_estimators=30 RMSE=0.451
n_estimators=40 RMSE=0.448
n_estimators=50 RMSE=0.446
n_estimators=60 RMSE=0.445
n_estimators=70 RMSE=0.445
n_estimators=80 RMSE=0.445
n_estimators=90 RMSE=0.445
n_estimators=100 RMSE=0.444
n_estimators=110 RMSE=0.443
n_estimators=120 RMSE=0.444
n_estimators=130 RMSE=0.443
n_estimators=140 RMSE=0.443
n_estimators=150 RMSE=0.443
n_estimators=160 RMSE=0.443
n_estimators=170 RMSE=0.443
n_estimators=180 RMSE=0.442
n_estimators=190 RMSE=0.443
n_estimators=200 RMSE=0.443
# Tabulate the (n_estimators, rmse) pairs and plot RMSE against forest size.
df_scores = pd.DataFrame(data=scores, columns=['n_estimators', 'rmse'])
plt.plot(df_scores['n_estimators'], df_scores['rmse'])

Max depth

# Grid over max_depth x n_estimators. `scores` keeps every run, while
# `mean_rmse` keeps one (max_depth, mean RMSE) pair per depth.
scores = []
mean_rmse = []
for d in [10, 15, 20, 25]:
    depth_rmses = []  # validation RMSEs collected for this depth only
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            n_estimators=n, max_depth=d, random_state=1, n_jobs=-1
        ).fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse_val = rmse(y_val, y_pred)

        scores.append((d, n, rmse_val))
        depth_rmses.append(rmse_val)

    # Average over all n_estimators values tried at this depth.
    mean_rmse.append((d, np.mean(depth_rmses)))

df_scores = pd.DataFrame(data=scores, columns=['max_depth', 'n_estimators', 'rmse'])
df_scores
max_depth n_estimators rmse
0 10 10 0.451895
1 10 20 0.448719
2 10 30 0.446225
3 10 40 0.443877
4 10 50 0.442682
... ... ... ...
75 25 160 0.442689
76 25 170 0.442767
77 25 180 0.442415
78 25 190 0.442618
79 25 200 0.442660

80 rows × 3 columns

# (max_depth, mean validation RMSE across all n_estimators values) pairs.
mean_rmse
[(10, np.float64(0.44232130237115186)),
 (15, np.float64(0.44505999920137435)),
 (20, np.float64(0.4456441321803526)),
 (25, np.float64(0.44566060000292457))]

Feature importance

rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1).fit(X_train, y_train)

# Label each importance with its DictVectorizer column name, then rank
# the features from most to least important.
feature_names = dv.get_feature_names_out()
importances = rf.feature_importances_

feature_importances = pd.Series(data=importances, index=feature_names)
feature_importances = feature_importances.sort_values(ascending=False)
feature_importances
vehicle_weight                  0.959162
horsepower                      0.016040
acceleration                    0.011471
engine_displacement             0.003269
model_year                      0.003182
num_cylinders                   0.002359
num_doors                       0.001591
origin=USA                      0.000555
origin=Europe                   0.000520
origin=Asia                     0.000476
drivetrain=All-wheel drive      0.000382
fuel_type=Diesel                0.000344
fuel_type=Gasoline              0.000337
drivetrain=Front-wheel drive    0.000312
dtype: float64
# Horizontal bars: one per feature, in the ranked order of the Series.
sns.barplot(x=feature_importances.to_numpy(), y=feature_importances.index)

XGBoost Regressor

features = dv.get_feature_names_out().tolist()
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

# Datasets to evaluate after every boosting round, with display names.
watchlist = [(dtrain, 'train'), (dval, 'val')]

xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
}

# The watchlist was previously built but never handed to the trainer.
# Passing it records per-round metrics for both sets in `evals_result`.
# Without early stopping, evaluation does not alter the fitted model, and
# verbose_eval=False keeps the cell from printing one line per round.
evals_result = {}
model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
    evals_result=evals_result,
    verbose_eval=False,
)

y_pred = model.predict(dval)
rmse(y_val, y_pred)
np.float64(0.45017755678087246)
# Same configuration as the eta=0.3 run, with only eta lowered to 0.1.
xgb_params = dict(
    eta=0.1,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

model = xgb.train(xgb_params, dtrain, num_boost_round=100)

y_pred = model.predict(dval)
rmse(y_val, y_pred)
np.float64(0.42622800553359225)

The best validation RMSE, 0.426, is achieved with eta=0.1 (compared with 0.450 for eta=0.3).