Load libraries
import pandas as pd
import numpy as np
Data preparation
# Location of the car fuel-efficiency CSV; pandas fetches it over HTTP.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
car = pd.read_csv(DATA_URL)
# Show the first rows as a quick sanity check of the load.
car.head()
| | engine_displacement | num_cylinders | horsepower | vehicle_weight | acceleration | model_year | origin | fuel_type | drivetrain | num_doors | fuel_efficiency_mpg |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 170 | 3.0 | 159.0 | 3413.433759 | 17.7 | 2003 | Europe | Gasoline | All-wheel drive | 0.0 | 13.231729 |
| 1 | 130 | 5.0 | 97.0 | 3149.664934 | 17.8 | 2007 | USA | Gasoline | Front-wheel drive | 0.0 | 13.688217 |
| 2 | 170 | NaN | 78.0 | 3079.038997 | 15.1 | 2018 | Europe | Gasoline | Front-wheel drive | 0.0 | 14.246341 |
| 3 | 220 | 4.0 | NaN | 2542.392402 | 20.2 | 2009 | USA | Diesel | All-wheel drive | 2.0 | 16.912736 |
| 4 | 210 | 1.0 | 140.0 | 3460.870990 | 14.4 | 2009 | Europe | Gasoline | All-wheel drive | 2.0 | 12.488369 |
Fuel types
count 9704
unique 2
top Gasoline
freq 4898
Name: fuel_type, dtype: object
The distribution of the target variable fuel_efficiency_mpg
looks approximately normal, so no log transformation is needed.
Handling missing values
engine_displacement 0
num_cylinders 482
horsepower 708
vehicle_weight 0
acceleration 930
model_year 0
origin 0
fuel_type 0
drivetrain 0
num_doors 502
fuel_efficiency_mpg 0
dtype: int64
Max fuel efficiency
# Highest fuel_efficiency_mpg value within each origin group.
car.groupby('origin')['fuel_efficiency_mpg'].max()
origin
Asia 23.759123
Europe 25.967222
USA 24.971452
Name: fuel_efficiency_mpg, dtype: float64
Sum of weights
# Boolean mask marking the rows whose origin is 'Asia'.
is_asian = car['origin'] == 'Asia'
# Keep only those rows; the original index labels are retained.
asian_car = car[is_asian]
asian_car
| | engine_displacement | num_cylinders | horsepower | vehicle_weight | acceleration | model_year | origin | fuel_type | drivetrain | num_doors | fuel_efficiency_mpg |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 8 | 250 | 1.0 | 174.0 | 2714.219310 | 10.3 | 2016 | Asia | Diesel | Front-wheel drive | -1.0 | 16.823554 |
| 12 | 320 | 5.0 | 145.0 | 2783.868974 | 15.1 | 2010 | Asia | Diesel | All-wheel drive | 1.0 | 16.175820 |
| 14 | 200 | 6.0 | 160.0 | 3582.687368 | 14.9 | 2007 | Asia | Diesel | All-wheel drive | 0.0 | 11.871091 |
| 20 | 150 | 3.0 | 197.0 | 2231.808142 | 18.7 | 2011 | Asia | Gasoline | Front-wheel drive | 1.0 | 18.889083 |
| 21 | 160 | 4.0 | 133.0 | 2659.431451 | NaN | 2016 | Asia | Gasoline | Front-wheel drive | -1.0 | 16.077730 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9688 | 260 | 4.0 | 152.0 | 3948.404625 | 15.5 | 2018 | Asia | Diesel | All-wheel drive | -1.0 | 11.054830 |
| 9692 | 180 | 3.0 | 188.0 | 3680.341381 | 18.0 | 2016 | Asia | Gasoline | Front-wheel drive | 1.0 | 11.711653 |
| 9693 | 280 | 2.0 | 148.0 | 2545.070139 | 15.6 | 2012 | Asia | Diesel | All-wheel drive | 0.0 | 17.202782 |
| 9698 | 180 | 1.0 | 131.0 | 3107.427820 | 13.2 | 2005 | Asia | Gasoline | Front-wheel drive | -2.0 | 13.933716 |
| 9703 | 270 | 3.0 | 140.0 | 2908.043477 | 14.7 | 2005 | Asia | Diesel | All-wheel drive | -1.0 | 14.884467 |
3247 rows × 11 columns
# The two columns used as features for the regression below.
feature_cols = ['vehicle_weight', 'model_year']
# Restrict to those columns, then take the first seven rows.
sel_asian_car = asian_car[feature_cols].head(7)
sel_asian_car
| | vehicle_weight | model_year |
|---|---|---|
| 8 | 2714.219310 | 2016 |
| 12 | 2783.868974 | 2010 |
| 14 | 3582.687368 | 2007 |
| 20 | 2231.808142 | 2011 |
| 21 | 2659.431451 | 2016 |
| 34 | 2844.227534 | 2014 |
| 38 | 3761.994038 | 2019 |
# Convert the selected rows to a plain NumPy array (one row per car,
# one column per selected feature).
X = sel_asian_car.to_numpy()
X
array([[2714.21930965, 2016. ],
[2783.86897424, 2010. ],
[3582.68736772, 2007. ],
[2231.8081416 , 2011. ],
[2659.43145076, 2016. ],
[2844.22753389, 2014. ],
[3761.99403819, 2019. ]])
# Gram matrix X^T X and its inverse, as used by the normal equation.
XTX = X.T @ X
XTX_inv = np.linalg.inv(XTX)
# Target vector: one value per row of X.
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])
# Least-squares weights with no bias term: w = (X^T X)^-1 X^T y.
w = XTX_inv @ X.T @ y
# Sum of the fitted weights.
w.sum()
np.float64(0.5187709081074016)