Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
Data preparation
We will use the lead scoring dataset: https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv")
df.head()

|   | lead_source | industry | number_of_courses_viewed | annual_income | employment_status | location | interaction_count | lead_score | converted |
|---|---|---|---|---|---|---|---|---|---|
| 0 | paid_ads | NaN | 1 | 79450.0 | unemployed | south_america | 4 | 0.94 | 1 |
| 1 | social_media | retail | 1 | 46992.0 | employed | south_america | 1 | 0.80 | 0 |
| 2 | events | healthcare | 5 | 78796.0 | unemployed | australia | 3 | 0.69 | 1 |
| 3 | paid_ads | retail | 2 | 83843.0 | NaN | australia | 1 | 0.87 | 0 |
| 4 | referral | education | 3 | 85012.0 | self_employed | europe | 3 | 0.62 | 1 |
Checking for missing values
df.dtypes

lead_source object
industry object
number_of_courses_viewed int64
annual_income float64
employment_status object
location object
interaction_count int64
lead_score float64
converted int64
dtype: object
df.isnull().sum()

lead_source 128
industry 134
number_of_courses_viewed 0
annual_income 181
employment_status 100
location 63
interaction_count 0
lead_score 0
converted 0
dtype: int64
There are 5 columns with missing values:

- lead_source (object)
- industry (object)
- annual_income (numeric)
- employment_status (object)
- location (object)
For categorical features, I replace missing values with "NA", and for numerical features I replace them with 0.0.
categorical = df.dtypes[df.dtypes == "object"].index.tolist()
categorical

['lead_source', 'industry', 'employment_status', 'location']

numerical = df.dtypes[df.dtypes != "object"].index.tolist()
numerical

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']
for col in categorical:
    df[col] = df[col].fillna("NA")

for col in numerical:
    df[col] = df[col].fillna(0.0)
df.isnull().sum()

lead_source 0
industry 0
number_of_courses_viewed 0
annual_income 0
employment_status 0
location 0
interaction_count 0
lead_score 0
converted 0
dtype: int64
The most frequent category in industry
df["industry"].value_counts()industry
retail 203
finance 200
other 198
healthcare 187
education 187
technology 179
manufacturing 174
NA 134
Name: count, dtype: int64
retail is the most frequent category in industry with 203 occurrences.
Correlation matrix for the numerical features
corr_mtx = df[numerical].corr()
corr_mtx.style.background_gradient(cmap='coolwarm')

|   | number_of_courses_viewed | annual_income | interaction_count | lead_score | converted |
|---|---|---|---|---|---|
| number_of_courses_viewed | 1.000000 | 0.009770 | -0.023565 | -0.004879 | 0.435914 |
| annual_income | 0.009770 | 1.000000 | 0.027036 | 0.015610 | 0.053131 |
| interaction_count | -0.023565 | 0.027036 | 1.000000 | 0.009888 | 0.374573 |
| lead_score | -0.004879 | 0.015610 | 0.009888 | 1.000000 | 0.193673 |
| converted | 0.435914 | 0.053131 | 0.374573 | 0.193673 | 1.000000 |
df[['number_of_courses_viewed', 'annual_income', 'interaction_count']].corrwith(df.lead_score).abs()

number_of_courses_viewed 0.004879
annual_income 0.015610
interaction_count 0.009888
dtype: float64
df[['number_of_courses_viewed', 'annual_income', 'lead_score']].corrwith(df.interaction_count).abs()

number_of_courses_viewed 0.023565
annual_income 0.027036
lead_score 0.009888
dtype: float64
Based on the correlation matrix above, the biggest correlation between two features (excluding the target converted) is between annual_income and interaction_count, with a correlation of 0.027036.
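To double-check this programmatically, here is a quick sanity-check sketch on the same dataframe that finds the largest absolute pairwise correlation among the four numerical features, excluding the target converted and the trivial self-correlations:

```python
# largest absolute correlation between two distinct numerical features
feature_corr = df[['number_of_courses_viewed', 'annual_income',
                   'interaction_count', 'lead_score']].corr().abs()
pairs = feature_corr.unstack()
# drop the self-correlations (always 1.0) and show the top pair (listed twice,
# once for each direction, since the matrix is symmetric)
print(pairs[pairs < 1.0].sort_values(ascending=False).head(2))
```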
Splitting the data
Perform the train/validation/test split with Scikit-Learn’s train_test_split() function.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
len(df_full_train), len(df_test)

(1169, 293)
To get 20% of the data for validation and 60% for training, we need to set the test_size parameter to 0.25 when splitting df_full_train (because 0.25 * 0.8 = 0.2).
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

(876, 293, 293)
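As a quick sanity check, the three parts should be roughly a 60%/20%/20% split of the full dataset:

```python
# verify the 60/20/20 proportions of the three splits
n = len(df)
print(len(df_train) / n, len(df_val) / n, len(df_test) / n)
# ~0.599, ~0.200, ~0.200
```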
Make sure that the target variable converted is not left in the feature dataframes.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values
del df_train["converted"]
del df_val["converted"]
del df_test["converted"]

Feature engineering
The biggest mutual information score
Mutual information is a concept from information theory that measures the amount of information obtained about one random variable by observing another random variable. In the context of feature selection for machine learning, mutual information can be used to quantify the dependency between a feature and the target variable.
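As a small illustration (with made-up toy data, not from the dataset): a categorical feature that fully determines the target gets a high mutual information score, while a mostly unrelated feature scores close to zero.

```python
from sklearn.metrics import mutual_info_score

# toy example: an informative vs. an uninformative categorical feature
informative = ['a', 'a', 'b', 'b', 'a', 'b']    # 'a' always converts, 'b' never does
uninformative = ['x', 'y', 'x', 'y', 'x', 'y']  # largely unrelated to the target
target = [1, 1, 0, 0, 1, 0]

print(mutual_info_score(informative, target))    # ~0.69 (strong dependency)
print(mutual_info_score(uninformative, target))  # ~0.06 (weak dependency)
```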
def mutual_info_category_score(series):
    return mutual_info_score(series, y_train)
mi = df_train[categorical].apply(mutual_info_category_score)
mi.sort_values(ascending=False).round(2)

lead_source 0.04
employment_status 0.01
industry 0.01
location 0.00
dtype: float64
The feature with the biggest mutual information score is lead_source, at 0.04.
len(df_train), len(df_val), len(df_test)

(876, 293, 293)
One-hot encoding
df_train[categorical].nunique()

lead_source 6
industry 8
employment_status 5
location 8
dtype: int64
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
train_dicts[0]

{'lead_source': 'paid_ads',
'industry': 'retail',
'employment_status': 'student',
'location': 'middle_east',
'number_of_courses_viewed': 0,
'annual_income': 58472.0,
'interaction_count': 5,
'lead_score': 0.03}
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
'employment_status=employed', 'employment_status=self_employed',
'employment_status=student', 'employment_status=unemployed',
'industry=NA', 'industry=education', 'industry=finance',
'industry=healthcare', 'industry=manufacturing', 'industry=other',
'industry=retail', 'industry=technology', 'interaction_count',
'lead_score', 'lead_source=NA', 'lead_source=events',
'lead_source=organic_search', 'lead_source=paid_ads',
'lead_source=referral', 'lead_source=social_media', 'location=NA',
'location=africa', 'location=asia', 'location=australia',
'location=europe', 'location=middle_east',
'location=north_america', 'location=south_america',
'number_of_courses_viewed'], dtype=object)
X_train = dv.transform(train_dicts)
X_train.shape

(876, 31)
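The 31 columns match what we expect from the cardinalities above: 6 + 8 + 5 + 8 = 27 one-hot columns for the categorical features plus the 4 numerical features. A quick check:

```python
# one-hot columns from the categorical features plus the numerical features
print(df_train[categorical].nunique().sum() + len(numerical))  # 27 + 4 = 31
```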
val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

Logistic Regression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')
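As an optional look under the hood (not required for the accuracy questions below), we can pair each learned coefficient with its encoded feature name and see which features the model weights most heavily. This is just an exploratory sketch:

```python
# coefficients of the fitted model, indexed by the DictVectorizer feature names
weights = pd.Series(model.coef_[0], index=dv.get_feature_names_out())
print(weights.sort_values(key=abs, ascending=False).head())
```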
Accuracy on the validation dataset
y_pred = model.predict_proba(X_val)[:, 1]
conv_decision = (y_pred >= 0.5)
original_accuracy = (y_val == conv_decision).mean()
print(f"Original accuracy with all features: {original_accuracy:.2f}\n")Original accuracy with all features: 0.70
Original accuracy with all features: 0.70
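The same number can be obtained with scikit-learn's built-in accuracy_score, which is a handy cross-check of the manual computation above:

```python
from sklearn.metrics import accuracy_score

# should match the manually computed accuracy (~0.70)
print(round(accuracy_score(y_val, conv_decision), 2))
```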
Feature elimination based on the accuracy
features = categorical + numerical
for feature in features:
    selected_features = [f for f in features if f != feature]
    train_dicts = df_train[selected_features].to_dict(orient='records')
    X_train = dv.transform(train_dicts)
    val_dicts = df_val[selected_features].to_dict(orient='records')
    X_val = dv.transform(val_dicts)
    # train the model
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    # evaluate the model
    y_pred = model.predict_proba(X_val)[:, 1]
    conv_decision = (y_pred >= 0.5)
    accuracy = (y_val == conv_decision).mean()
    diff = np.abs(accuracy - original_accuracy)
    print(f"{accuracy:.5f} (difference: {diff:.5f}) accuracy with no '{feature}'")

0.70307 (difference: 0.00341) accuracy with no 'lead_source'
0.69966 (difference: 0.00000) accuracy with no 'industry'
0.69625 (difference: 0.00341) accuracy with no 'employment_status'
0.70990 (difference: 0.01024) accuracy with no 'location'
0.55631 (difference: 0.14334) accuracy with no 'number_of_courses_viewed'
0.85324 (difference: 0.15358) accuracy with no 'annual_income'
0.55631 (difference: 0.14334) accuracy with no 'interaction_count'
0.70648 (difference: 0.00683) accuracy with no 'lead_score'
The smallest difference is for industry, which means that dropping the industry feature leaves the accuracy essentially unchanged compared to the original accuracy with all features.
Best regularization parameter
params = [0.01, 0.1, 1, 10, 100]
for C in params:
    train_dicts = df_train[categorical + numerical].to_dict(orient='records')
    X_train = dv.transform(train_dicts)
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    conv_decision = (y_pred >= 0.5)
    accuracy = (y_val == conv_decision).mean()
    print(f"{accuracy:.3f} for parameter C={C}")

0.700 for parameter C=0.01
0.706 for parameter C=0.1
0.706 for parameter C=1
0.706 for parameter C=10
0.706 for parameter C=100
Accuracy is 0.706 for C=0.1, 1, 10, and 100; taking the smallest value among the ties, the best regularization parameter is C=0.1.
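Since several values of C tie at the highest accuracy, the tie can be broken programmatically by taking the smallest C among them. A small sketch using the accuracies printed above (stored in a hypothetical dict rather than collected inside the loop):

```python
# accuracies from the loop above, keyed by C
scores = {0.01: 0.700, 0.1: 0.706, 1: 0.706, 10: 0.706, 100: 0.706}

best_accuracy = max(scores.values())
best_C = min(C for C, acc in scores.items() if acc == best_accuracy)
print(best_C)  # 0.1
```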