Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
Data preparation
We will use the lead scoring dataset: https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv")
df.head()

| | lead_source | industry | number_of_courses_viewed | annual_income | employment_status | location | interaction_count | lead_score | converted |
|---|---|---|---|---|---|---|---|---|---|
| 0 | paid_ads | NaN | 1 | 79450.0 | unemployed | south_america | 4 | 0.94 | 1 |
| 1 | social_media | retail | 1 | 46992.0 | employed | south_america | 1 | 0.80 | 0 |
| 2 | events | healthcare | 5 | 78796.0 | unemployed | australia | 3 | 0.69 | 1 |
| 3 | paid_ads | retail | 2 | 83843.0 | NaN | australia | 1 | 0.87 | 0 |
| 4 | referral | education | 3 | 85012.0 | self_employed | europe | 3 | 0.62 | 1 |
# Normalize column names and categorical string values: lowercase, spaces to underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
numeric_columns = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]

for col in categorical_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64
There are 5 columns with missing values:
- lead_source (object)
- industry (object)
- annual_income (numeric)
- employment_status (object)
- location (object)

For categorical features, I replace missing values with the string "NA"; for numerical features, I replace them with 0.0.
for col in categorical_columns:
    df[col] = df[col].fillna("NA")

for col in numeric_columns:
    df[col] = df[col].fillna(0.0)
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64
y = df['converted']
X = df.drop('converted', axis=1)

# 60/20/20 train/validation/test split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)
print(f"Train: {len(X_train)} ({len(X_train)/len(df)*100:.0f}%)")
print(f"Val: {len(X_val)} ({len(X_val)/len(df)*100:.0f}%)")
print(f"Test: {len(X_test)} ({len(X_test)/len(df)*100:.0f}%)")Train: 877 (60%)
Val: 292 (20%)
Test: 293 (20%)
ROC AUC feature importance
variables_to_test = ['lead_score', 'number_of_courses_viewed', 'interaction_count', 'annual_income']
auc_scores = {}
for var in variables_to_test:
    predictions = X_train[var]
    auc = roc_auc_score(y_train, predictions)
    # AUC below 0.5 means the feature ranks negatives above positives;
    # negating the feature flips the ranking and yields 1 - AUC
    if auc < 0.5:
        predictions = -X_train[var]
        auc = roc_auc_score(y_train, predictions)
    auc_scores[var] = auc
    print(f"{var}: {auc:.4f}")

best_var = max(auc_scores, key=auc_scores.get)
print(f"Highest AUC: {best_var} ({auc_scores[best_var]:.4f})")

lead_score: 0.6111
number_of_courses_viewed: 0.7652
interaction_count: 0.7272
annual_income: 0.5446
Highest AUC: number_of_courses_viewed (0.7652)
The most frequent category in industry is retail, with 203 occurrences.
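A quick way to check this (a sketch; value_counts lists categories by frequency, including the "NA" placeholder we filled in):

df['industry'].value_counts()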
Training the model
# One-hot encode categorical features with DictVectorizer (numeric features pass through)
train_dicts = X_train.to_dict(orient='records')
val_dicts = X_val.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train_encoded = dv.fit_transform(train_dicts)
X_val_encoded = dv.transform(val_dicts)
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train_encoded, y_train)

y_val_pred = model.predict_proba(X_val_encoded)[:, 1]
# Note: this scores the hard 0/1 predictions at threshold 0.5;
# roc_auc_score(y_val, y_val_pred) on the raw probabilities gives the usual probability-based AUC
auc = roc_auc_score(y_val, y_val_pred >= 0.5)
print(f"AUC on validation dataset: {round(auc, 3)}")

AUC on validation dataset: 0.691
Precision and Recall
scores = []
thresholds = np.arange(0.0, 1.01, 0.01)

# The class masks don't depend on the threshold
actual_positive = (y_val == 1)
actual_negative = (y_val == 0)

for t in thresholds:
    predict_positive = (y_val_pred >= t)
    predict_negative = (y_val_pred < t)

    TP = (predict_positive & actual_positive).sum()
    TN = (predict_negative & actual_negative).sum()
    FP = (predict_positive & actual_negative).sum()
    FN = (predict_negative & actual_positive).sum()

    # Guard against division by zero at extreme thresholds
    precision = TP / (TP + FP) if TP + FP > 0 else 0.0
    recall = TP / (TP + FN) if TP + FN > 0 else 0.0

    scores.append([t, precision, recall])
df_scores = pd.DataFrame(data=scores, columns=["threshold", "precision", "recall"])
plt.plot(df_scores.threshold, df_scores.precision, label='Precision')
plt.plot(df_scores.threshold, df_scores.recall, label='Recall')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.legend()
plt.grid()

precisions = df_scores.precision
recalls = df_scores.recall
# The curves cross where |precision - recall| is smallest
differences = np.abs(precisions - recalls)
intersection_idx = np.argmin(differences)
intersection_threshold = thresholds[intersection_idx]
print(f"Precision and recall intersect at threshold: {intersection_threshold:.3f}")
print(f"Precision: {precisions[intersection_idx]:.3f}")
print(f"Recall: {recalls[intersection_idx]:.3f}")Precision and recall intersect at threshold: 0.590
Precision: 0.807
Recall: 0.807
F1 score
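The F1 score is the harmonic mean of precision and recall, F1 = 2 * precision * recall / (precision + recall), so it is high only when both are high.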
df_scores["f1"] = 2*(df_scores.precision * df_scores.recall) / (df_scores.precision + df_scores.recall)
max_score_row = df_scores[df_scores.f1 == df_scores.f1.max()]
print(max_score_row)
plt.plot(df_scores.threshold, df_scores.f1, label='F1')
plt.xlabel('Threshold')
plt.ylabel('F1 Score')
plt.axvline(x=float(max_score_row.iloc[0].threshold), color='r', linestyle='--')

    threshold  precision    recall        f1
47       0.47   0.767932  0.947917  0.848485
5-fold cross-validation
# Combine the train and validation sets for cross-validation
X_train_full = np.concatenate([X_train_encoded, X_val_encoded], axis=0)
y_train_full = np.concatenate([y_train, y_val], axis=0)
print(f"X_train_full shape: {X_train_full.shape}")
print(f"y_train_full shape: {y_train_full.shape}")
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
auc_scores = []
for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_full), 1):
    X_fold_train = X_train_full[train_idx]
    X_fold_val = X_train_full[val_idx]
    y_fold_train = y_train_full[train_idx]
    y_fold_val = y_train_full[val_idx]

    model_fold = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
    model_fold.fit(X_fold_train, y_fold_train)

    y_fold_pred = model_fold.predict_proba(X_fold_val)[:, 1]
    auc_fold = roc_auc_score(y_fold_val, y_fold_pred)
    auc_scores.append(auc_fold)
    print(f"Fold {fold}: AUC = {auc_fold:.4f}")
# Calculate statistics
auc_scores = np.array(auc_scores)
mean_auc = auc_scores.mean()
std_auc = auc_scores.std()
print(f"Mean AUC: {mean_auc:.4f} +- {std_auc:.4f}")X_train_full shape: (1169, 31)
y_train_full shape: (1169,)
Fold 1: AUC = 0.8067
Fold 2: AUC = 0.8068
Fold 3: AUC = 0.8648
Fold 4: AUC = 0.8334
Fold 5: AUC = 0.8154
Mean AUC: 0.8254 +- 0.0220
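For reference, the same estimate can be computed more compactly with scikit-learn's cross_val_score (a sketch reusing X_train_full, y_train_full, and kfold from above):

from sklearn.model_selection import cross_val_score

cv_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
# scoring='roc_auc' evaluates ROC AUC on each held-out fold
cv_aucs = cross_val_score(cv_model, X_train_full, y_train_full, cv=kfold, scoring='roc_auc')
print(f"Mean AUC: {cv_aucs.mean():.4f} +- {cv_aucs.std():.4f}")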
Hyperparameter tuning
# C is the inverse of regularization strength: smaller C means stronger regularization
C_values = [0.000001, 0.001, 1]
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
results = {}
for C in C_values:
    print(f"Testing C = {C}")
    auc_scores = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_full), 1):
        X_fold_train = X_train_full[train_idx]
        X_fold_val = X_train_full[val_idx]
        y_fold_train = y_train_full[train_idx]
        y_fold_val = y_train_full[val_idx]

        model_fold = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
        model_fold.fit(X_fold_train, y_fold_train)

        y_fold_pred = model_fold.predict_proba(X_fold_val)[:, 1]
        auc_fold = roc_auc_score(y_fold_val, y_fold_pred)
        auc_scores.append(auc_fold)

    auc_scores = np.array(auc_scores)
    mean_auc = round(auc_scores.mean(), 3)
    std_auc = round(auc_scores.std(), 3)
    results[C] = {'mean': mean_auc, 'std': std_auc}
    print(f"C = {C}: mean AUC = {mean_auc}, std = {std_auc}")
for C in C_values:
    print(f"C = {C:>8}: mean = {results[C]['mean']:.3f}, std = {results[C]['std']:.3f}")

# Pick the C with the highest mean AUC, breaking ties with the smallest C
best_mean = max(r['mean'] for r in results.values())
candidates = [C for C in C_values if results[C]['mean'] == best_mean]
best_C = min(candidates)
print(f"Best C: {best_C} (mean = {results[best_C]['mean']:.3f}, std = {results[best_C]['std']:.3f})")

Testing C = 1e-06
C = 1e-06: mean AUC = 0.543, std = 0.025
Testing C = 0.001
C = 0.001: mean AUC = 0.864, std = 0.014
Testing C = 1
C = 1: mean AUC = 0.825, std = 0.022
C = 1e-06: mean = 0.543, std = 0.025
C = 0.001: mean = 0.864, std = 0.014
C = 1: mean = 0.825, std = 0.022
Best C: 0.001 (mean = 0.864, std = 0.014)
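As a possible final step (a sketch, not part of the results above): retrain with the selected C on the combined train+validation data and score the held-out test set, encoding X_test with the DictVectorizer fitted earlier.

test_dicts = X_test.to_dict(orient='records')
X_test_encoded = dv.transform(test_dicts)

final_model = LogisticRegression(solver='liblinear', C=best_C, max_iter=1000)
final_model.fit(X_train_full, y_train_full)

y_test_pred = final_model.predict_proba(X_test_encoded)[:, 1]
print(f"Test AUC: {roc_auc_score(y_test, y_test_pred):.4f}")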