Part III: Building the BikeSaferPA predictive model¶


In this notebook I will construct BikeSaferPA, a model designed to predict whether or not a cyclist suffered serious injury or fatality. I begin by importing necessary libraries.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

from IPython.display import display

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth',None)

import scipy.stats as ss
import shap

from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier

Navigation:¶

  • Data and features
    • Feature descriptions
    • Missing data
    • Preparing dataframes
    • Transformations for periodic features
  • Metrics for scoring
  • Logistic Regression models
    • ClassifierStudy class
    • Choosing how to encode periodic features
    • Feature selection using model coefficients
    • Baseline cross-validation scores
    • Tuning hyperparameters
    • Choosing classification threshold
  • Gradient Boosted Decision Tree models
    • Baseline cross-validation scores
    • Tuning hyperparameters
    • Choosing classification threshold
  • Selecting the BikeSaferPA model
  • Evaluating models on the hold-out test set
  • Explaining BikeSaferPA's decisions via SHAP values
    • SHAP plots for BikeSaferPA
    • Discussion of feature importance from SHAP
  • Summary of model design process and results
  • Reflections on model performance
  • Conclusions based on SHAP values
  • Policy recommendations based on BikeSaferPA

Data and features¶

I'll read in the dataset and create some additional binary features that I'll use in the models.

In [2]:
cyclists = pd.read_csv('cyclists.csv')
In [3]:
# Define a binary feature HILL to replace GRADE
cyclists['HILL'] = cyclists.GRADE.isin(['downhill','uphill','bottom_hill','top_hill']).astype(int)

# Missing SEX data will be imputed in the pipeline via a groupwise mode (computed with dropna=False).
# When multiple modes exist, mode() must sort them, and 'F'/'M' can't be compared with np.nan;
# the float encoding avoids this, since among floats np.nan sorts after 0 and 1.
cyclists['FEMALE'] = cyclists['SEX'].replace({'F':1,'M':0})

Feature descriptions¶

I'll start with the following model features:

  • Input features:
    • Categorical features: these I'll encode as one-hot vectors in the pipeline. Here are their respective categories:

      • RESTRAINT_HELMET - 'no_restraint', 'unknown', 'bicycle_helmet', 'motorcycle_helmet','helmet_improper'
      • URBAN_RURAL - 'urban', 'urbanized', 'rural'
      • ILLUMINATION - 'daylight', 'dark_lit', 'dusk', 'dark_unlit', 'dawn', 'other'
      • ROAD_CONDITION - 'dry', 'wet', 'other', 'slush', 'snow', 'water', 'ice_frost', 'mud_dirt_gravel'
      • VEH_ROLE - 'struck', 'striking', 'non_collision', 'striking_struck'
      • WEATHER - 'clear', 'rain', 'snow', 'sleet_hail', 'fog_smog_smoke', 'other', 'blowing_snow', 'cloudy', 'freezing_rain'
      • COLLISION_TYPE - 'sideswipe_same_dir', 'angle', 'other', 'head_on', 'hit_ped', 'non_collision', 'rear_end', 'sideswipe_opp_dir', 'backing', 'hit_fixed_obj'
      • IMPACT_SIDE - 'left', 'front', 'unknown', 'right', 'rear', 'non_collision', 'top','undercarriage'
      • TCD_TYPE - 'not_applicable', 'stop_sign', 'traffic_signal', 'flashing_traffic_signal', 'yield_sign', 'other', 'passive_RR_controls','officer_or_flagman', 'active_RR_controls'
      • TCD_FUNC_CD - 'no_controls', 'functioning_properly', 'functioning_improperly','not_functioning','emergency_preemptive_signal'
    • Numerical features: AGE, SPEED_LIMIT, CRASH_YEAR

    • Ordinal features: BUS_COUNT, COMM_VEH_COUNT, HEAVY_TRUCK_COUNT, SMALL_TRUCK_COUNT, SUV_COUNT, VAN_COUNT (note: these are almost binary - they take values 0, 1, 2 and very few samples have one of these features equal to 2)

    • Binary features: None of these have missing values except FEMALE. Some of these are self-explanatory, but for others I clarify here what they indicate.

      • FEMALE - the cyclist was reported as female. Has missing values inherited from missing values in SEX
      • HILL - the cyclist was traveling uphill or downhill, or was at the top or the bottom of a hill
      • NON_INTERSECTION - the crash occurred mid-block
      • CURVED_ROAD - the crash occurred on a curved section of roadway
      • Driver (at least one driver in the crash met the condition or behavior criteria): ALCOHOL_RELATED, CURVE_DVR_ERROR, DRINKING_DRIVER, DRUGGED_DRIVER, DRUG_RELATED, FATIGUE_ASLEEP, DISTRACTED, AGGRESSIVE_DRIVING, CELL_PHONE, LANE_DEPARTURE, NO_CLEARANCE, NHTSA_AGG_DRIVING, CROSS_MEDIAN, RUNNING_RED_LT, RUNNING_STOP_SIGN, TAILGATING, MATURE_DRIVER, YOUNG_DRIVER, SPEEDING_RELATED
        • Note that AGGRESSIVE_DRIVING requires a driver to take at least one action classified as aggressive driving. NHTSA_AGG_DRIVING requires a driver to take at least two different such actions.
        • Note that DRUGGED_DRIVER and DRINKING_DRIVER refer specifically to motor vehicle drivers, while all other condition/behavior flags can refer to any kind of driver, including cyclists.
      • Other flags: SUDDEN_DEER, WORK_ZONE
    • Target feature: SERIOUS_OR_FATALITY - a binary feature. This is what I want to predict.

      • 1 means the cyclist is known to have suffered a serious injury or fatality as a result of the crash
      • 0 means the cyclist did not (perhaps they suffered a minor injury, no injury, or have unknown injury status)

Throughout this notebook I'll use several different feature sets. I'll store them in a single dictionary. The first key:value pair will correspond to this full feature set.

In [4]:
# Feature subsets
feat_dict = {}
feat_dict['all'] = {'cyc':['DAY_OF_WEEK','HOUR_OF_DAY'],
                    'ord':['BUS_COUNT','HEAVY_TRUCK_COUNT','COMM_VEH_COUNT',
                           'SMALL_TRUCK_COUNT','SUV_COUNT','VAN_COUNT'],
                    'cat':['RESTRAINT_HELMET','VEH_ROLE',
                           'URBAN_RURAL','ILLUMINATION',
                           'ROAD_CONDITION','WEATHER',
                           'COLLISION_TYPE','IMPACT_SIDE',
                           'TCD_TYPE','TCD_FUNC_CD'],
                    'group':['MUNICIPALITY','COUNTY','CRASH_MONTH'],
                    'num':['AGE','SPEED_LIMIT','CRASH_YEAR'],
                    'bin':['FEMALE','HILL',
                           'NON_INTERSECTION','CURVED_ROAD',
                           'ALCOHOL_RELATED','CURVE_DVR_ERROR',
                          'DRINKING_DRIVER','DRUGGED_DRIVER','DRUG_RELATED',
                          'DISTRACTED','FATIGUE_ASLEEP',
                          'AGGRESSIVE_DRIVING','CELL_PHONE','LANE_DEPARTURE',
                          'NO_CLEARANCE','NHTSA_AGG_DRIVING','CROSS_MEDIAN',
                          'RUNNING_RED_LT','RUNNING_STOP_SIGN','TAILGATING',
                          'SPEEDING_RELATED',
                          'MATURE_DRIVER','YOUNG_DRIVER',
                          'SUDDEN_DEER','WORK_ZONE']
                     }
TARGET = 'SERIOUS_OR_FATALITY'

Missing values¶

A few of the above features have missing values, which I will impute via the machine learning pipeline:

  • I will impute missing AGE and HOUR_OF_DAY values using a groupwise aggregation, which will be accomplished by a custom GroupImputer class:
    • Impute AGE via grouping by COUNTY and aggregating via median (I considered using median by MUNICIPALITY, but 475 out of the 1578 municipalities only have one sample each)
    • Impute HOUR_OF_DAY via grouping by (ILLUMINATION, CRASH_MONTH) and aggregating via mode.
  • I will impute all others via global mode (using SimpleImputer)
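
For illustration, here is a minimal sketch of what such a groupwise imputer might look like as a scikit-learn-style transformer. The class name GroupImputerSketch and its details are illustrative stand-ins, not necessarily the GroupImputer implementation used in the pipeline.

# Illustrative sketch only: a scikit-learn-style transformer for groupwise imputation.
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class GroupImputerSketch(BaseEstimator, TransformerMixin):
    """Impute missing values of `target` using a groupwise aggregate over `group_cols`."""

    def __init__(self, group_cols, target, agg='median'):
        self.group_cols = group_cols
        self.target = target
        self.agg = agg

    def _aggregate(self, s):
        # mode() returns a (possibly empty) Series; fall back to NaN if empty
        if self.agg == 'mode':
            m = s.mode()
            return m.iloc[0] if not m.empty else np.nan
        return s.agg(self.agg)

    def fit(self, X, y=None):
        # learn one fill value per group, plus a global fallback for unseen groups
        self.fill_map_ = (X.groupby(self.group_cols)[self.target]
                           .apply(self._aggregate).rename('_fill').reset_index())
        self.global_fill_ = self._aggregate(X[self.target])
        return self

    def transform(self, X):
        X = X.copy()
        fills = (X[self.group_cols]
                 .merge(self.fill_map_, on=self.group_cols, how='left')['_fill']
                 .fillna(self.global_fill_)
                 .to_numpy())
        X[self.target] = X[self.target].fillna(pd.Series(fills, index=X.index))
        return X

# e.g. impute AGE by county-wise median (illustrative usage):
# X_imputed = GroupImputerSketch(group_cols=['COUNTY'], target='AGE').fit_transform(X_train)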

Dataframe preparation¶

I separate features into several categories so that the pipeline handles each type properly. Note that the features in GROUP_FEATURES will be used in the missing data imputation process but won't be used as model features. Also, I'll do a train_test_split to set aside a holdout set (X_test, y_test) which I won't touch again until evaluating the final models at the end of the study.

In [5]:
df = cyclists[[feat for feat_type in feat_dict['all'] for feat in feat_dict['all'][feat_type]]+[TARGET]]
X = df.drop(TARGET,axis=1)
y= df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=0.2,random_state=42)

Transformations for periodic features¶

Periodic encoding can be important for features that are cyclic in nature, so that the model can learn the relationship between the highest and lowest values (e.g. hour 0 is close to hour 23, chronologically). I examine several options for transforming the periodic features DAY_OF_WEEK and HOUR_OF_DAY:

  • Using one-hot encoding
  • Using ordinal encoding
  • Using sine and cosine transformations. This pair effectively maps the feature onto the unit circle in the plane. This produces the fewest features while still encoding cyclic relationships - e.g. hour 23 is close to hour 0.
  • Using periodic splines, with an adjustable parameter n_splines. I will end up setting n_splines = 12 for HOUR_OF_DAY (roughly half its period of 24) and n_splines = 7 for DAY_OF_WEEK (its full period). Note that this results in fewer features than one-hot encoding, while allowing the model to be more expressive than sine and cosine encoding.
  • Using a hybrid method which attempts to capture the interaction between HOUR_OF_DAY and whether the day is on a weekend or not:
    • spline or trig encoding for HOUR_OF_DAY
    • binary encoding for whether DAY_OF_WEEK is on a weekend or not
    • follow up with polynomial features that include interaction terms
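
For concreteness, here is a rough sketch of the sine/cosine and periodic spline encoders built from scikit-learn's FunctionTransformer and SplineTransformer. The helper names below are illustrative; the pipeline's actual encoders may differ in details.

# Rough sketch of the two periodic encodings (illustrative helpers only).
import numpy as np
from sklearn.preprocessing import FunctionTransformer, SplineTransformer

def sin_cos_transformer(period):
    """Map a periodic feature onto the unit circle via (sin, cos) columns."""
    return FunctionTransformer(
        lambda X: np.c_[np.sin(2 * np.pi * np.asarray(X) / period),
                        np.cos(2 * np.pi * np.asarray(X) / period)]
    )

def periodic_spline_transformer(period, n_splines, degree=3):
    """Periodic B-spline basis producing n_splines features over one period."""
    n_knots = n_splines + 1  # the last knot repeats the first for periodic splines
    return SplineTransformer(
        degree=degree,
        n_knots=n_knots,
        knots=np.linspace(0, period, n_knots).reshape(n_knots, 1),
        extrapolation='periodic',
        include_bias=True,
    )

# Example on HOUR_OF_DAY (values 0-23, period 24):
hours = np.arange(24).reshape(-1, 1)
print(sin_cos_transformer(24).fit_transform(hours).shape)                        # (24, 2)
print(periodic_spline_transformer(24, n_splines=12).fit_transform(hours).shape)  # (24, 12)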

Scoring metrics¶

I shall use a collection of features in the 'cyclists' dataset to predict the target variable SERIOUS_OR_FATALITY, a binary variable which indicates whether the cyclist suffered serious injury or fatality as a result of the crash. For the discussion that follows, I refer to those with SERIOUS_OR_FATALITY==1 as the positive class and those with SERIOUS_OR_FATALITY==0 as the negative class.

This is an imbalanced classification problem, since only about 7.2% of samples are in the positive class. For such tasks, accuracy score (or equivalently, error rate) is not a reasonable metric to use on its own. In particular, a classifier that predicts SERIOUS_OR_FATALITY==0 for every sample would have an expected accuracy score of 92.8% - which seems quite respectable at first glance - without detecting any of the serious injuries or fatalities. There are some options for metrics that give a more nuanced view of the classifier's performance:

  1. The confusion matrix, a 2x2 matrix which reports the number of TP (true positives), FP (false positives), TN (true negatives) and FN (false negatives).

  2. The precision score, or positive predictive value: $\displaystyle \frac{TP}{TP+FP}$ which is between $0$ and $1$, where $1$ is best ($FP = 0$)

  3. The recall score, or true positive rate: $\displaystyle \frac{TP}{TP+FN}$ which is between $0$ and $1$, where $1$ is best ($FN = 0$)

  4. The specificity, or true negative rate: $\displaystyle \frac{TN}{TN+FP}$. This can be thought of as a recall score for the negative class, and can be computed in scikit-learn by passing pos_label=0 into recall_score.

  5. The negative predictive value: $\displaystyle \frac{TN}{TN+FN}$. This can be thought of as a precision score for the negative class, and can be computed in scikit-learn by passing pos_label=0 into precision_score.

  6. $F_1$-score, which is the harmonic mean of the precision and recall scores: $$F_1 = \frac{2}{(\text{precision})^{-1}+(\text{recall})^{-1}} = \frac{2 \cdot \text{precision}\cdot \text{recall}}{\text{precision}+\text{recall}} = \frac{2\cdot TP}{2\cdot TP+FP+FN}$$ $F_1$ is between $0$ and $1$, where $1$ is best (both precision and recall are equal to 1) and $0$ is the worst (precision is $0$ or recall is $0$). The $F_1$ score has two significant limitations:

    • Precision and recall are weighted evenly, whereas in reality false negatives may be much more costly than false positives (and vice versa). This issue is accounted for by the $F_{\beta}$ score below.
    • The $F_1$ score ignores the number of true negatives entirely. This makes it difficult to interpret consistently when the class imbalance ratio changes. In particular, datasets with very imbalanced classes (such as ours) can cause classifiers to have misleading $F_1$ scores. This is still an issue with any $F_{\beta}$ score.

    Note that one can actually compute a separate $F_1$ score for each class, by treating each class in turn as the positive class. The scikit-learn "classification report" functionality reports an $F_1$ score for each class in exactly this way.

  7. More generally, the $F_{\beta}$-score, which allows one to weight recall and precision differently by adjusting a parameter $\beta \geq 0$. This may be desirable if high precision is more important than high recall or vice versa. $$F_{\beta} = (1+\beta^2) \cdot \frac{\text{precision}\cdot \text{recall}}{\beta^2 \cdot \text{precision}+\text{recall}} = \frac{(1+\beta^2)\cdot TP}{(1+\beta^2)\cdot TP+FP+\beta^2\cdot FN}$$ Note that when $\beta=0$, $F_{\beta}$ equals the precision score, and as $\beta \rightarrow \infty$, $F_{\beta}$ approaches the recall score. When $\beta = 1$, we recover the usual $F_1$ score. We should view $\beta$ as the ratio of the perceived importance of recall to that of precision.

  8. Another modification of the $F_1$ score that helps address class imbalance is the weighted average $F_1$ score. One can compute a $F_1$ score for each class separately (varying which class is treated as the positive class), and then take the weighted average of these $F_1$ scores, weighted by the total number of samples in each class. Note that in this case, it is possible to produce a weighted average $F_1$ score that does not lie between the precision and recall scores.

  9. The ROC-AUC score is the area under the ROC (receiver operating characteristic) curve. The ROC curve is the plot of the true positive rate (i.e. the recall $TP/(TP+FN)$) vs. the false positive rate (i.e. $\displaystyle FP/(FP+TN)$) of a model at various values of the prediction probability threshold from $0$ to $1$. The ROC-AUC score of a random classifier (i.e. a classifier which is expected to be correct $50 \%$ of the time) will tend towards $0.5$ as the number of samples increases. A perfectly correct classifier has ROC-AUC score equal to $1$, and a perfectly incorrect classifier has score equal to $0$. Intuitively, the ROC-AUC score can be interpreted as the probability that, given a randomly chosen positive sample and negative sample, the classifier will rank their prediction probabilities correctly. Generally speaking, ROC-AUC scores between $0.7-0.8$ are acceptable, those between $0.8-0.9$ are excellent, and those above $0.9$ are outstanding. Note that:

    1. The ROC-AUC score can help one to choose between classifiers
    2. Given a classifier, the ROC curve of that classifier can help one to choose between different options for prediction threshold values:
      • if two threshold values give the same true positive rate (y-value) but different false positive rates (x-values) then the one with the smaller false positive rate is a better choice.
      • if two threshold values give the same false positive rate (x-value) but different true positive rates (y-values) then the one with the larger true positive rate is a better choice.

I will initially use the ROC-AUC score in the model selection and hyperparameter tuning processes, and then assess $F_1$ and $F_{\beta}$ scores for particular decision threshold choices for the best models.
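
As a quick reference, the snippet below shows how these metrics can be computed with scikit-learn on a toy set of labels and predicted probabilities (y_true and y_prob are placeholders here, not objects from this notebook).

# Illustrative computation of the metrics above on toy predictions.
import numpy as np
from sklearn.metrics import (confusion_matrix, precision_score, recall_score,
                             f1_score, fbeta_score, roc_auc_score)

y_true = np.array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
y_prob = np.array([0.1, 0.2, 0.15, 0.3, 0.4, 0.05, 0.6, 0.35, 0.7, 0.8])
y_pred = (y_prob >= 0.5).astype(int)  # default classification threshold of 0.5

print(confusion_matrix(y_true, y_pred))              # rows: true class, cols: predicted class
print(precision_score(y_true, y_pred))               # TP / (TP + FP)
print(recall_score(y_true, y_pred))                  # TP / (TP + FN)
print(recall_score(y_true, y_pred, pos_label=0))     # specificity (recall of the negative class)
print(f1_score(y_true, y_pred, average='weighted'))  # weighted average F_1
print(fbeta_score(y_true, y_pred, beta=3))           # recall weighted 3x as heavily as precision
print(roc_auc_score(y_true, y_prob))                 # threshold-independent ranking metric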

Logistic regression models¶

Custom class for constructing, fitting, and evaluating classifier pipelines¶

I have streamlined my model selection and evaluation process by creating a custom ClassifierStudy class which contains methods to:

  • Build the model pipeline (build_pipeline)
  • Fit the model pipeline (fit_pipeline)
  • Score a fitted model pipeline (score_pipeline)
  • Run k-fold (repeated) cross validation, to evaluate models (cv_score)
  • Run randomized search with cross validation, to tune hyperparameters (randomized_search)
  • Compute and plot SHAP (SHapley Additive exPlanation) values for features (shap_values, shap_plot)
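
To give a sense of the structure without reproducing lib.study_classif, a bare-bones stand-in built from standard scikit-learn pieces might look like the sketch below; the real ClassifierStudy additionally handles the custom imputers, periodic encoders, scoring options, threshold selection, and SHAP utilities used throughout this notebook.

# Bare-bones illustrative skeleton, not the actual ClassifierStudy implementation.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

class ClassifierStudySketch:
    """Build a preprocessing + classifier pipeline from a feature dictionary and cross-validate it."""

    def __init__(self, classifier, X, y, features):
        self.classifier, self.X, self.y, self.features = classifier, X, y, features

    def build_pipeline(self):
        preprocessor = ColumnTransformer([
            ('num', Pipeline([('impute', SimpleImputer(strategy='median')),
                              ('scale', StandardScaler())]), self.features['num']),
            ('cat', Pipeline([('impute', SimpleImputer(strategy='most_frequent')),
                              ('onehot', OneHotEncoder(handle_unknown='ignore'))]), self.features['cat']),
            ('bin', SimpleImputer(strategy='most_frequent'), self.features['bin']),
        ], remainder='drop')
        self.pipe = Pipeline([('pre', preprocessor), ('clf', self.classifier)])

    def cv_score(self, scoring='roc_auc', n_splits=5, n_repeats=3, random_state=42):
        cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
        return cross_val_score(self.pipe, self.X, self.y, scoring=scoring, cv=cv).mean()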

Choosing how to encode periodic features¶

As described previously, I've included several options for how to encode the periodic features HOUR_OF_DAY and DAY_OF_WEEK, some of which express their periodic nature (sine-and-cosine encoding, periodic spline encoding) and others which don't (one-hot encoding, ordinal encoding, and leaving them out entirely). Furthermore, I have included hybrid methods which encode HOUR_OF_DAY using either trig or spline encoding, encode DAY_OF_WEEK as a binary variable IS_WEEKEND, and then use polynomial combinations of these.

I'll use repeated stratified cross-validation (with five splits and three repeats) to evaluate these methods and choose a winner.

In [6]:
from lib.study_classif import ClassifierStudy
In [ ]:
cyc_results = {}
for method in [None,'ord','onehot','trig','spline','interact-trig','interact-spline']:
    clf = LogisticRegression(max_iter=1000,solver='newton-cholesky',class_weight='balanced')
    study = ClassifierStudy(classifier = clf, X = X_train,y = y_train,features = feat_dict['all'])
    study.build_pipeline(cyc_method = method)
    cyc_results[method] = study.cv_score(print_mean_score = False, return_mean_score = True)
pd.DataFrame(cyc_results,index=['mean of cv scores']).sort_values(axis=1,by='mean of cv scores',ascending=False)

The results are quite similar, with periodic spline encoding giving a slight edge over the other methods. Going forward, we'll use periodic spline encoding for the periodic features HOUR_OF_DAY and DAY_OF_WEEK, which is the default method in the cv_score method of ClassifierStudy.

Feature selection via Logistic Regression model coefficients¶

One significant advantage of logistic regression is its interpretability. Recall that in logistic regression, the log of the odds $\ln\left( \frac{P(y=1)}{P(y=0)}\right)$, or 'log-odds', is modeled as a linear function of the input features. Therefore, the coefficient of a particular feature can be interpreted as the rate of change of the log-odds with respect to that feature, i.e. the change in log-odds induced by a 1-unit change in that feature.
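
Equivalently, exponentiating a coefficient gives the multiplicative effect of a 1-unit feature change on the odds themselves. A toy illustration (the coefficient value below is made up, not taken from the fitted model):

import numpy as np

coef = 0.6                       # hypothetical log-odds coefficient for some feature
odds_multiplier = np.exp(coef)
print(f'{odds_multiplier:.2f}')  # ~1.82: a 1-unit increase multiplies the odds P(y=1)/P(y=0) by ~1.82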

I begin by fitting a couple of baseline logistic regression models so that I can examine their coefficients in the trained model. LRStudy is a child class of ClassifierStudy which is intended for pipelines ending in an instance of sklearn.linear_model.LogisticRegression. This child class has an additional method, plot_coeff, which fits a model and displays a bar plot of the coefficient values associated to the various model features.

In [16]:
from lib.study_classif import LRStudy
  • I'll examine two different baseline models:
    • The Newton-Cholesky solver can perform well with large datasets which have lots of sparse binary features. Note that some of our binary features are quite sparse, and I will produce many more when I apply one-hot encoding to the categorical features. Newton-Cholesky is only compatible with L2-regularization. This will be the first baseline model.
    • The SAGA solver is a version of the stochastic average gradient optimization method which allows the use of L1-regularization. The second model will be a SAGA model with L1 regularization only. L1 regularization tends to push models towards sparse feature importance (i.e. some coefficients exactly zero) - this would identify features that I might be able to eliminate without a decrease in performance.
  • Regardless of solver, I'll use class_weight='balanced' - this can help linear models perform better in the presence of unbalanced target classes. The imbalance is around 93%-7%, which is not particularly extreme. With a more extreme class imbalance, I would consider minority class oversampling methods (e.g. random oversampling, SMOTE, ADASYN, etc.).
    • It's important to note that although class_weight='balanced' often improves performance of a logistic regression model in the presence of class imbalance, it will also generally result in poor estimates of individual class probabilities. Ordinary logistic regression produces well-calibrated class probabilities, but this isn't necessarily the case if the class_weight='balanced' option is used because the loss function is modified.
Newton-Cholesky solver (L2-regularization only)¶
In [17]:
clf = LogisticRegression(max_iter=1000,solver='newton-cholesky',class_weight='balanced')
study = LRStudy(clf,X_train,y_train,feat_dict['all'])
study.build_pipeline()
study.plot_coeff(title_add = 'L2-regularization only')
Score on validation set: 0.72338032152522

The horizontal line drawn at y=0 is intended to help distinguish between features which have positive and negative effects on the log-odds. There are no features with zero coefficients, but many have very small coefficients. They don't have a significant influence on the model and L1 regularization will likely push them to zero.

SAGA solver (L1-regularization only)¶

I start with an out-of-the-box L1-regularized logistic regression classifier.

In [18]:
clf = LogisticRegression(max_iter=1000,solver='saga',penalty='l1',class_weight='balanced')
study = LRStudy(clf,X_train,y_train,feat_dict['all'])
study.build_pipeline()
study.plot_coeff(title_add = 'L1-regularization only')
Score on validation set: 0.7230131258396489

I see that some features indeed have zero coefficients, but there remain some nonzero coefficients that are quite small. If I decrease the hyperparameter C (the inverse of the regularization strength), I can push more of them to zero. The run above used the default value of 1.0; next I'll try 0.1.

In [19]:
clf = LogisticRegression(max_iter=1000,solver='saga',penalty='l1',class_weight='balanced',C=0.1)
study = LRStudy(clf,X_train,y_train,feat_dict['all'])
study.build_pipeline()
study.plot_coeff(title_add = 'stronger L1-regularization')
Score on validation set: 0.7272700246189844

I've pushed lots more coefficients to zero, and also raised the validation score. In particular, the following have zero coefficients in the model:

  • All one-hot features associated to WEATHER and ROAD_CONDITION
  • The binary features DISTRACTED, WORK_ZONE, SUDDEN_DEER, TAILGATING, RUNNING_STOP_SIGN, CELL_PHONE, FATIGUE_ASLEEP, DRUGGED_DRIVER, CURVE_DVR_ERROR
  • The ordinal feature BUS_COUNT
  • Some, but not all, one-hot features associated to TCD features, ILLUMINATION, COLLISION_TYPE, IMPACT_SIDE, RESTRAINT_HELMET, VEH_ROLE, URBAN_RURAL.

In a model with L1-regularization at least this strong, it appears safe to remove the features in the first three bullet points above without performance loss. I will keep TAILGATING, RUNNING_STOP_SIGN, and CURVE_DVR_ERROR, however - I want to see how they interact with AGGRESSIVE_DRIVING. Additionally, I will create a few salient binary features from TCD_TYPE and then drop TCD_TYPE and TCD_FUNC_CD as well.

I'll go ahead and add a reduced feature set, with the aforementioned features removed, to the feature dictionary, and make sure the SAGA model tolerates dropping them.

In [7]:
for tcd_type in ['traffic_signal','flashing_traffic_signal','stop_sign']:
    cyclists[f'TCD_{tcd_type}'] = (cyclists.TCD_TYPE==tcd_type).astype(int)
# cyclists['TCD_functioning'] = (cyclists.TCD_FUNC_CD=='functioning_properly').astype(int)

feat_dict['all']['bin'] += [f'TCD_{tcd_type}' for tcd_type in ['traffic_signal','flashing_traffic_signal','stop_sign']]

df = cyclists[[feat for feat_type in feat_dict['all'] for feat in feat_dict['all'][feat_type]]+[TARGET]]
X = df.drop(TARGET,axis=1)
y= df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=0.2,random_state=42)

feat_dict['lr_small'] = {}

drop_feat = ['BUS_COUNT','WEATHER','ROAD_CONDITION',
             'TCD_TYPE','TCD_FUNC_CD',
             'DISTRACTED','DRUGGED_DRIVER','DRINKING_DRIVER',
            'FATIGUE_ASLEEP', 'CELL_PHONE',
            'SUDDEN_DEER', 'WORK_ZONE']

for feat_type in feat_dict['all']:
    feat_dict['lr_small'][feat_type] = [feat for feat in feat_dict['all'][feat_type] if feat not in drop_feat]

for tcd_type in ['traffic_signal','flashing_traffic_signal','stop_sign']:
    feat_dict['all']['bin'].remove(f'TCD_{tcd_type}')
In [21]:
clf = LogisticRegression(max_iter=1000,solver='saga',penalty='l1',class_weight='balanced',C=0.1)
study = LRStudy(clf,X_train,y_train,features=feat_dict['lr_small'])
study.build_pipeline(cyc_method=None)
study.plot_coeff('stronger L1-regularization and fewer features')
Score on validation set: 0.729097968130307
In [22]:
clf = LogisticRegression(max_iter=1000,solver='saga',penalty='l1',class_weight='balanced',C=0.1)
study = LRStudy(clf,X_train,y_train,features=feat_dict['lr_small'])
study.build_pipeline()
study.plot_coeff('stronger L1-regularization and fewer features')
Score on validation set: 0.7285829299805233

Indeed no loss in validation score! I'll take a closer look at the features with largest coefficients, as they are most impactful to the model. I'll list just those whose coefficient magnitude is greater than 0.25:

In [23]:
large_coeff = study.coeff[np.abs(study.coeff['coeff value'])>0.25].sort_values(by='coeff value',ascending=False)
large_coeff.style.background_gradient(axis=0,gmap=-large_coeff['coeff value'],cmap='RdBu')
Out[23]:
  coeff value
feature name  
SPEEDING_RELATED 0.875896
COMM_VEH_COUNT 0.839971
HEAVY_TRUCK_COUNT 0.781875
DRUG_RELATED 0.764003
HILL 0.621782
ALCOHOL_RELATED 0.573973
SMALL_TRUCK_COUNT 0.453031
RUNNING_RED_LT 0.446169
IMPACT_SIDE_front_left 0.441555
VEH_ROLE_striking_struck 0.399137
CURVED_ROAD 0.393789
DRINKING_DRIVER 0.381857
URBAN_RURAL_rural 0.334569
VAN_COUNT 0.297721
SUV_COUNT 0.296763
ILLUMINATION_dark_unlit 0.281212
ILLUMINATION_dawn 0.278546
COLLISION_TYPE_head_on 0.271771
FEMALE -0.257571
ILLUMINATION_dusk -0.261101
VEH_ROLE_striking -0.266743
URBAN_RURAL_urban -0.350611
COLLISION_TYPE_sideswipe_opp_dir -0.370684
AGGRESSIVE_DRIVING -0.401452
IMPACT_SIDE_front_right -0.420935
RESTRAINT_HELMET_unknown -0.529896
COLLISION_TYPE_sideswipe_same_dir -0.612924
NO_CLEARANCE -0.741798

Five-fold repeated cross-validation with baseline models¶

I'll establish some CV-scores for the two baseline models, and I'll go ahead and use the reduced feature set for both. Here I use five-fold cross-validation repeated three times (i.e. RepeatedStratifiedKFold with n_splits=5 and n_repeats=3.)

Newton-Cholesky solver¶
In [53]:
%%time
clf = LogisticRegression(max_iter=1000,solver='newton-cholesky',class_weight='balanced',C=0.1)
study_nc = LRStudy(clf,X_train,y_train,feat_dict['lr_small'])
study_nc.build_pipeline()
study_nc.cv_score()
Mean CV roc_auc score: 0.7372753688357127
CPU times: user 7.53 s, sys: 203 ms, total: 7.73 s
Wall time: 5.71 s
SAGA solver with L1-regularization¶
In [92]:
%%time
clf = LogisticRegression(max_iter=1000,solver='saga',penalty='l1',class_weight='balanced',C=0.1)
study_saga = LRStudy(clf,X_train,y_train,feat_dict['lr_small'])
study_saga.build_pipeline()
study_saga.cv_score()
Mean CV roc_auc score: 0.7360723698622664
CPU times: user 9.16 s, sys: 93.3 ms, total: 9.25 s
Wall time: 9.26 s

The two models seem to perform comparably despite the different regularization methods. Since Newton-Cholesky provides a slightly better CV score, I'll proceed with only that one going forward.

Dimensionality reduction with PCA¶

Linear models like logistic regression can often perform well if the data is pre-processed with principal component analysis (PCA) to reduce the dimensionality and focus on the axes with the highest variance. However, the optimal number of components in PCA depends on the dataset. I'll use randomized search cross validation to attempt to find an optimal value of n_components.
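
For reference, a PCA step slots into the pipeline between the preprocessing and the classifier, roughly as sketched below. This is illustrative only: the StandardScaler stands in for the full preprocessing step, build_pipeline(pca=True) handles this internally, and n_components=76 is just a placeholder value to be tuned.

# Illustrative sketch of PCA as a pipeline step before the classifier.
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pca_pipe = Pipeline([
    ('scale', StandardScaler()),    # PCA is sensitive to feature scale
    ('pca', PCA(n_components=76)),  # placeholder; tuned via randomized search below
    ('LR_clf', LogisticRegression(max_iter=1000, solver='newton-cholesky',
                                  class_weight='balanced', C=0.1)),
])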

I'll check the number of features in the transformed X_train, so I know a maximum size for n_components.

In [58]:
study_saga.pipe[:-1].fit_transform(X_train).shape[1]
Out[58]:
85
In [59]:
%%time
# Use the randomized_search function in order to try a range of n_components values
clf = LogisticRegression(max_iter=1000,solver='newton-cholesky',class_weight='balanced',C=0.1)
study_nc_pca = LRStudy(clf,X_train,y_train,feat_dict['lr_small'])
study_nc_pca.build_pipeline(pca=True)
study_nc_pca.randomized_search(params={},n_components = list(range(60,86)),n_iter=25)
  mean cv score (roc_auc)
pca__n_components  
76 0.738291
61 0.738240
69 0.738231
74 0.738227
79 0.738211
77 0.738211
84 0.738211
83 0.738211
80 0.738211
85 0.738211
CPU times: user 1.14 s, sys: 240 ms, total: 1.38 s
Wall time: 16.5 s

PCA didn't seem to give much of an improvement. I won't use it going forward.

Fine tuning hyperparameters¶

Newton-Cholesky¶
In [61]:
%%time
clf = LogisticRegression(max_iter=1000,solver='newton-cholesky',class_weight='balanced',C=0.1)
study_nc = LRStudy(clf,X_train,y_train,feat_dict['lr_small'])
study_nc.build_pipeline()
params = {'C':ss.loguniform(0.02,10)}
study_nc.randomized_search(params,refit=True)
  mean cv score (roc_auc)
LR_clf__C  
0.052738 0.738320
0.052730 0.738319
0.028694 0.738043
0.205070 0.737860
0.825641 0.737017
0.838342 0.737009
1.629659 0.736703
1.890861 0.736649
4.353248 0.736486
7.361722 0.736436
CPU times: user 888 ms, sys: 97.4 ms, total: 986 ms
Wall time: 5.09 s
SAGA with Elastic-Net¶

The SAGA solver is also compatible with the Elastic-Net penalty, which uses a mixture of L1 and L2 regularization. The mix is controlled by the value of the parameter l1_ratio:

  • l1_ratio = 0 is equivalent to penalty = 'l2'
  • l1_ratio = 1 is equivalent to penalty = 'l1'
  • intermediate values give a mixture
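
For reference, and up to scikit-learn's conventions (with targets $y_i \in \{-1,1\}$), the objective minimized by elastic-net logistic regression is approximately $$\min_{w,c}\; \frac{1-\rho}{2}\,\|w\|_2^2 + \rho\,\|w\|_1 + C\sum_{i=1}^n \log\left(1+\exp\left(-y_i(x_i^\top w + c)\right)\right),$$ where $\rho$ denotes l1_ratio; setting $\rho=0$ or $\rho=1$ recovers pure L2 or pure L1 regularization, respectively.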

For good measure, I'll try tuning hyperparameters on an Elastic-Net model. I tune both C and l1_ratio.

In [93]:
%%time
clf = LogisticRegression(max_iter=1000,solver='saga',penalty='elasticnet',class_weight='balanced')
study_saga_en =LRStudy(clf,X_train,y_train,feat_dict['lr_small'])
study_saga_en.build_pipeline()

params = {'C':ss.loguniform(0.02,10),
         'l1_ratio':ss.uniform(0,1)}

study_saga_en.randomized_search(params,n_iter=50,refit=True)
    mean cv score (roc_auc)
LR_clf__C LR_clf__l1_ratio  
0.061911 0.183405 0.738193
0.205070 0.950714 0.738190
0.052738 0.155995 0.738182
0.122891 0.366362 0.738181
0.114621 0.542696 0.738178
0.138154 0.325183 0.738176
0.132486 0.524756 0.738159
0.138781 0.520068 0.738148
0.132792 0.097672 0.738131
0.156356 0.063558 0.738040
CPU times: user 3.06 s, sys: 435 ms, total: 3.49 s
Wall time: 1min 5s

Performance of the best Elastic-Net classifier is about the same as the best one using Newton-Cholesky solver, and Newton-Cholesky is much faster. Overall, the CV scores don't vary much across hyperparameter choices and there are apparently many good options. I'll proceed with:

  • No principal component analysis
  • Newton-Cholesky solver (L2-regularization only)
  • C value of approximately 0.0527, the best value found in the randomized search above

$F_{\beta}$ scores - balancing precision and recall to select a classification threshold¶

I've used the area under the ROC curve as a metric to help select hyperparameters, but I haven't yet considered what classification threshold to use. I will consider the $F_{\beta}$ score.

Recall that the $F_{\beta}$ score is a generalization of the $F_1$ score which allows for either precision or recall to be prioritized more strongly over the other. One should view $\beta$ as the ratio of the perceived importance of recall to importance of precision.

In the task of predicting which cyclists will suffer serious injury or fatality in a crash based on various factors, it seems that recall should be more crucial than precision. A false positive corresponds to a cyclist who ended up with lesser injury than predicted, which is "good news"; a false negative corresponds to a cyclist suffering an adverse outcome which the model failed to predict.

The logistic regression model predicts the probability that a particular sample has class 1 (i.e. suffers serious injury or death), and then the model's class prediction is determined by a threshold one sets when calibrating the model. A typical threshold is 0.5, but it can be adjusted in order to optimize the desired $F_{\beta}$ score.
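
The threshold search itself amounts to a simple sweep over candidate thresholds on a validation set, roughly as sketched below (y_val and y_val_prob are placeholder arrays; in this notebook the find_best_threshold method plays this role).

# Illustrative threshold sweep maximizing the F_beta score on a validation set.
import numpy as np
from sklearn.metrics import fbeta_score

def best_threshold(y_val, y_val_prob, beta=3, grid=np.linspace(0.01, 0.99, 99)):
    """Return the threshold in `grid` that maximizes the F_beta score, along with that score."""
    scores = [fbeta_score(y_val, (y_val_prob >= t).astype(int), beta=beta) for t in grid]
    return grid[int(np.argmax(scores))], max(scores)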

  • I'll fit our best logistic regression classifier again and take a look at various $F_{\beta}$ scores based on values of the threshold and values of $\beta$.
  • Since selecting the threshold counts as tuning a hyperparameter, I cannot use the holdout test set to select it. I'll perform a train/validation split and compute $F_{\beta}$ values on the predictions for the validation set.
In [28]:
lr_params = {'C': 0.05273751067944609}
In [90]:
clf = LogisticRegression(max_iter=1000,solver='newton-cholesky',
                         class_weight='balanced',**lr_params)
study_lr =ClassifierStudy(clf,X_train,y_train,feat_dict['lr_small'])
study_lr.build_pipeline()
study_lr.fit_pipeline(split_first=True)
study_lr.predict_proba_pipeline()
In [91]:
study_lr.find_best_threshold(beta=1)
Threshold optimizing F_1 score:   0.662141377258788
Best F_1 score:   0.29865361077111385
                                      precision    recall  f1-score   support

neither seriously injured nor killed       0.95      0.90      0.93      3989
         seriously injured or killed       0.24      0.39      0.30       312

                            accuracy                           0.87      4301
                           macro avg       0.60      0.65      0.61      4301
                        weighted avg       0.90      0.87      0.88      4301

The largest $F_1$ score here is 0.3, and notice how low the recall is for the positive class. This only correctly predicts 122 out of 312 cyclists with serious injury or death. I'll now consider the $F_4$ score, which prioritizes recall four times as heavily as precision on the positive class.

In [97]:
study_lr.find_best_threshold(beta=4)
Threshold optimizing F_4 score:   0.2774721271485444
Best F_4 score:   0.5933574879227053
                                      precision    recall  f1-score   support

neither seriously injured nor killed       0.98      0.25      0.40      3989
         seriously injured or killed       0.09      0.93      0.16       312

                            accuracy                           0.30      4301
                           macro avg       0.53      0.59      0.28      4301
                        weighted avg       0.91      0.30      0.38      4301

This has swung too far the other way. Now there are nearly 3000 false positives! I'll consider $F_3$.

In [98]:
study_lr.find_best_threshold(beta=3)
Threshold optimizing F_3 score:   0.42397093830815596
Best F_3 score:   0.49366816913500755
                                      precision    recall  f1-score   support

neither seriously injured nor killed       0.97      0.59      0.74      3989
         seriously injured or killed       0.12      0.74      0.21       312

                            accuracy                           0.60      4301
                           macro avg       0.55      0.67      0.47      4301
                        weighted avg       0.91      0.60      0.70      4301

This strikes a good balance - $F_3$ prioritizes recall three times as heavily as precision on the positive class, and the model indeed correctly classifies 74% of cyclists with serious injury or death and 59% of those without.

This result is likely quite dependent on the particular validation set used in this train/validation split. I'll take the mean of the best thresholds over a series of trials with differing random seeds in order to produce a more robust threshold choice.

In [118]:
%%time
n_trials = 100
thresh_list = []
for i in range(n_trials):
    clf = LogisticRegression(max_iter=1000,solver='newton-cholesky',
                         class_weight='balanced',**lr_params)
    study_lr =ClassifierStudy(clf,X_train,y_train,feat_dict['lr_small'],random_state=42*i)
    study_lr.build_pipeline()
    study_lr.fit_pipeline(split_first=True)
    study_lr.predict_proba_pipeline()
    study_lr.find_best_threshold(beta=3,conf=False,report=False,print_result=False)
    thresh_list.append(study_lr.best_thresh)
lr_thresh = np.mean(thresh_list)
print(f'The average best threshold for F_3 over {n_trials} trials is {lr_thresh}')
The average best threshold for F_3 over 100 trials is 0.3943127118198778
CPU times: user 52.1 s, sys: 1.22 s, total: 53.4 s
Wall time: 36.7 s
In [29]:
lr_thresh = 0.3943127118198778

Gradient boosted tree classifiers¶

I will compare the logistic regression results to a gradient-boosted decision tree model.

  • GradientBoostingClassifier is the original gradient boosted decision tree classifier model in scikit-learn.
  • HistGradientBoostingClassifier is a histogram-based gradient boosting classification tree model inspired by the LightGBM classifier. Note that HistGradientBoostingClassifier can be significantly faster than GradientBoostingClassifier on big datasets (i.e. n_samples >= 10,000).
  • We'll also test out the LightGBM classifier to see how its performance and speed compares to the others.

All three of these models allow the use of early stopping, in which the algorithm sets aside a validation set at the start of training and checks model performance on that validation set after each iteration. If the validation score fails to improve over n_iter_no_change consecutive iterations, training is halted. In general, letting a decision tree based model train for too many iterations can promote overfitting to the training data, and early stopping can be a valuable measure against this.

  • For both GBC and HGBC, early stopping is automated - the validation set is split off in the background right before training. By default, these models set aside 10% of the training set to use as a validation set for early stopping - I'll stick with that setting. However, there is not a straightforward way to retrieve the number of iterations used before early stopping triggered.
  • For GBC, early stopping is enabled if a value of n_iter_no_change is passed, and disabled if n_iter_no_change = None.
  • For HGBC, early stopping is automatically used as long as the number of samples is at least 10,000. It can be disabled by setting early_stopping=False.
  • For LGBMC, early stopping requires the user to manually set aside an evaluation set prior to fitting the model (see the illustration below). This is somewhat complicated when used in conjunction with cross-validation - it is implemented in the cv_score method and is activated as long as we set a value of early_stopping_round when initializing the model. The number of iterations used before early stopping triggered is easy to access, and we print the min, max, and mean of this number across folds.
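
For example, manual early stopping with the LightGBM scikit-learn API looks roughly like the following sketch on toy data (the cv_score method does the equivalent per cross-validation fold).

# Illustrative example of manual early stopping with the LightGBM sklearn API on toy data.
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_toy, y_toy = make_classification(n_samples=5000, weights=[0.93, 0.07], random_state=0)
X_tr, X_val, y_tr, y_val = train_test_split(X_toy, y_toy, stratify=y_toy,
                                            test_size=0.1, random_state=0)

lgbm = LGBMClassifier(n_estimators=2000, early_stopping_round=150,
                      random_state=0, verbosity=-1)
# the evaluation set must be supplied manually for early stopping to be active
lgbm.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], eval_metric='auc')
print(lgbm.best_iteration_)  # number of iterations used before early stopping triggered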

One downside of early stopping is that because the validation set is set aside, the model is trained on slightly less data. Let's first see some baseline scores using early stopping.

Baseline CV scores for GBC, HGBC, and LGBMC¶

I'll compute some cross-validation scores for GradientBoostingClassifier, HistGradientBoostingClassifier, and LGBMClassifier. I'll use both feature sets to see how they fare.

In [14]:
import time
clf_list = []
clf_list.append(
    ('GBC',GradientBoostingClassifier(
        n_estimators = 2000, n_iter_no_change=50,random_state=42
    ))
)
clf_list.append(
    ('HGBC',HistGradientBoostingClassifier(
        early_stopping=True,max_iter=2000,n_iter_no_change=50,random_state=42
    ))
)
clf_list.append(('LGBM',LGBMClassifier(
    n_estimators = 300, random_state=42, verbosity=-1, early_stopping_round=150
    ))
)

cv_scores = {}
for feature_set in ['all','lr_small']:
    cv_scores[feature_set] = []
    for clf in clf_list:
        start = time.time()
        study = ClassifierStudy(clf[1],X_train,y_train,feat_dict[feature_set])
        study.build_pipeline()
        score = study.cv_score(print_mean_score = False, return_mean_score = True)
        cv_scores[feature_set].append(score)
        end = time.time()
        print(f"The time elapsed for {clf[0]} with feature set '{feature_set}' was {end-start:.2f} seconds")

pd.DataFrame(cv_scores,index = [clf[0] for clf in clf_list])\
    .rename(columns={'all':'all features',
                    'lr_small':'small feature set'})
The time elapsed for GBC with feature set 'all' was 65.44 seconds
The time elapsed for HGBC with feature set 'all' was 21.21 seconds
Number of iterations used due to early stopping: 
Min:10.0...Max:76.0...Mean:40.1
The time elapsed for LGBM with feature set 'all' was 19.25 seconds
The time elapsed for GBC with feature set 'lr_small' was 48.61 seconds
The time elapsed for HGBC with feature set 'lr_small' was 18.08 seconds
Number of iterations used due to early stopping: 
Min:17.0...Max:81.0...Mean:38.1
The time elapsed for LGBM with feature set 'lr_small' was 18.68 seconds
Out[14]:
all features small feature set
GBC 0.737861 0.739843
HGBC 0.734066 0.732187
LGBM 0.733911 0.734358

Let's do another baseline trial in which we don't use early stopping. For GBC and HGBC we'll use the default number of iterations (100) and for LGBMC we'll use 50 - in the ballpark suggested by the early stopping results of the previous trial.

In [16]:
import time
clf_list = []
clf_list.append(
    ('GBC',GradientBoostingClassifier(
        random_state=42
    ))
)
clf_list.append(
    ('HGBC',HistGradientBoostingClassifier(
        early_stopping=False,random_state=42
    ))
)
clf_list.append(('LGBM',LGBMClassifier(
    n_estimators = 50, random_state=42, verbosity=-1
    ))
)

cv_scores = {}
for feature_set in ['all','lr_small']:
    cv_scores[feature_set] = []
    for clf in clf_list:
        start = time.time()
        study = ClassifierStudy(clf[1],X_train,y_train,feat_dict[feature_set])
        study.build_pipeline()
        score = study.cv_score(print_mean_score = False, return_mean_score = True)
        cv_scores[feature_set].append(score)
        end = time.time()
        print(f"The time elapsed for {clf[0]} with feature set '{feature_set}' was {end-start:.2f} seconds")

pd.DataFrame(cv_scores,index = [clf[0] for clf in clf_list])\
    .rename(columns={'all':'all features',
                    'lr_small':'small feature set'})
The time elapsed for GBC with feature set 'all' was 45.66 seconds
The time elapsed for HGBC with feature set 'all' was 20.06 seconds
The time elapsed for LGBM with feature set 'all' was 8.00 seconds
The time elapsed for GBC with feature set 'lr_small' was 38.27 seconds
The time elapsed for HGBC with feature set 'lr_small' was 20.42 seconds
The time elapsed for LGBM with feature set 'lr_small' was 8.07 seconds
Out[16]:
all features small feature set
GBC 0.739526 0.740716
HGBC 0.733804 0.733977
LGBM 0.739802 0.740515

I notice a few things:

  • These scores are comparable to what I achieved with logistic regression models prior to tuning hyperparameters.
  • HGBC is indeed faster than GBC, but LGBMC is much faster than either of the others.
  • HGBC performs about the same with or without early stopping, while GBC and LGBMC perform better with early stopping disabled - possibly due to having access to the additional training samples.

We'll proceed to tune hyperparameters for all three models. For LGBMC, we won't use early stopping; instead we'll tune the n_estimators parameter - since this model trains so quickly, we can afford additional iterations of our randomized search. For GBC and HGBC, we'll stick with early stopping so we don't have the additional parameter to tune.

Tuning hyperparameters for gradient boosted decision tree models¶

LGBM with full feature set¶
In [18]:
%%time
clf = LGBMClassifier(random_state=42,verbosity=-1)
study_lgb = ClassifierStudy(clf,X_train,y_train,feat_dict['all'])
study_lgb.build_pipeline()
params = {'learning_rate':ss.loguniform(0.01,0.5),
          'n_estimators':ss.randint(20,200),
          'max_depth':list(range(2,10))+[None],
          'reg_alpha':ss.loguniform(1,12),
          'reg_lambda':ss.loguniform(1,12),
          'min_child_samples':ss.randint(20,200),
          }
study_lgb.randomized_search(params, n_iter=200, refit=True)
            mean cv score (roc_auc)
clf__learning_rate clf__max_depth clf__min_child_samples clf__n_estimators clf__reg_alpha clf__reg_lambda  
0.054095 4.000000 154 198 2.298391 1.413513 0.746978
0.361816 2.000000 93 76 3.029152 1.584134 0.746602
0.196691 2.000000 26 193 5.707852 1.198187 0.746079
0.091231 5.000000 152 139 1.058877 5.045954 0.745935
0.092152 3.000000 63 181 2.669554 10.000757 0.745788
0.096562 4.000000 130 170 1.374626 1.458581 0.745520
0.087464 7.000000 112 95 4.081847 1.483229 0.745419
0.082637 4.000000 100 128 1.468653 4.706400 0.745365
0.187748 3.000000 57 134 1.826710 1.952371 0.745265
0.358340 2.000000 48 82 1.993273 1.727925 0.745239
CPU times: user 10.6 s, sys: 1.51 s, total: 12.1 s
Wall time: 3min 54s
LGBM with small feature set¶
In [19]:
%%time
clf = LGBMClassifier(random_state=42,verbosity=-1)
study_lgb_small = ClassifierStudy(clf,X_train,y_train,feat_dict['lr_small'])
study_lgb_small.build_pipeline()
params = {'learning_rate':ss.loguniform(0.05,0.3),
          'n_estimators':ss.randint(50,200),
          'max_depth':list(range(2,4))+[None],
          'reg_alpha':ss.loguniform(1,5),
          'reg_lambda':ss.loguniform(1,5),
          'min_child_samples':ss.randint(20,100),
          }
study_lgb_small.randomized_search(params, n_iter=200, refit=True)
            mean cv score (roc_auc)
clf__learning_rate clf__max_depth clf__min_child_samples clf__n_estimators clf__reg_alpha clf__reg_lambda  
0.124370 3.000000 42 192 3.373709 1.445184 0.748165
0.179153 2.000000 75 130 1.536285 4.815313 0.746769
0.068086 3.000000 70 196 3.753246 1.742619 0.746754
0.196413 2.000000 93 169 4.883624 2.134325 0.746580
0.234433 2.000000 48 181 2.094448 3.877900 0.746553
0.087403 3.000000 73 155 2.493478 1.051688 0.746528
0.216218 2.000000 79 162 1.475868 1.161652 0.746509
0.190305 3.000000 97 166 3.242309 4.527639 0.746481
0.154016 2.000000 27 193 1.085140 4.165995 0.746453
0.156615 2.000000 31 192 3.051467 1.086586 0.746408
CPU times: user 8.32 s, sys: 1.07 s, total: 9.39 s
Wall time: 3min 41s
HGB with full feature set¶
In [28]:
%%time
clf = HistGradientBoostingClassifier(early_stopping=True,max_iter=2000,
                                     n_iter_no_change=50,random_state=42)
study_hgb = ClassifierStudy(clf,X_train,y_train,feat_dict['all'])
study_hgb.build_pipeline()
params = {'learning_rate':ss.loguniform(0.005,0.2),
          'max_depth':list(range(2,10))+[None],
          'l2_regularization':ss.loguniform(1,12),
          'min_samples_leaf':ss.randint(20,200),
          }
study_hgb.randomized_search(params, n_iter=50, refit=True)
        mean cv score (roc_auc)
clf__l2_regularization clf__learning_rate clf__max_depth clf__min_samples_leaf  
11.515739 0.021773 2.000000 143 0.743552
2.423873 0.141829 2.000000 140 0.742604
1.869471 0.031303 2.000000 120 0.742459
5.360337 0.075306 3.000000 170 0.742266
5.422195 0.026345 3.000000 151 0.742035
5.000058 0.005010 nan 22 0.741564
3.890447 0.009888 3.000000 63 0.741419
9.714206 0.115022 3.000000 52 0.741244
2.247857 0.011267 4.000000 166 0.740723
3.588824 0.044469 4.000000 70 0.740503
CPU times: user 46.2 s, sys: 14.6 s, total: 1min
Wall time: 6min 12s
HGB with small feature set¶
In [29]:
%%time
clf = HistGradientBoostingClassifier(early_stopping=True,max_iter=2000,
                                     n_iter_no_change=50,random_state=42)
study_hgb_small = ClassifierStudy(clf,X_train,y_train,feat_dict['lr_small'])
study_hgb_small.build_pipeline()
params = {'learning_rate':ss.loguniform(0.005,0.2),
          'max_depth':list(range(2,10))+[None],
          'l2_regularization':ss.loguniform(1,12),
          'min_samples_leaf':ss.randint(20,200),
          }
study_hgb_small.randomized_search(params, n_iter=50, refit=True)
        mean cv score (roc_auc)
clf__l2_regularization clf__learning_rate clf__max_depth clf__min_samples_leaf  
2.423873 0.141829 2.000000 140 0.743990
5.422195 0.026345 3.000000 151 0.743437
11.515739 0.021773 2.000000 143 0.743403
1.869471 0.031303 2.000000 120 0.742660
5.360337 0.075306 3.000000 170 0.742111
5.000058 0.005010 nan 22 0.742039
3.890447 0.009888 3.000000 63 0.741894
2.062002 0.047774 4.000000 127 0.741084
2.247857 0.011267 4.000000 166 0.741053
1.171092 0.015746 9.000000 56 0.741041
CPU times: user 7.59 s, sys: 1.67 s, total: 9.26 s
Wall time: 4min 31s
GBC with full feature set¶
In [30]:
%%time
clf = GradientBoostingClassifier(n_estimators = 2000, n_iter_no_change=50,random_state=42)
study_gb = ClassifierStudy(clf,X_train,y_train,feat_dict['all'])
study_gb.build_pipeline()
params = {'learning_rate':ss.loguniform(0.005,0.2),
          'max_depth':list(range(2,10))+[None],
          'subsample':ss.uniform(0.5,1),
          'min_samples_leaf':ss.randint(20,200),
          }
study_gb.randomized_search(params, n_iter = 50, refit=True)
        mean cv score (roc_auc)
clf__learning_rate clf__max_depth clf__min_samples_leaf clf__subsample  
0.031303 2.000000 120 0.784840 0.747300
0.011267 4.000000 166 0.848666 0.745177
0.006046 6.000000 148 0.739562 0.744191
0.036629 5.000000 32 0.576980 0.743771
0.009835 7.000000 108 0.791229 0.743310
0.065118 4.000000 167 0.825959 0.742898
0.007780 9.000000 78 0.823203 0.741206
0.006738 9.000000 142 0.742160 0.741153
0.016542 8.000000 111 0.856298 0.741101
0.019907 9.000000 40 0.656019 0.740766
CPU times: user 13.2 s, sys: 757 ms, total: 14 s
Wall time: 10min 32s
GBC with small feature set¶
In [31]:
%%time
clf = GradientBoostingClassifier(n_estimators = 2000, n_iter_no_change=50,random_state=42)
study_gb_small = ClassifierStudy(clf,X_train,y_train,feat_dict['lr_small'])
study_gb_small.build_pipeline()
params = {'learning_rate':ss.loguniform(0.005,0.2),
          'max_depth':list(range(2,10))+[None],
          'subsample':ss.uniform(0.5,1),
          'min_samples_leaf':ss.randint(20,200),
          }
study_gb_small.randomized_search(params, n_iter = 50, refit=True)
        mean cv score (roc_auc)
clf__learning_rate clf__max_depth clf__min_samples_leaf clf__subsample  
0.031303 2.000000 120 0.784840 0.747868
0.011267 4.000000 166 0.848666 0.745586
0.006046 6.000000 148 0.739562 0.744704
0.065118 4.000000 167 0.825959 0.744684
0.036629 5.000000 32 0.576980 0.743293
0.009835 7.000000 108 0.791229 0.743057
0.006738 9.000000 142 0.742160 0.742037
0.016542 8.000000 111 0.856298 0.741965
0.007780 9.000000 78 0.823203 0.741028
0.019907 9.000000 40 0.656019 0.740210
CPU times: user 10.8 s, sys: 429 ms, total: 11.3 s
Wall time: 7min 11s

Here are the results of the hyperparameter tuning, in case you want to proceed without re-running the randomized search.

In [22]:
best_params = {'all': {'gb': {'learning_rate': 0.03130343103935811,
   'max_depth': 2,
   'min_samples_leaf': 120,
   'subsample': 0.7848404943774676},
  'hgb': {'l2_regularization': 11.515738930806396,
   'learning_rate': 0.021772619713840338,
   'max_depth': 2,
   'min_samples_leaf': 143},
  'lgb': {'learning_rate': 0.05409453242074271,
  'max_depth': 4,
 'min_child_samples': 154,
 'n_estimators': 198,
 'reg_alpha': 2.2983907150108998,
 'reg_lambda': 1.413513006153221}},
 'lr_small': {'gb': {'learning_rate': 0.03130343103935811,
   'max_depth': 2,
   'min_samples_leaf': 120,
   'subsample': 0.7848404943774676},
  'hgb': {'l2_regularization': 2.4238734679222236,
   'learning_rate': 0.14182851952262965,
   'max_depth': 2,
   'min_samples_leaf': 140},
  'lgb': {'learning_rate': 0.12436979645979512,
 'max_depth': 3,
 'min_child_samples': 42,
 'n_estimators': 192,
 'reg_alpha': 3.373708711638402,
 'reg_lambda': 1.4451837004705437}}}
In [36]:
best_params = {}
best_params['all'] = {
    'gb':study_gb.best_params,
    'hgb':study_hgb.best_params,
    'lgb':study_lgb.best_params
}
best_params['lr_small'] = {
    'gb':study_gb_small.best_params,
    'hgb':study_hgb_small.best_params,
    'lgb':study_lgb_small.best_params 
}
best_params
Out[36]:
{'all': {'gb': {'learning_rate': 0.03130343103935811,
   'max_depth': 2,
   'min_samples_leaf': 120,
   'subsample': 0.7848404943774676},
  'hgb': {'l2_regularization': 11.515738930806396,
   'learning_rate': 0.021772619713840338,
   'max_depth': 2,
   'min_samples_leaf': 143},
  'lgb': {'learning_rate': 0.19669065259669294,
   'max_depth': 2,
   'min_child_samples': 26,
   'n_estimators': 193,
   'reg_alpha': 5.707852486890183,
   'reg_lambda': 1.1981866373949588}},
 'lr_small': {'gb': {'learning_rate': 0.03130343103935811,
   'max_depth': 2,
   'min_samples_leaf': 120,
   'subsample': 0.7848404943774676},
  'hgb': {'l2_regularization': 2.4238734679222236,
   'learning_rate': 0.14182851952262965,
   'max_depth': 2,
   'min_samples_leaf': 140},
  'lgb': {'learning_rate': 0.12656361513845585,
   'max_depth': 3,
   'min_child_samples': 36,
   'n_estimators': 189,
   'reg_alpha': 3.070386836762345,
   'reg_lambda': 3.097528444873408}}}
In [23]:
tuned_scores = {}
tuned_scores['all'] = [
    study_gb.best_score,
    study_hgb.best_score,
    study_lgb.best_score
]
tuned_scores['lr_small'] = [
    study_gb_small.best_score,
    study_hgb_small.best_score,
    study_lgb_small.best_score 
]

tuned_scores = pd.DataFrame(tuned_scores,index = ['GBC','HGBC','LGBMC'])\
    .rename(columns={'all':'all features',
                    'lr_small':'small feature set'})

tuned_scores
Out[23]:
all features small feature set
GBC 0.747300 0.747868
HGBC 0.743552 0.743990
LGBMC 0.746978 0.748165
  • LGBMC was dramatically faster than either of the other two, even with four times as many iterations of the randomized search (200 vs. 50).
  • Hyperparameter tuning produced slightly higher CV scores with the small feature set, for all three model types.
  • The best score was achieved by LGBMC when tuned with the small feature set. We'll choose that as our tree-based model going forward.

$F_{\beta}$ and the classification threshold¶

Just as I did with the logistic regression model, I'll calculate the best threshold choices over a series of train/validation split trials, and then select the mean of those thresholds.

In [38]:
lgb_params = best_params['lr_small']['lgb']
In [25]:
%%time
n_trials = 100
thresh_list = []
for i in range(n_trials):
    clf = LGBMClassifier(random_state=42*i,**lgb_params,verbosity=-1)
    study_lgb =ClassifierStudy(clf,X_train,y_train,feat_dict['lr_small'],random_state=42*i)
    study_lgb.build_pipeline()
    study_lgb.fit_pipeline(split_first=True)
    study_lgb.predict_proba_pipeline()
    study_lgb.find_best_threshold(beta=3,conf=False,report=False,print_result=False)
    thresh_list.append(study_lgb.best_thresh)
lgb_thresh = np.mean(thresh_list)
print(f'The average best threshold for F_3 over {n_trials} trials is {lgb_thresh}')
The average best threshold for F_3 over 100 trials is 0.051350362708522494
CPU times: user 55.7 s, sys: 11.3 s, total: 1min 6s
Wall time: 52.1 s

What explains this small threshold? Recall that in the logistic regression models I used the parameter class_weight='balanced', a measure that can help those models significantly in the presence of class imbalance. Given that the training set is fairly large and the class imbalance is not particularly extreme (roughly a 93%-7% split, as opposed to a 99%-1% split or worse), tree and forest models should still be able to learn the minority class relatively well. However, they will still be 'less confident' at predicting the minority class, i.e. the predicted probabilities will be scaled down and require smaller thresholds for class distinction.

In order to address this, we can use the scale_pos_weight hyperparameter in LGBMClassifier. It's recommended that in the binary classification case, scale_pos_weight be set equal to the ratio of the negative class count to the positive class count.

In [26]:
%%time
scale_pos_weight = (len(y_train)-sum(y_train))/sum(y_train)
n_trials = 100
thresh_list = []
for i in range(n_trials):
    clf = LGBMClassifier(random_state=42*i,scale_pos_weight = scale_pos_weight,**lgb_params,verbosity=-1)
    study_lgb_bal =ClassifierStudy(clf,X_train,y_train,feat_dict['lr_small'],random_state=42*i)
    study_lgb_bal.build_pipeline()
    study_lgb_bal.fit_pipeline(split_first=True)
    study_lgb_bal.predict_proba_pipeline()
    study_lgb_bal.find_best_threshold(beta=3,conf=False,report=False,print_result=False)
    thresh_list.append(study_lgb_bal.best_thresh)
lgb_bal_thresh = np.mean(thresh_list)
print(f'The average best threshold for F_3 over {n_trials} trials is {lgb_bal_thresh}')
The average best threshold for F_3 over 100 trials is 0.3612220261352673
CPU times: user 56.1 s, sys: 11 s, total: 1min 7s
Wall time: 52 s
In [27]:
lgb_thresh = 0.051350362708522494
lgb_bal_thresh = 0.3612220261352673

Selecting the BikeSaferPA model¶

I'll choose between the best logistic regression classifier and the best balanced and unbalanced LightGBM classifiers based on an array of their cross-validation scores under various metrics.
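For context, the mean CV scores below come from the custom ClassifierStudy.cv_score method; for a threshold-free metric such as ROC AUC, the equivalent computation in plain scikit-learn would look roughly like the sketch below, where pipe is a placeholder for the full preprocessing-plus-classifier pipeline.

from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

# Repeated stratified CV matching the caption of the score table below
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

# Hypothetical usage with the placeholder pipeline `pipe`:
# mean_auc = cross_val_score(pipe, X_train, y_train, scoring='roc_auc', cv=cv).mean()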

In [30]:
def cv_scores_dict(clf,thresh):
    """
    Builds a pipeline ending with the input classifier clf
    and computes a mean CV score using a variety of metrics
    Parameters:
    -----------
    clf : sklearn compatible binary classifier
    thresh : float
        the classification threshold to use when
        computing y_pred from y_pred_proba
    Returns:
    -------
    scores_dict : dict
        keys are scoring metric keywords and values are
        the corresponding mean cross-validation scores
    """
    study = ClassifierStudy(clf,X_train,y_train,feat_dict['lr_small'])
    study.build_pipeline()
    scores_dict = {}
    for scoring in ['roc_auc','fb','f1w','acc']:
        scores_dict[scoring] = [study.cv_score(scoring=scoring,beta=3,thresh=thresh,
                                              return_mean_score=True, print_mean_score=False)]
    return scores_dict
In [31]:
%%time
clf = LogisticRegression(max_iter=1000,solver='newton-cholesky',
                         class_weight='balanced',**lr_params)
lr_cv = cv_scores_dict(clf,lr_thresh)
CPU times: user 3min 32s, sys: 16.7 s, total: 3min 48s
Wall time: 31.8 s
In [32]:
%%time
clf = LGBMClassifier(random_state=42,**lgb_params,verbosity=-1)
lgb_cv = cv_scores_dict(clf,lgb_thresh)
CPU times: user 34 s, sys: 6.52 s, total: 40.5 s
Wall time: 30.7 s
In [33]:
%%time
clf = LGBMClassifier(random_state=42,scale_pos_weight=scale_pos_weight,**lgb_params,verbosity=-1)
lgb_bal_cv = cv_scores_dict(clf,lgb_bal_thresh)
CPU times: user 33.4 s, sys: 6.36 s, total: 39.8 s
Wall time: 30.7 s
In [34]:
# Define table styles
styles = [dict(selector="caption",
                props=[("text-align", "center"),
                        ("font-size", "100%"),
                        ("color", 'black'),
                        ("text-decoration","underline"),
                        ("font-weight","bold")])]
    
cv_scores = pd.concat([pd.DataFrame(lr_cv,index=['Optimal LogisticRegression model']),
                       pd.DataFrame(lgb_cv,index=['Optimal LGBMClassifier model']),
                      pd.DataFrame(lgb_bal_cv,index=['Optimal LGBMClassifier model, balanced'])],axis=0)\
            .rename(columns = {'roc_auc':'ROC AUC score','fb': 'F_beta score (beta=3)',
                               'f1w':'weighted average F_1 score','acc':'accuracy score'})
cv_scores.style.set_caption('Comparison of model CV scores (n_splits=5, n_repeats = 3)')\
                .set_table_styles(styles)
Out[34]:
Comparison of model CV scores (n_splits=5, n_repeats = 3)

Model                                     ROC AUC score   F_beta score (beta=3)   weighted average F_1 score   accuracy score
Optimal LogisticRegression model          0.737163        0.500303                0.650726                     0.549175
Optimal LGBMClassifier model              0.746698        0.502195                0.688262                     0.592870
Optimal LGBMClassifier model, balanced    0.737929        0.500185                0.660113                     0.559854

The gradient boosted tree models perform better than the logistic regression model, and the version without class weighting performs noticeably better than the class-weighted version.

I select the LGBM model without class weighting as the BikeSaferPA model. However, I will also evaluate the performance of the logistic regression model on the test set in the next section, for illustrative purposes.

Evaluating model performance on the holdout set¶

I compute the AUC score for each of the best classifiers on the holdout test set, and display the confusion matrix and classification report that I obtain with the chosen thresholds.

In [69]:
%%time
def test_scores_dict(clf,thresh):
    """
    Builds a pipeline ending with the input classifier clf
    and scores it on the test set using a variety of metrics
    Parameters:
    -----------
    clf : sklearn compatible binary classifier
    thresh : float
        the classification threshold to use when
        computing y_pred from y_pred_proba
    Returns:
    -------
    (study,scores_dict) : tuple
        study : ClassifierStudy instance
            with pipeline built and fitted and predictions made
        scores_dict: dict
            keys are scoring metric keywords and values are
            corresponding model scores on test set    
    """
    study = ClassifierStudy(clf,X_train,y_train,feat_dict['lr_small'])
    study.build_pipeline()
    study.fit_pipeline()
    study.predict_proba_pipeline(X_test)
    scores_dict = {}
    for scoring in ['roc_auc','fb','f1w','acc']:
        scores_dict[scoring] = [study.score_pipeline(y_test,scoring=scoring,beta=3,
                                                     thresh=thresh,print_score=False)]
    return (study,scores_dict)

clf = LogisticRegression(max_iter=1000,solver='newton-cholesky',
                         class_weight='balanced',**lr_params)
study_lr,lr_test = test_scores_dict(clf,lr_thresh)

clf = LGBMClassifier(random_state=42,**lgb_params,verbosity=-1)
study_lgb,lgb_test = test_scores_dict(clf,lgb_thresh)

test_scores = pd.concat([pd.DataFrame(lr_test,index=['Optimal LogisticRegression model']),
                       pd.DataFrame(lgb_test,index=['Optimal LGBMClassifier model'])],axis=0)\
            .rename(columns = {'roc_auc':'ROC AUC score','fb': 'F_beta score (beta=3)',
                               'f1w':'weighted average F_1 score','acc':'accuracy score'})
test_scores.style.set_caption('Comparison of model scores on holdout test set')\
                .set_table_styles(styles)
CPU times: user 2.56 s, sys: 2.51 s, total: 5.06 s
Wall time: 1.24 s
Out[69]:
Comparison of model scores on holdout test set

Model                                 ROC AUC score   F_beta score (beta=3)   weighted average F_1 score   accuracy score
Optimal LogisticRegression model      0.713936        0.475655                0.635683                     0.531523
Optimal LGBMClassifier model          0.718382        0.483844                0.682742                     0.585829
In [70]:
fig, axs = plt.subplots(1,2,figsize=(8,4))
RocCurveDisplay.from_predictions(y_test,study_lr.y_predict_proba,
                                 name='LR',ax=axs[0])
axs[0].set_title('ROC Curve for LR model',fontsize='medium')
RocCurveDisplay.from_predictions(y_test,study_lgb.y_predict_proba,
                                 name='BikeSaferPA',ax=axs[1])
axs[1].set_title('ROC Curve for BikeSaferPA',fontsize='medium')
for ax in axs:
    ax.tick_params(axis='x', labelsize='x-small')
    ax.tick_params(axis='y', labelsize='x-small')
    ax.set_ylabel('True positive rate',fontsize='small')
    ax.set_xlabel('False positive rate',fontsize='small')
plt.tight_layout()
plt.show()

A classifier with an AUC score of 0.71-0.72 is generally considered "acceptable".

In [71]:
fig, axs = plt.subplots(1,2,figsize=(6,3))
ConfusionMatrixDisplay.from_predictions(y_test,(study_lr.y_predict_proba >= lr_thresh).astype(int),
                                        ax=axs[0],colorbar=False)
axs[0].set_title('Confusion Matrix for LR model',fontsize='small')
ConfusionMatrixDisplay.from_predictions(y_test,(study_lgb.y_predict_proba >= lgb_thresh).astype(int),
                                       ax=axs[1],colorbar=False)
axs[1].set_title('Confusion Matrix for BikeSaferPA',fontsize='small')
for ax in axs:
    ax.tick_params(axis='x', labelsize='x-small')
    ax.tick_params(axis='y', labelsize='x-small')
    ax.set_ylabel('True label',fontsize='small')
    ax.set_xlabel('Predicted label',fontsize='small')
plt.tight_layout()
plt.show()

Finally, the classification reports for each model:

In [46]:
study_lr.score_pipeline(y_test,scoring='classif_report',thresh=lr_thresh)
                                      precision    recall  f1-score   support

neither seriously injured nor killed       0.97      0.51      0.67      4988
         seriously injured or killed       0.11      0.76      0.19       389

                            accuracy                           0.53      5377
                           macro avg       0.54      0.64      0.43      5377
                        weighted avg       0.90      0.53      0.64      5377

In [126]:
study_lgb.score_pipeline(y_test,scoring='classif_report',thresh=lgb_thresh)
                                      precision    recall  f1-score   support

neither seriously injured nor killed       0.97      0.56      0.71      4988
         seriously injured or killed       0.12      0.75      0.20       389

                            accuracy                           0.58      5377
                           macro avg       0.54      0.66      0.46      5377
                        weighted avg       0.90      0.58      0.67      5377

Explaining BikeSaferPA's predictions using SHAP values¶

Machine learning models can be difficult to interpret, and one excellent interpretation method comes in the form of SHAP (SHapley Additive exPlanation) values. These values attribute to each feature, for each sample, the change in the expected model prediction when conditioning on that feature, i.e. when adding that feature to the model. Note that when features are not independent or the model is non-linear, the order in which features are added affects the changes in expectation they generate. The SHAP value of a feature is therefore an average of its impact on the expected prediction, taken over all possible orders in which the features can be added.
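As a point of reference, here is a minimal sketch of how SHAP values for a tree ensemble can be computed directly with the shap library; the names clf_fitted and X_test_enc (the encoded test-set features) are placeholders, and the ClassifierStudy.shap_values method used below presumably wraps something similar.

import shap

# Placeholders: clf_fitted is a fitted LGBMClassifier and X_test_enc is the
# encoded test-set feature matrix (a DataFrame with readable column names)
explainer = shap.TreeExplainer(clf_fitted)
shap_vals = explainer(X_test_enc)   # a shap.Explanation object

# shap_vals.values has shape (n_samples, n_features): each entry is that feature's
# contribution to that sample's prediction, relative to the expected model output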

I have built methods into ClassifierStudy to calculate and plot SHAP values. I'll compute and examine SHAP values for BikeSaferPA, the best LGBMClassifier model. Since spline feature importance would be difficult to interpret, I'll use a version of the model that doesn't use DAY_OF_WEEK or HOUR_OF_DAY, by setting cyc_method=None in the build_pipeline method.

Warning: due to its use of deprecated types, e.g. 'np.bool', the version of the shap library used here is not compatible with the newest versions of the numpy library. You may need to roll back to numpy version 1.23.0 for the following section to run properly.

In [128]:
!pip install numpy==1.23.0
Requirement already satisfied: numpy==1.23.0 in /Users/eamonn/mambaforge/envs/ds/lib/python3.10/site-packages (1.23.0)

SHAP plots¶

Some remarks about the plots below:

  • The right hand plot is a 'violin' plot which shows the accumulations of SHAP values of all samples, for each feature, color-coded by the value of that feature. The vertical width at a particular SHAP value reflects the number of samples at or around that value.
  • The left hand plot is a bar plot which shows the mean absolute SHAP value (the mean of the absolute values of the SHAP values, taken over all samples in the test set) for each feature. I have clipped off features with particularly low mean absolute SHAP values.
  • When reading the violin plot, one should pay attention to the apparent relationship between feature value (i.e. color) and SHAP value - is the SHAP value increasing or decreasing as the color becomes more red, and how fast?
  • One should be cautious when using the bar plots. A feature which is very important for few samples and not very important for the rest will have a small mean absolute SHAP value.

Note that we use cyc_method=None, i.e. we leave the features DAY_OF_WEEK and HOUR_OF_DAY out of this analysis entirely, as it would be difficult to interpret the importance of the various periodic spline features associated with them.
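For reference, plots of this kind can also be produced directly with shap's plotting functions; assuming shap_vals is the shap.Explanation for the encoded test features X_test_enc, the ClassifierStudy.shap_plot call below presumably does something along these lines.

import shap

# Bar plot: mean |SHAP value| per feature, showing the 20 most important features
shap.summary_plot(shap_vals.values, X_test_enc, plot_type='bar', max_display=20)

# Violin plot: distribution of per-sample SHAP values per feature, colored by feature value
shap.summary_plot(shap_vals.values, X_test_enc, plot_type='violin', max_display=20)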

Mean absolute SHAP values and distributions of SHAP values across samples¶
In [73]:
%%time
clf = LGBMClassifier(random_state=42,**lgb_params,verbosity=-1)
study_lgb = ClassifierStudy(clf,X_train,y_train,features=feat_dict['lr_small'])
study_lgb.build_pipeline(cyc_method=None,num_ss=False)
study_lgb.shap_values(X_test)
study_lgb.shap_plot(max_display=20)
CPU times: user 8.55 s, sys: 171 ms, total: 8.73 s
Wall time: 7.38 s

Discussion of feature importance from SHAP values¶

Numerical features¶

Based on the scatter plots below, I can conclude the following regarding BikeSaferPA's predicted probability that a cyclist suffers serious injury or fatality:

  • AGE tends to push it downward for cyclists under 30, and upward for older cyclists - and the strength of the upward push increases with age.
  • SPEED_LIMIT tends to push it downward when the speed limit is below 15mph or so, and upward otherwise - and the strength of the upward push increases with speed limit.
  • CRASH_YEAR tends to push it upward for years in 2002-2003 and downward for years between 2004-2015. The push since 2016 has been upward and increasing almost every year.
In [11]:
fig, axs = plt.subplots(1,3,figsize=(12,3))
for i,feat in enumerate(['AGE','SPEED_LIMIT','CRASH_YEAR']):
    shap.plots.scatter(study_lgb.shap_vals[:,feat],show=False,ax=axs[i],dot_size=4)
    axs[i].set_xlabel(feat, fontsize='x-small')
    axs[i].set_ylabel(f'SHAP value for {feat}',fontsize='x-small')
plt.tight_layout()
plt.show()
Ordinal features¶

For all of the ordinal features of the form (vehicle type)_COUNT, the presence of such a vehicle pushes BikeSaferPA's predicted probability upward, and its absence pushes it downward. Additionally, in all cases there is much more variance in the push strength when the vehicle type is present, which indicates interactions with other features in those situations.

Binary features¶

When the following factors are present, their features push the model towards predicting serious injury or fatality:

  • Strong push: SPEEDING_RELATED, HILL, ALCOHOL_RELATED, DRUG_RELATED
  • Moderate push: CURVED_ROAD, YOUNG_DRIVER, MATURE_DRIVER, DRINKING_DRIVER
  • Weak push: NON_INTERSECTION, LANE_DEPARTURE

When these factors are present, their features push the model away from predicting serious injury or fatality:

  • Strong push: NO_CLEARANCE
  • Moderate push: AGGRESSIVE_DRIVING and FEMALE

Some features have very high variances in their SHAP values when that factor is present, e.g. HILL, CURVED_ROAD, ALCOHOL_RELATED, SPEEDING_RELATED, COMM_VEH_COUNT, indicating interactions.

One-hot features from categorical features¶

For one-hot features arising from categorical features:

  • VEH_ROLE: pushes the model moderately towards predicting serious injury or fatality when the cyclist is both striking and struck, and moderately away when only striking.
  • COLLISION_TYPE: pushes the model heavily towards predicting serious injury or fatality when it's head-on, moderately towards when it's rear-end, and moderately away when it's sideswipe in either direction. There's a neutral effect when the collision is angled, which is by far the most common type.
  • IMPACT_SIDE: pushes the model moderately towards predicting serious injury or fatality when it's front left, weakly towards when it's left or right, and weakly away when it's front, rear, or front right. Other impact sides have weaker or neutral effects on average.
  • URBAN_RURAL: pushes the model moderately towards predicting serious injury or fatality when it's rural (with very high variance), and weakly away when it's urban. Urbanized has essentially no effect.
  • ILLUMINATION: pushes the model moderately away from predicting serious injury or fatality when daylight, and moderately toward when dark_unlit. Other effects are weak or neutral, but note that there is high variability when dark_lit, suggesting interactions with other features.
  • RESTRAINT_HELMET: pushes the model strongly away from predicting serious injury or fatality when it is unknown, and towards it to a highly varying degree when it is bicycle_helmet. The other categories have a neutral effect, including no_restraint, which covers the vast majority of samples.
  • All TCD_TYPE features have essentially zero importance.
Additional comments:¶
  • It seems surprising that the presence of AGGRESSIVE_DRIVING pushes the model away from predicting serious injury or fatality. However, there are several factors to consider:
    • NO_CLEARANCE perhaps has a negative effect on severity because one or more of the colliding drivers would be traveling relatively slowly in such a scenario - unfortunately, I don't have access to complete travel speed data in order to test this. However, a negative effect from NO_CLEARANCE will partially drive the negative effect from AGGRESSIVE_DRIVING.
    • Although SPEEDING_RELATED pushes the model strongly towards predicting serious injury or fatality, others such as TAILGATING, RUNNING_STOP_SIGN, and RUNNING_RED_LT have very weak effects. There are likely additional aggressive driving behaviors that don't have flags in this dataset.
    • Even when AGGRESSIVE_DRIVING==1, I don't know which driver is exhibiting the behavior - and probably aggressive driving on the part of a cyclist would differ in effect from that on the part of a motorist.
  • The model seems to confirm the sentiment that a bicycle helmet won't provide much protection in an impact with a motor vehicle - helmets aren't necessarily designed to reduce the chance of serious injury or death in impacts with cars. Helmets tend to offer better protection at lower crash speeds, and I've already seen that a lower speed limit pushes the prediction away from serious injury or fatality. Two additional interaction phenomena that can further complicate the effect of bicycle helmets in collisions can be seen in the plots that follow:
    • Cyclists with helmets are much more likely to be over 30, in the range in which age pushes up the predicted probability.
    • Cyclists with helmets are more likely to ride in speed limit zones above 30mph than those not wearing helmets, i.e. conditioning on RESTRAINT_HELMET=='bicycle_helmet' increases a cyclist's expected speed limit zone - and in such zones, the speed limit pushes up the predicted probability.
In [14]:
fig, axs = plt.subplots(1,2,figsize=(12,4))
for i,feat in enumerate(['AGE','SPEED_LIMIT']):
    shap.plots.scatter(study_lgb.shap_vals[:,feat],color=study_lgb.shap_vals[:,'RESTRAINT_HELMET_bicycle_helmet'],show=False,ax=axs[i],dot_size=4,alpha=0.5)
    axs[i].set_xlabel(feat, fontsize='x-small')
    axs[i].set_ylabel(f'SHAP value for {feat}',fontsize='x-small')
plt.tight_layout()
plt.show()

Summary of model design process and results¶

I developed two final candidates for BikeSaferPA, a classifier model which predicts whether a cyclist suffered serious injury or fatality: a logistic regression model and a gradient boosted decision tree model. Model selection relied on the ROC-AUC score, and then prediction thresholds were chosen by analyzing $F_{\beta}$ scores.

Feature selection¶

I first selected features based on their log-odds coefficient values in a fitted LogisticRegression model (with purely L1 regularization, to promote sparsity of the coefficients). Specifically, I omitted some features which had zero or very small coefficients, suggesting they're not very important to the model's expressive power. It's crucial here that I applied standard scaling to the numerical features ahead of time - if features are at vastly different scales, their log-odds coefficients can provide misleading information. This reduced set of features was used to construct both logistic regression and gradient boosted tree models.
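A minimal sketch of this screening step, assuming X_train_enc is the already encoded and standard-scaled training matrix (a DataFrame) and using an illustrative cutoff for 'very small' coefficients:

import pandas as pd
from sklearn.linear_model import LogisticRegression

# Pure L1 penalty to encourage sparse log-odds coefficients
lasso_lr = LogisticRegression(penalty='l1', solver='liblinear',
                              class_weight='balanced', max_iter=1000)

# Hypothetical usage, with X_train_enc the encoded and scaled training features:
# lasso_lr.fit(X_train_enc, y_train)
# coefs = pd.Series(lasso_lr.coef_[0], index=X_train_enc.columns)
# keep = coefs[coefs.abs() > 0.01].index.tolist()   # 0.01 is an illustrative cutoff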

Feature encoding¶

I encoded features in several ways:

  • One-hot encoding was applied to categorical features
  • Standardizing was applied to numerical features (i.e. rescaled to mean 0 and variance 1)
  • After considering several options for the cyclical features DAY_OF_WEEK and HOUR_OF_DAY, I ultimately chose periodic spline encoding based on its performance and efficiency
  • Binary features, as well as ordinal features of the form (vehicle type)_COUNT, were left unchanged

This encoding was automated via a custom model pipeline.
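The pipeline itself is assembled by ClassifierStudy.build_pipeline; a simplified sketch of the kind of preprocessing it builds might look like the following, where the feature lists are illustrative and the binary and count features pass through unchanged.

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, SplineTransformer
from lightgbm import LGBMClassifier

# Illustrative feature lists -- the real ones come from feat_dict in this notebook
cat_feats = ['COLLISION_TYPE', 'IMPACT_SIDE', 'URBAN_RURAL']
num_feats = ['AGE', 'SPEED_LIMIT', 'CRASH_YEAR']
cyc_feats = ['HOUR_OF_DAY', 'DAY_OF_WEEK']

preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), cat_feats),
        ('scale', StandardScaler(), num_feats),
        # periodic splines wrap around, e.g. hour 23 is treated as close to hour 0
        ('cyclic', SplineTransformer(degree=3, n_knots=8, extrapolation='periodic'), cyc_feats),
    ],
    remainder='passthrough',   # binary and (vehicle type)_COUNT features pass through
)

pipe = Pipeline([('pre', preprocessor),
                 ('clf', LGBMClassifier(random_state=42, verbosity=-1))])
# pipe.fit(X_train, y_train)   # hypothetical usage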

Hyperparameter tuning¶

I examined several logistic regression and gradient boosted decision tree models with a wide variety of hyperparameter settings. After tuning hyperparameters to optimize ROC-AUC score via randomized search with five-fold cross validation, the two best models were:

  • A LogisticRegression model with only L2-regularization and C-value (the inverse of the regularization strength) roughly equal to 0.053.
  • An LGBMClassifier model (chosen for its computational speed) with learning rate 0.09, 162 boosting steps, max tree depth of 5, minimum samples per leaf of 170, and L1 and L2 regularization parameters around 6.54 and 1.77, respectively; all other hyperparameters were left at default values.

The ROC-AUC score is computed based on how the model's predicted probabilities affect the true positive and true negative rates at all possible prediction thresholds, and so optimizing the AUC doesn't on its own provide a choice for the best prediction threshold. I selected prediction thresholds for both models which optimized the $F_3$ score, a variant of the classical $F_1$ score which considers recall (of the positive class) as three times as important as precision.
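For reference, in terms of the precision $P$ and recall $R$ of the positive class,

$$F_{\beta} \;=\; (1+\beta^{2})\,\frac{P\cdot R}{\beta^{2}P + R},$$

so setting $\beta=3$ treats recall as three times as important as precision when scoring a candidate threshold.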

Selecting the BikeSaferPA model¶

Based on its performance with respect to ROC-AUC score, I selected the gradient boosted tree algorithm as the BikeSaferPA model!

The ROC curve, confusion matrix, and classification report for the BikeSaferPA model are shown in the previous section.

When trained on the entire training set and scored on the holdout test set, BikeSaferPA attains a ROC-AUC score of around 0.72. Using the classification threshold I selected in the parameter tuning phase, the model correctly classifies 75% of cyclists in the test set who suffered serious injury or fatality, and correctly classifies 56% of those who didn't. I set the threshold to optimize the $F_3$ score, a variant of the $F_1$ score which views recall as three times as important as precision; adjusting it further could produce variants which are better at identifying cyclists at risk of serious injury or death, but they would also accumulate more false positives. The end-user of BikeSaferPA should adjust its classification threshold to accommodate the needs of their particular use case.

Reflections on BikeSaferPA's performance¶

Despite the large number of features, there are some challenges which limit performance:

  • Although this is a rich set of features, whether a cyclist suffered serious injury or fatality may depend heavily on factors of the collision and its surroundings which are not expressed in the dataset, such as:

    • Particular aspects of the roadway, signaling, signage, or traffic pattern in the area of the crash which may make it more or less difficult to navigate safely for motorists and/or cyclists.
    • Particular aspects of the surroundings, either permanent or specific to the time of the crash, which may affect visibility for or create distractions for motorists and/or cyclists.
    • The fine details of the impact itself - e.g. what part of the cyclist's body was impacted most directly and how the cyclist's body was positioned at the time of the crash.
    • The cyclist's individual health condition and/or information about health problems or disabilities, any of which could have an impact on their likelihood of suffering serious injury or fatality.
  • Many factors of a crash - such as inclement weather, darkness of the surroundings, visibility issues, roadway condition problems, impairment of a driver, vehicle speeding or other aggressive behavior, etc. - can vary significantly by degrees. Certainly the degrees matter in many cases, but they aren't expressed in the data.

  • The dataset has a TRAVEL_SPD column for vehicles, but I decided not to use it because so much data is missing:

    • TRAVEL_SPD is missing for around 94% of bicycle/pedalcycle vehicles.
    • TRAVEL_SPD is also missing for around 50% of motor vehicles involved in crashes with bicycles/pedalcycles.

    I decided to use the combination of SPEEDING_RELATED and SPEED_LIMIT as an imperfect proxy for actual travel speeds of vehicles. Both of these are important features to the models (as I shall see in the next section's SHAP analysis), but I believe the model could learn more from actual travel speeds of all vehicles.

  • There were 536 multi-cyclist collisions containing at least one cyclist who suffered serious injury or fatality and at least one cyclist who didn't. Moreover, there were 373 multi-passenger bicycles/pedalcycles whose passengers didn't all have the same severity outcome! This leads to tuples of samples which have almost duplicated input features but differing target values. Such samples can make it more difficult for a model to train effectively because they essentially add noise to the data. Several ways of addressing this issue which I did not pursue:

    • Replace the target feature with a different target feature which indicates whether at least one cyclist in that collision suffered serious injury or fatality.
    • Group cyclists by unit number, so that each sample corresponds to a vehicle instead of an individual (would need to decide how to aggregate person-level features)
    • Group cyclists by crash identifier number, so that each sample corresponds to a crash event instead of an individual (would need to decide how to aggregate person-level and vehicle-level features)

Interpreting BikeSaferPA based on SHAP values¶

I computed SHAP (SHapley Additive exPlanation) values on the test set. SHAP values provide a principled measure of the importance of features to the model's predictions and help to explain the decisions made by models such as BikeSaferPA.

Plots of the SHAP values for various features in the BikeSaferPA model appear in the previous section; only the 20 features with the largest mean absolute SHAP values are shown.

Note: the following statements should not be interpreted as implying causality; rather, they are statements about how conditioning on certain factors affects the expected prediction of BikeSaferPA!

Based on SHAP values, I can conclude the following about BikeSaferPA's predicted probability that a cyclist suffered serious injury or fatality:

  • An older cyclist's age pushes it up, and the strength of the push increases with age
  • A higher speed limit pushes it up, and the strength of the push increases with speed limit
  • The year being recent pushes it up, and that effect has been growing since 2016
  • A male cyclist's gender pushes it up
  • The following factors push it up strongly:
    • The collision being speeding-related, alcohol-related, or drug-related, or on a hill
    • The presence of a heavy truck or a commercial vehicle
  • The following factors push it up moderately:
    • The collision involving a small truck, SUV, or van
    • The collision involving a drinking driver
    • The collision being on a curved road, in a rural setting, in dark unlit conditions, or at midblock
    • The collision being head on, or the cyclist being both striking and struck in the collision, or the cyclist having impact side front left
  • The following factors push it up a little:
    • The cyclist having impact side right or left, or the collision being rear end
  • The following factors push it down:
    • The collision is a sideswipe, either same or opposite direction - these seem to be by far the least dangerous collision types for cyclists
    • The cyclist having impact side front, rear, or front right
    • The collision involves an aggressive driving behavior, especially a driver proceeding without clearance from a stop
    • The cyclist is not wearing a helmet; this effect is possibly driven in part by their increased likelihood of being younger and riding in lower speed limit zones

Policy recommendations based on BikeSaferPA results¶

Based on the results of BikeSaferPA's SHAP values, I recommend the following actions to be taken in an effort to reduce the incidence of serious cyclist injury and cyclist fatality (as well as cyclist crashes in general) in Pennsylvania:

  1. Increased attention to regulatory and enforcement efforts for motorists and motor vehicles:
    • Increased enforcement of speeding, driving a motor vehicle while impaired, and running red lights.
    • Attention to regulatory issues involving trucks, SUVs, commercial vehicles, and vans, such as:
      • The number of these vehicles sharing the roads with cyclists
      • The speeds at which these vehicles travel
      • Possible safety certifications required to drive such vehicles
      • Required features in such vehicles that may contribute to cyclist safety
  2. Increasing education efforts for private motorists and commercial vehicle drivers involving:
    • The serious risk resulting from several dangerous types of driver behavior:
      • Driving while under the influence of drugs or alcohol
      • Speeding, running stop signs, or running traffic lights
    • Awareness of cyclists and driving practices that help keep cyclists safe, especially when:
      • Driving in low light conditions
      • Driving in areas with higher posted speed limits
      • Driving during high-traffic times, e.g. morning and evening weekday commuting hours
      • Navigating curved roadways and hilly roadways
  3. Investing in infrastructure improvements:
    • Upgrading and/or repairing roadway lighting in areas where cyclists frequent, especially on midblock stretches and areas of roadway with curves, hills, in rural settings, and areas with higher posted speed limits.
    • Adding protected bicycle lanes/routes along roads commonly used by bicyclists, with a focus on:
      • routes with higher posted speed limits and/or where motor vehicle speeding is very prevalent
      • routes with significant use by heavy trucks and/or commercial vehicles
      • routes that are used heavily during the weekday morning and evening commutes
  4. Increasing cyclist education efforts regarding:
    • Safer riding practices around trucks and commercial vehicles
    • Choosing routes with lower posted speed limits when possible
    • Visibility measures for low light riding conditions - reflectors, reflective clothing, headlights, taillights
    • The serious risk resulting from dangerous cyclist behavior:
      • Speeding, running stop signs, or running traffic lights
      • Wrong-way riding, as head-on collisions are the most likely type to result in severe injury or death
  5. Investing in research to investigate reason(s) for the increasing prevalence of serious injury or fatality among cyclists in collisions during the period of 2002-2021. If this trend continues, cyclists in PA will be in much more danger in the future!