Project: Identify the causes of attrition within an ESN

Bouzouita Hayette

Context: an ESN (IT services company specialized in digital transformation and the sale of SaaS applications) facing a rising resignation rate.
HR objective: quantify the situation with data, identify the causes and at-risk segments, and surface actionable levers to address them.

Mission 1, in 2 notebooks:

  • Multi-file EDA (3 datasets) -> compare leavers vs stayers / highlight the key differences / formalize HR insights.
  • Classification model: score each employee's resignation probability, then explain the drivers with SHAP (only once the modeling pipeline has been stabilized).

-> Presentation at the end of the mission

Mission 2 (later):

  • Prepare deployment with PostgreSQL + SQL test/validation queries.
  • Deploy the ML model.

-> Presentation at the end of the mission

Notebook 2: Modeling

Section 1: Data import and preparation

Dataset import

In [2865]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np

from pathlib import Path


pd.set_option("display.max_columns",200)
pd.set_option("display.max_rows",200)
In [2866]:
df = pd.read_csv("../data/processed/df_model_attrition.csv")
In [2867]:
df
Out[2867]:
age genre revenu_mensuel statut_marital departement poste nombre_experiences_precedentes annees_dans_l_entreprise annees_dans_le_poste_actuel satisfaction_employee_environnement niveau_hierarchique_poste satisfaction_employee_nature_travail satisfaction_employee_equipe satisfaction_employee_equilibre_pro_perso note_evaluation_actuelle augmentation_salaire_precedente_pct heures_supplementaires a_quitte_l_entreprise distance_domicile_travail niveau_education domaine_etude frequence_deplacement annees_depuis_la_derniere_promotion annes_sous_responsable_actuel a_suivi_formation annee_experience_avant_entreprise mobilite_interne evolution_note utilisation_pee
0 41 F 5993 Célibataire Commercial Cadre Commercial 8 6 4 2 2 4 1 1 3 11 1 1 1 2 Infra & Cloud Occasionnel 0 5 0 2 1 0 0
1 49 M 5130 Marié(e) Consulting Assistant de Direction 1 10 7 3 2 2 4 3 4 23 0 0 8 1 Infra & Cloud Frequent 1 7 1 0 1 2 1
2 37 M 2090 Célibataire Consulting Consultant 6 0 0 4 1 3 2 3 3 15 1 1 2 2 Autre Occasionnel 0 0 1 7 0 1 0
3 33 F 2909 Marié(e) Consulting Assistant de Direction 1 8 7 4 1 3 3 3 3 11 1 0 3 4 Infra & Cloud Frequent 3 0 1 0 1 0 0
4 27 M 3468 Marié(e) Consulting Consultant 9 2 2 1 1 2 4 3 3 12 0 0 2 1 Transformation Digitale Occasionnel 2 2 1 4 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 36 M 2571 Marié(e) Consulting Consultant 4 5 2 3 2 4 3 3 3 17 0 0 23 2 Transformation Digitale Frequent 0 3 1 12 1 -1 1
1466 39 M 9991 Marié(e) Consulting Manager 4 7 7 4 3 1 1 3 3 15 0 0 6 1 Transformation Digitale Occasionnel 1 7 1 2 0 1 1
1467 27 M 6142 Marié(e) Consulting Tech Lead 1 6 2 2 2 2 2 3 4 20 1 0 4 3 Infra & Cloud Occasionnel 0 3 0 0 1 0 1
1468 49 M 5390 Marié(e) Commercial Cadre Commercial 2 9 6 4 2 2 4 2 3 14 0 0 2 3 Transformation Digitale Frequent 0 8 1 8 1 1 0
1469 34 M 4404 Marié(e) Consulting Consultant 2 4 3 2 2 3 1 4 3 12 0 0 8 3 Transformation Digitale Occasionnel 1 2 1 2 1 -1 0

1470 rows × 29 columns

Splitting the dataset into X/y

In [2868]:
X = df.drop(columns=["a_quitte_l_entreprise"]) 
y = df["a_quitte_l_entreprise"] 
In [2869]:
# define numeric vs categorical columns
numeric_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()
In [2870]:
numeric_features
Out[2870]:
['age',
 'revenu_mensuel',
 'nombre_experiences_precedentes',
 'annees_dans_l_entreprise',
 'annees_dans_le_poste_actuel',
 'satisfaction_employee_environnement',
 'niveau_hierarchique_poste',
 'satisfaction_employee_nature_travail',
 'satisfaction_employee_equipe',
 'satisfaction_employee_equilibre_pro_perso',
 'note_evaluation_actuelle',
 'augmentation_salaire_precedente_pct',
 'heures_supplementaires',
 'distance_domicile_travail',
 'niveau_education',
 'annees_depuis_la_derniere_promotion',
 'annes_sous_responsable_actuel',
 'a_suivi_formation',
 'annee_experience_avant_entreprise',
 'mobilite_interne',
 'evolution_note',
 'utilisation_pee']
In [2871]:
numeric_features = [
    'age',
    'revenu_mensuel',
    'nombre_experiences_precedentes',
    'annees_dans_l_entreprise',
    'annees_dans_le_poste_actuel',
    'satisfaction_employee_environnement',
    'niveau_hierarchique_poste',
    'satisfaction_employee_nature_travail',
    'satisfaction_employee_equipe',
    'satisfaction_employee_equilibre_pro_perso',
    'note_evaluation_actuelle',
    'augmentation_salaire_precedente_pct',
    'distance_domicile_travail',
    'niveau_education',
    'annees_depuis_la_derniere_promotion',
    'annes_sous_responsable_actuel',
    'annee_experience_avant_entreprise',
    'evolution_note'
]

binary_features = [
    'utilisation_pee',
    'mobilite_interne',
    'a_suivi_formation',
    'heures_supplementaires'
]

categorical_nominal_features=[
    'genre',
    'statut_marital',
    'departement',
    'poste',
    'domaine_etude'
]

categorical_ordinal_text_features=[
    'frequence_deplacement'
]
In [2872]:
df[numeric_features]
Out[2872]:
age revenu_mensuel nombre_experiences_precedentes annees_dans_l_entreprise annees_dans_le_poste_actuel satisfaction_employee_environnement niveau_hierarchique_poste satisfaction_employee_nature_travail satisfaction_employee_equipe satisfaction_employee_equilibre_pro_perso note_evaluation_actuelle augmentation_salaire_precedente_pct distance_domicile_travail niveau_education annees_depuis_la_derniere_promotion annes_sous_responsable_actuel annee_experience_avant_entreprise evolution_note
0 41 5993 8 6 4 2 2 4 1 1 3 11 1 2 0 5 2 0
1 49 5130 1 10 7 3 2 2 4 3 4 23 8 1 1 7 0 2
2 37 2090 6 0 0 4 1 3 2 3 3 15 2 2 0 0 7 1
3 33 2909 1 8 7 4 1 3 3 3 3 11 3 4 3 0 0 0
4 27 3468 9 2 2 1 1 2 4 3 3 12 2 1 2 2 4 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 36 2571 4 5 2 3 2 4 3 3 3 17 23 2 0 3 12 -1
1466 39 9991 4 7 7 4 3 1 1 3 3 15 6 1 1 7 2 1
1467 27 6142 1 6 2 2 2 2 2 3 4 20 4 3 0 3 0 0
1468 49 5390 2 9 6 4 2 2 4 2 3 14 2 3 0 8 8 1
1469 34 4404 2 4 3 2 2 3 1 4 3 12 8 3 1 2 2 -1

1470 rows × 18 columns

Pearson correlation matrix

In [2873]:
corr_pearson = X[numeric_features].corr(method="pearson")
In [2874]:
plt.figure(figsize=(12,10))

sns.heatmap(
    corr_pearson,
    annot=False,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8}
)

plt.title("Matrice de corrélation - Pearson")
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.show()
[Figure: Pearson correlation heatmap of the numeric features]

Dropping the niveau_hierarchique_poste variable: its very strong correlation (0.95) with monthly income indicates informational redundancy.

In [2875]:
X = X.drop(columns=["niveau_hierarchique_poste"],errors="ignore")
numeric_features.remove("niveau_hierarchique_poste")
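The 0.95 pair above was spotted on the heatmap; the same check can be automated. A small helper (not part of the notebook; the 0.9 threshold and the toy data are illustrative) that lists all column pairs whose absolute Pearson correlation exceeds a threshold:

```python
import numpy as np
import pandas as pd

def strong_pairs(df: pd.DataFrame, threshold: float = 0.9):
    """Return (col_a, col_b, |r|) for pairs with |Pearson r| above `threshold`."""
    corr = df.corr(method="pearson").abs()
    # keep only the upper triangle to skip the diagonal and duplicate pairs
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    return [
        (a, b, round(float(upper.loc[a, b]), 2))
        for a in upper.index
        for b in upper.columns
        if pd.notna(upper.loc[a, b]) and upper.loc[a, b] > threshold
    ]

# toy example: y = 2x is perfectly collinear with x, z is not
toy = pd.DataFrame({"x": [1, 2, 3, 4], "y": [2, 4, 6, 8], "z": [4, 1, 3, 2]})
print(strong_pairs(toy, threshold=0.9))  # [('x', 'y', 1.0)]
```

Applied to `X[numeric_features]`, this would flag the revenu_mensuel / niveau_hierarchique_poste pair without eyeballing the heatmap.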

Preprocessing

In [2876]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
In [2877]:
df["frequence_deplacement"].value_counts()
Out[2877]:
frequence_deplacement
Occasionnel    1043
Frequent        277
Aucun           150
Name: count, dtype: int64
In [2878]:
# define the ordinal encoder (business-defined order)
ordinal_encoder = OrdinalEncoder(
    categories=[["Aucun", "Occasionnel", "Frequent"]],
    handle_unknown="use_encoded_value",
    unknown_value=-1
)
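The explicit `categories` list pins the business order (Aucun < Occasionnel < Frequent) instead of the alphabetical default, and unseen labels fall back to -1. A minimal standalone check (the "Inconnu" label is an invented example):

```python
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder(
    categories=[["Aucun", "Occasionnel", "Frequent"]],
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
enc.fit(np.array([["Aucun"], ["Occasionnel"], ["Frequent"]]))

# business order preserved: Aucun=0, Occasionnel=1, Frequent=2; unseen labels get -1
print(enc.transform(np.array([["Frequent"], ["Aucun"], ["Inconnu"]])).ravel())
# → [ 2.  0. -1.]
```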
In [2879]:
# --- Preprocessor for the linear model (scaling helps) ---
preprocessor_scaled = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("bin", "passthrough", binary_features),
        ("cat_nom", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_nominal_features),
        ("cat_ord", ordinal_encoder, categorical_ordinal_text_features),
    ],
    remainder="drop"
)

# --- Preprocessor for tree-based models (no scaling needed) ---
preprocessor_noscale = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("bin", "passthrough", binary_features),
        ("cat_nom", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_nominal_features),
        ("cat_ord", ordinal_encoder, categorical_ordinal_text_features),
    ],
    remainder="drop"
)

Stratified train/test split

In [2880]:
from sklearn.model_selection import train_test_split
In [2881]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y 
)
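`stratify=y` keeps the attrition rate (nearly) identical in the train and test folds, which matters with a minority class of roughly 16%. A quick sanity check on synthetic data (the 16% rate and sample size are illustrative, not the notebook's data):

```python
import numpy as np
from sklearn.model_selection import train_test_split

# synthetic imbalanced target: ~16% positives
rng = np.random.default_rng(0)
y_demo = (rng.random(1000) < 0.16).astype(int)
X_demo = rng.normal(size=(1000, 3))

_, _, y_tr, y_te = train_test_split(
    X_demo, y_demo, test_size=0.2, random_state=42, stratify=y_demo
)
# positive rates in train and test differ only by rounding
print(round(y_tr.mean(), 3), round(y_te.mean(), 3))
```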

One pipeline per model

In [2882]:
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
  • Dummy (baseline)
In [2883]:
pipeline_dummy = Pipeline([
    ("model", DummyClassifier(
        strategy="most_frequent", # never predicts a departure
        random_state=42))
])
  • Linear (Logistic Regression) with scaling
In [2884]:
pipeline_logreg = Pipeline([
    ("preprocessing", preprocessor_scaled),
    ("model", LogisticRegression(
        max_iter=2000, 
        class_weight="balanced", 
        random_state=42))
])
  • Non-linear (Random Forest) without scaling
In [2885]:
pipeline_rf = Pipeline([
    ("preprocessing", preprocessor_noscale),
    ("model", RandomForestClassifier(
        n_estimators=500,
        random_state=42,
        class_weight="balanced"
    ))
])
  • Non-linear (XGBoost) without scaling
In [2886]:
n_neg = np.sum(y_train == 0)
n_pos = np.sum(y_train == 1)

ratio = n_neg / n_pos

print(ratio)
5.189473684210526
In [2887]:
pipeline_xgb = Pipeline([
    ("preprocessing", preprocessor_noscale),
    ("model", XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=ratio,  
        random_state=42,
        eval_metric="logloss"
    ))
])
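Setting `scale_pos_weight` to n_neg/n_pos up-weights each positive example so that both classes carry the same total weight in the loss. The arithmetic can be checked on a toy imbalance (83/16 is an invented split, chosen to land near the 5.19 ratio computed above):

```python
import numpy as np

y_demo = np.array([0] * 83 + [1] * 16)
ratio = (y_demo == 0).sum() / (y_demo == 1).sum()

# each positive counts `ratio` times, so total positive weight equals total negative weight
pos_total = ratio * (y_demo == 1).sum()
neg_total = 1.0 * (y_demo == 0).sum()
print(ratio, pos_total == neg_total)  # → 5.1875 True
```

This is the same rebalancing idea as `class_weight="balanced"` in the LogisticRegression and RandomForest pipelines above.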

Section 2: Modeling

In [2888]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, RocCurveDisplay, fbeta_score 

Cross-validation

In [2889]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

beta = 2 # >1 => favors recall

scoring = {
    "precision": make_scorer(precision_score, zero_division=0),
    "recall": make_scorer(recall_score, zero_division=0),
    "f1": make_scorer(f1_score, zero_division=0),
    "fbeta": make_scorer(fbeta_score, beta=beta, zero_division=0),
    "roc_auc": "roc_auc"
}

def cv_scores(model):
    scores = cross_validate(model, X_train, y_train, cv=cv, scoring=scoring)
    results = {}
    for m in scoring.keys():
        results[m] = scores[f"test_{m}"].mean() # mean across folds: overall performance
        results[m + "_std"] = scores[f"test_{m}"].std() # std across folds: stability
    return results

print("Dummy:", cv_scores(pipeline_dummy))
print("LogReg:", cv_scores(pipeline_logreg))
print("RF:", cv_scores(pipeline_rf))
print("XGBoost:", cv_scores(pipeline_xgb))
Dummy: {'precision': np.float64(0.0), 'precision_std': np.float64(0.0), 'recall': np.float64(0.0), 'recall_std': np.float64(0.0), 'f1': np.float64(0.0), 'f1_std': np.float64(0.0), 'fbeta': np.float64(0.0), 'fbeta_std': np.float64(0.0), 'roc_auc': np.float64(0.5), 'roc_auc_std': np.float64(0.0)}
LogReg: {'precision': np.float64(0.3930591859320055), 'precision_std': np.float64(0.023370161205112826), 'recall': np.float64(0.7526315789473684), 'recall_std': np.float64(0.04275809686650506), 'f1': np.float64(0.5150246492484409), 'f1_std': np.float64(0.012761473228715145), 'fbeta': np.float64(0.6345657554502264), 'fbeta_std': np.float64(0.015834743504363192), 'roc_auc': np.float64(0.8404279233694141), 'roc_auc_std': np.float64(0.030354764435728203)}
RF: {'precision': np.float64(0.8778571428571429), 'precision_std': np.float64(0.1119493325626575), 'recall': np.float64(0.16842105263157894), 'recall_std': np.float64(0.05907880084379907), 'f1': np.float64(0.27531940478350997), 'f1_std': np.float64(0.07646872191816814), 'fbeta': np.float64(0.19913485373552114), 'fbeta_std': np.float64(0.06545565106876665), 'roc_auc': np.float64(0.8183284892742547), 'roc_auc_std': np.float64(0.027841056704140967)}
XGBoost: {'precision': np.float64(0.662748538011696), 'precision_std': np.float64(0.06334624047337861), 'recall': np.float64(0.4789473684210527), 'recall_std': np.float64(0.07330730672202168), 'f1': np.float64(0.5509507189724976), 'f1_std': np.float64(0.056282096253104166), 'fbeta': np.float64(0.5045434653282551), 'fbeta_std': np.float64(0.06628179728626574), 'roc_auc': np.float64(0.8223266595962295), 'roc_auc_std': np.float64(0.03417148219178669)}

The objective is to score each employee's resignation probability.
Logistic regression offers the best trade-off between overall performance (ROC-AUC) and the ability to detect departures (recall).
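With beta=2, the F-beta score weights recall twice as heavily as precision, which is why the high-recall logistic regression comes out ahead of the high-precision random forest on this metric. A minimal illustration with invented predictions (not the notebook's results):

```python
from sklearn.metrics import f1_score, fbeta_score

y_true      = [1, 1, 1, 1, 0, 0, 0, 0]
y_hi_recall = [1, 1, 1, 1, 1, 1, 0, 0]  # recall 1.0, precision 4/6
y_hi_prec   = [1, 1, 0, 0, 0, 0, 0, 0]  # precision 1.0, recall 0.5

# F2 rewards the high-recall predictor much more than F1 does
print(f1_score(y_true, y_hi_recall), fbeta_score(y_true, y_hi_recall, beta=2))  # 0.8, ~0.909
print(f1_score(y_true, y_hi_prec), fbeta_score(y_true, y_hi_prec, beta=2))      # ~0.667, ~0.556
```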

Logistic regression

  • Fit on the training set
In [2890]:
pipeline_logreg.fit(X_train, y_train)
Out[2890]:
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['age', 'revenu_mensuel',
                                                   'nombre_experiences_precedentes',
                                                   'annees_dans_l_entreprise',
                                                   'annees_dans_le_poste_actuel',
                                                   'satisfaction_employee_environnement',
                                                   'satisfaction_employee_nature_travail',
                                                   'satisfaction_employee_equipe',
                                                   'satisfaction_employee_equilibre_pro_pe...
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore'),
                                                  ['genre', 'statut_marital',
                                                   'departement', 'poste',
                                                   'domaine_etude']),
                                                 ('cat_ord',
                                                  OrdinalEncoder(categories=[['Aucun',
                                                                              'Occasionnel',
                                                                              'Frequent']],
                                                                 handle_unknown='use_encoded_value',
                                                                 unknown_value=-1),
                                                  ['frequence_deplacement'])])),
                ('model',
                 LogisticRegression(class_weight='balanced', max_iter=2000,
                                    random_state=42))])
The intercept becomes
``intercept_scaling * synthetic_feature_weight``.

.. note::
The synthetic feature weight is subject to L1 or L2
regularization as all other features.
To lessen the effect of regularization on synthetic feature weight
(and therefore on the intercept) `intercept_scaling` has to be increased.
1
class_weight class_weight: dict or 'balanced', default=None

Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one.

The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``.

Note that these weights will be multiplied with sample_weight (passed
through the fit method) if sample_weight is specified.

.. versionadded:: 0.17
*class_weight='balanced'*
'balanced'
random_state random_state: int, RandomState instance, default=None

Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the
data. See :term:`Glossary ` for details.
42
solver solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs'

Algorithm to use in the optimization problem. Default is 'lbfgs'.
To choose a solver, you might want to consider the following aspects:

- 'lbfgs' is a good default solver because it works reasonably well for a wide
class of problems.
- For :term:`multiclass` problems (`n_classes >= 3`), all solvers except
'liblinear' minimize the full multinomial loss, 'liblinear' will raise an
error.
- 'newton-cholesky' is a good choice for
`n_samples` >> `n_features * n_classes`, especially with one-hot encoded
categorical features with rare categories. Be aware that the memory usage
of this solver has a quadratic dependency on `n_features * n_classes`
because it explicitly computes the full Hessian matrix.
- For small datasets, 'liblinear' is a good choice, whereas 'sag'
and 'saga' are faster for large ones;
- 'liblinear' can only handle binary classification by default. To apply a
one-versus-rest scheme for the multiclass setting one can wrap it with the
:class:`~sklearn.multiclass.OneVsRestClassifier`.

.. warning::
The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`
for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for
Elastic-Net) and on (multinomial) multiclass support:

================= ======================== ======================
solver l1_ratio multinomial multiclass
================= ======================== ======================
'lbfgs' l1_ratio=0 yes
'liblinear' l1_ratio=1 or l1_ratio=0 no
'newton-cg' l1_ratio=0 yes
'newton-cholesky' l1_ratio=0 yes
'sag' l1_ratio=0 yes
'saga' 0<=l1_ratio<=1 yes
================= ======================== ======================

.. note::
'sag' and 'saga' fast convergence is only guaranteed on features
with approximately the same scale. You can preprocess the data with
a scaler from :mod:`sklearn.preprocessing`.

.. seealso::
Refer to the :ref:`User Guide ` for more
information regarding :class:`LogisticRegression` and more specifically the
:ref:`Table `
summarizing solver/penalty supports.

.. versionadded:: 0.17
Stochastic Average Gradient (SAG) descent solver. Multinomial support in
version 0.18.
.. versionadded:: 0.19
SAGA solver.
.. versionchanged:: 0.22
The default solver changed from 'liblinear' to 'lbfgs' in 0.22.
.. versionadded:: 1.2
newton-cholesky solver. Multinomial support in version 1.6.
'lbfgs'
max_iter max_iter: int, default=100

Maximum number of iterations taken for the solvers to converge.
2000
verbose verbose: int, default=0

For the liblinear and lbfgs solvers set verbose to any positive
number for verbosity.
0
warm_start warm_start: bool, default=False

When set to True, reuse the solution of the previous call to fit as
initialization, otherwise, just erase the previous solution.
Useless for liblinear solver. See :term:`the Glossary `.

.. versionadded:: 0.17
*warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers.
False
n_jobs n_jobs: int, default=None

Does not have any effect.

.. deprecated:: 1.8
`n_jobs` is deprecated in version 1.8 and will be removed in 1.10.
None
  • Prediction on the test set
In [2891]:
y_pred = pipeline_logreg.predict(X_test)
y_proba = pipeline_logreg.predict_proba(X_test)[:, 1]  # probability of class 1 (leaving)
  • Metrics on the test set
In [2892]:
from sklearn.metrics import fbeta_score  # ensure fbeta_score is available alongside the other metrics

precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
fbeta = fbeta_score(y_test, y_pred, beta=2, zero_division=0)  # beta=2: recall weighted higher than precision
roc_auc = roc_auc_score(y_test, y_proba)

print("=== Métriques sur TEST ===")
print(f"Precision : {precision:.3f}")
print(f"Recall    : {recall:.3f}")
print(f"F1-score  : {f1:.3f}")
print(f"Fbeta_score : {fbeta:.3f}")
print(f"ROC-AUC   : {roc_auc:.3f}")
=== Métriques sur TEST ===
Precision : 0.364
Recall    : 0.681
F1-score  : 0.474
Fbeta_score : 0.580
ROC-AUC   : 0.830
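As a sanity check on the metrics above: with the printed precision (0.364) and recall (0.681), the F-beta formula with beta=2 reproduces the printed `Fbeta_score` of 0.580, and beta=1 gives back the F1 of 0.474. A minimal sketch of the formula (the helper name `f_beta` is ours, for illustration):

```python
def f_beta(precision: float, recall: float, beta: float) -> float:
    """F-beta score: beta > 1 weights recall higher than precision."""
    if precision == 0 and recall == 0:
        return 0.0
    b2 = beta ** 2
    return (1 + b2) * precision * recall / (b2 * precision + recall)

p, r = 0.364, 0.681                 # test-set values printed above
print(f"{f_beta(p, r, 1):.3f}")     # 0.474 — the printed F1-score
print(f"{f_beta(p, r, 2):.3f}")     # 0.580 — the printed Fbeta_score
```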
  • Confusion matrix
In [2893]:
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Resté (0)", "Parti (1)"])
disp.plot(values_format="d")
plt.title("Matrice de confusion - Logistic Regression")
plt.show()
  • Classification report
In [2894]:
print("\n=== Rapport de classification (TEST) ===")
print(classification_report(y_test, y_pred, target_names=["Resté (0)", "Parti (1)"], zero_division=0))
=== Rapport de classification (TEST) ===
              precision    recall  f1-score   support

   Resté (0)       0.93      0.77      0.84       247
   Parti (1)       0.36      0.68      0.47        47

    accuracy                           0.76       294
   macro avg       0.65      0.73      0.66       294
weighted avg       0.84      0.76      0.78       294

  • ROC curve
In [2895]:
RocCurveDisplay.from_predictions(y_test, y_proba)
plt.title("Courbe ROC - Logistic Regression")
plt.show()

Logistic regression reaches a ROC-AUC of 0.83 on the test set, confirming the stability observed in cross-validation. The model detects 68% of departures. Class imbalance was handled with class_weight="balanced".
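As a reminder of what class_weight="balanced" does: scikit-learn assigns each class the weight n_samples / (n_classes * n_c), so errors on the minority class cost more in the loss. A quick sketch of that formula using the test-split class counts shown in the classification report (247 stayed, 47 left); the same rule applies on the training split:

```python
import numpy as np

# Weight of each class under class_weight="balanced":
# w_c = n_samples / (n_classes * count(c))
y = np.array([0] * 247 + [1] * 47)   # 0 = stayed, 1 = left (test-set support)
weights = len(y) / (2 * np.bincount(y))
print(weights)  # ~[0.595, 3.128]: one departure weighs about 5x one stay
```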

Overfitting?¶

In [2896]:
# ===== TRAIN =====
proba_train = pipeline_logreg.predict_proba(X_train)[:, 1]
y_pred_train = (proba_train >= 0.5).astype(int)

print("=== LOGREG TRAIN ===")
print("Precision:", precision_score(y_train, y_pred_train))
print("Recall   :", recall_score(y_train, y_pred_train))
print("F1-score :", f1_score(y_train, y_pred_train))
print("ROC-AUC  :", roc_auc_score(y_train, proba_train))

# ===== TEST =====
proba_test = pipeline_logreg.predict_proba(X_test)[:, 1]
y_pred_test = (proba_test >= 0.5).astype(int)

print("\n=== LOGREG TEST ===")
print("Precision:", precision_score(y_test, y_pred_test))
print("Recall   :", recall_score(y_test, y_pred_test))
print("F1-score :", f1_score(y_test, y_pred_test))
print("ROC-AUC  :", roc_auc_score(y_test, proba_test))
=== LOGREG TRAIN ===
Precision: 0.42005420054200543
Recall   : 0.8157894736842105
F1-score : 0.554561717352415
ROC-AUC  : 0.8780559410697129

=== LOGREG TEST ===
Precision: 0.36363636363636365
Recall   : 0.6808510638297872
F1-score : 0.4740740740740741
ROC-AUC  : 0.8298733741062969

Optimization: finding the best C parameter and the best decision threshold¶

  • Model optimization
In [2897]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

param_grid = [
    # L1 (=> compatible solvers)
    {"model__solver": ["liblinear", "saga"],
     "model__penalty": ["l1"],
     "model__C": [0.01, 0.1, 0.5, 1, 2, 5]},

    # L2 (=> compatible solvers)
    {"model__solver": ["liblinear", "lbfgs", "saga"],
     "model__penalty": ["l2"],
     "model__C": [0.01, 0.1, 0.5, 1, 2, 5]},
]

grid = GridSearchCV(
    estimator=pipeline_logreg, 
    param_grid=param_grid,
    scoring="f1",
    cv=cv,
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV F1:", grid.best_score_)
best_logreg = grid.best_estimator_
Best params: {'model__C': 0.1, 'model__penalty': 'l2', 'model__solver': 'liblinear'}
Best CV F1: 0.5225420936948655
c:\Users\bouzo\Desktop\EngineerIA\Projet4_IA_14022026\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py:1135: FutureWarning: 'penalty' was deprecated in version 1.8 and will be removed in 1.10. To avoid this warning, leave 'penalty' set to its default value and use 'l1_ratio' or 'C' instead. Use l1_ratio=0 instead of penalty='l2', l1_ratio=1 instead of penalty='l1', and C=np.inf instead of penalty=None.
  warnings.warn(

Hyperparameter optimization is performed by cross-validation with a grid search.
Three hyperparameters are tested: the regularization strength (C), the penalty type (L1 or L2) and the optimization algorithm (solver).

  • Threshold optimization (method 1)
In [2898]:
proba_train = best_logreg.predict_proba(X_train)[:, 1]

thresholds = np.linspace(0.1, 0.9, 81)
precisions, recalls, f1s = [], [], []

for t in thresholds:
    y_pred_t = (proba_train >= t).astype(int)
    precisions.append(precision_score(y_train, y_pred_t, zero_division=0))
    recalls.append(recall_score(y_train, y_pred_t, zero_division=0))
    f1s.append(f1_score(y_train, y_pred_t, zero_division=0))

best_idx_2 = int(np.argmax(f1s))
best_threshold_2 = thresholds[best_idx_2]
print("Best threshold (train):", best_threshold_2, "Best F1:", f1s[best_idx_2])
Best threshold (train): 0.64 Best F1: 0.6466165413533834
In [2899]:
plt.figure(figsize=(8,5))

plt.plot(thresholds, precisions, label="Precision")
plt.plot(thresholds, recalls, label="Recall")
plt.plot(thresholds, f1s, label="F1-score")

plt.axvline(0.5, color="gray", linestyle="--", label="Seuil 0.5")

plt.xlabel("Seuil de décision")
plt.ylabel("Score")
plt.title("Impact du seuil sur les métriques")
plt.legend()
plt.grid()
plt.show()
  • Finding the best threshold (method 2)
In [2901]:
from sklearn.metrics import precision_recall_curve

#proba_train = best_logreg.predict_proba(X_train)[:, 1]

# compute the precision-recall curve
precisions, recalls, thresholds = precision_recall_curve(y_train, proba_train)

# compute F1 at every threshold (the epsilon avoids division by zero)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)

# find the best F1 score
best_idx = np.argmax(f1_scores[:-1])  # drop the last point, which has no associated threshold

best_threshold = thresholds[best_idx]
best_f1 = f1_scores[best_idx]

print("Best threshold (PR curve) :", best_threshold)
print("Best F1 :", best_f1)

# plot
plt.figure(figsize=(6,5))
plt.plot(recalls, precisions)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.scatter(recalls[best_idx], precisions[best_idx], color="red")
plt.show()
Best threshold (PR curve) : 0.6376779826009066
Best F1 : 0.6502463053689244
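Why the last point is excluded in the cell above: `precision_recall_curve` returns one more (precision, recall) pair than thresholds, and the final pair is the conventional end point (precision=1, recall=0) with no associated threshold. A small sketch on toy scores (the labels and scores are illustrative only):

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

# Toy labels and scores, for illustration only.
y_true = np.array([0, 0, 1, 0, 1, 1])
scores = np.array([0.10, 0.40, 0.35, 0.80, 0.65, 0.90])

precisions, recalls, thresholds = precision_recall_curve(y_true, scores)
print(len(precisions), len(thresholds))  # always len(thresholds) + 1 points
print(precisions[-1], recalls[-1])       # 1.0 0.0 — conventional end point, no threshold
```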
In [2902]:
proba_test = best_logreg.predict_proba(X_test)[:, 1]
y_pred_test = (proba_test >= best_threshold).astype(int)

print("=== TEST metrics ===")
print("Precision:", precision_score(y_test, y_pred_test, zero_division=0))
print("Recall   :", recall_score(y_test, y_pred_test, zero_division=0))
print("F1-score :", f1_score(y_test, y_pred_test, zero_division=0))
print("ROC-AUC  :", roc_auc_score(y_test, proba_test))

cm = confusion_matrix(y_test, y_pred_test)

plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="viridis")
plt.title(f"Confusion Matrix (TEST) - LogReg (seuil={best_threshold:.3f})")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

print("\n=== Classification report (TEST) ===")
print(classification_report(y_test, y_pred_test, zero_division=0))
print("ROC-AUC:", roc_auc_score(y_test, proba_test))
=== TEST metrics ===
Precision: 0.5370370370370371
Recall   : 0.6170212765957447
F1-score : 0.5742574257425742
ROC-AUC  : 0.8338358170385045
=== Classification report (TEST) ===
              precision    recall  f1-score   support

           0       0.93      0.90      0.91       247
           1       0.54      0.62      0.57        47

    accuracy                           0.85       294
   macro avg       0.73      0.76      0.74       294
weighted avg       0.86      0.85      0.86       294

ROC-AUC: 0.8338358170385045
In [2903]:
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss

# Calibration curve
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_test, proba_test, n_bins=10
)

plt.figure(figsize=(6,6))
plt.plot(mean_predicted_value, fraction_of_positives, "s-", label="LogReg")
plt.plot([0,1],[0,1],"--", label="Perfect calibration")
plt.xlabel("Probabilité prédite")
plt.ylabel("Proportion réelle")
plt.title("Calibration curve")
plt.legend()
plt.show()

# Brier score
print("Brier score:", brier_score_loss(y_test, proba_test))
Brier score: 0.1553319777601578

We checked the calibration of the probabilities produced by the model.
The calibration curve shows that the predicted probabilities do not line up perfectly with the ideal diagonal, which is common on imbalanced datasets.
The predicted probability is therefore read as a relative risk score (the higher the probability, the higher the risk of leaving) rather than as an exact percentage.
The final decision (Left vs Stayed) thus relies on an optimized threshold, consistent with the business goal (catching as many departures as possible while keeping acceptable precision).
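To give the Brier score of 0.155 a point of comparison (our own illustration, not part of the notebook): a constant forecast equal to the base rate p has an expected Brier score of p(1-p). With the test prevalence of 47/294, that naive baseline scores about 0.134, slightly better than the model's raw probabilities — consistent with the miscalibration noted above, likely the shift induced by class_weight="balanced", even though the ranking itself is good (ROC-AUC 0.83):

```python
# Illustrative baseline: always predict the base rate p.
# For Bernoulli(p) labels, that constant forecast has Brier score p * (1 - p).
p = 47 / 294                    # departure prevalence on the test set
baseline_brier = p * (1 - p)
print(f"{baseline_brier:.3f}")  # ≈ 0.134, vs 0.155 for the model's raw probabilities
```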


Trying LogisticRegression + SMOTE¶

In [2904]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

pipeline_smote_logreg = Pipeline([
    ("preprocessing", preprocessor_scaled),  
    ("smote", SMOTE(random_state=42)),
    ("model", LogisticRegression(
        C=0.1,                     # best C from the grid search
        max_iter=2000,
        random_state=42
    ))
])
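Unlike class_weight, SMOTE balances the classes by creating synthetic minority examples: each new point is interpolated between a minority sample and one of its minority-class nearest neighbours. A stripped-down sketch of that idea on toy 2-D data (not imblearn's implementation — real SMOTE interpolates towards one of k_neighbors=5 nearest neighbours, not an arbitrary minority pair):

```python
import numpy as np

rng = np.random.default_rng(42)

def smote_like(X_min: np.ndarray, n_new: int) -> np.ndarray:
    """Create n_new synthetic points by interpolating between random minority pairs."""
    idx = rng.integers(0, len(X_min), size=(n_new, 2))
    a, b = X_min[idx[:, 0]], X_min[idx[:, 1]]
    lam = rng.random((n_new, 1))       # interpolation factor in [0, 1)
    return a + lam * (b - a)           # a point on the segment [a, b]

X_min = rng.normal(loc=2.0, size=(47, 2))   # toy minority class ("left")
n_majority = 247                            # toy majority class size ("stayed")
X_new = smote_like(X_min, n_majority - len(X_min))

print(len(X_min) + len(X_new), "minority samples after resampling")  # balanced: 247
```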
In [2905]:
pipeline_smote_logreg.fit(X_train, y_train)
Out[2905]:
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['age', 'revenu_mensuel',
                                                   'nombre_experiences_precedentes',
                                                   'annees_dans_l_entreprise',
                                                   'annees_dans_le_poste_actuel',
                                                   'satisfaction_employee_environnement',
                                                   'satisfaction_employee_nature_travail',
                                                   'satisfaction_employee_equipe',
                                                   'satisfaction_employee_equilibre_pro_pe...
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore'),
                                                  ['genre', 'statut_marital',
                                                   'departement', 'poste',
                                                   'domaine_etude']),
                                                 ('cat_ord',
                                                  OrdinalEncoder(categories=[['Aucun',
                                                                              'Occasionnel',
                                                                              'Frequent']],
                                                                 handle_unknown='use_encoded_value',
                                                                 unknown_value=-1),
                                                  ['frequence_deplacement'])])),
                ('smote', SMOTE(random_state=42)),
                ('model',
                 LogisticRegression(C=0.1, max_iter=2000, random_state=42))])
[HTML repr of the fitted SMOTE pipeline; scikit-learn parameter documentation omitted — key settings: SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42), LogisticRegression(C=0.1, max_iter=2000, random_state=42)]
solvers. See the parameter `solver` below, to know the compatibility between
the penalty and solver.

.. versionchanged:: 1.8
Default value changed from None to 0.0.

.. deprecated:: 1.8
`None` is deprecated and will be removed in version 1.10. Always use
`l1_ratio` to specify the penalty type.
0.0
dual dual: bool, default=False

Dual (constrained) or primal (regularized, see also
:ref:`this equation `) formulation. Dual formulation
is only implemented for l2 penalty with liblinear solver. Prefer `dual=False`
when n_samples > n_features.
False
tol tol: float, default=1e-4

Tolerance for stopping criteria.
0.0001
fit_intercept fit_intercept: bool, default=True

Specifies if a constant (a.k.a. bias or intercept) should be
added to the decision function.
True
intercept_scaling intercept_scaling: float, default=1

Useful only when the solver `liblinear` is used
and `self.fit_intercept` is set to `True`. In this case, `x` becomes
`[x, self.intercept_scaling]`,
i.e. a "synthetic" feature with constant value equal to
`intercept_scaling` is appended to the instance vector.
The intercept becomes
``intercept_scaling * synthetic_feature_weight``.

.. note::
The synthetic feature weight is subject to L1 or L2
regularization as all other features.
To lessen the effect of regularization on synthetic feature weight
(and therefore on the intercept) `intercept_scaling` has to be increased.
1
class_weight class_weight: dict or 'balanced', default=None

Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one.

The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``.

Note that these weights will be multiplied with sample_weight (passed
through the fit method) if sample_weight is specified.

.. versionadded:: 0.17
*class_weight='balanced'*
None
random_state random_state: int, RandomState instance, default=None

Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the
data. See :term:`Glossary ` for details.
42
solver solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs'

Algorithm to use in the optimization problem. Default is 'lbfgs'.
To choose a solver, you might want to consider the following aspects:

- 'lbfgs' is a good default solver because it works reasonably well for a wide
class of problems.
- For :term:`multiclass` problems (`n_classes >= 3`), all solvers except
'liblinear' minimize the full multinomial loss, 'liblinear' will raise an
error.
- 'newton-cholesky' is a good choice for
`n_samples` >> `n_features * n_classes`, especially with one-hot encoded
categorical features with rare categories. Be aware that the memory usage
of this solver has a quadratic dependency on `n_features * n_classes`
because it explicitly computes the full Hessian matrix.
- For small datasets, 'liblinear' is a good choice, whereas 'sag'
and 'saga' are faster for large ones;
- 'liblinear' can only handle binary classification by default. To apply a
one-versus-rest scheme for the multiclass setting one can wrap it with the
:class:`~sklearn.multiclass.OneVsRestClassifier`.

.. warning::
The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`
for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for
Elastic-Net) and on (multinomial) multiclass support:

================= ======================== ======================
solver l1_ratio multinomial multiclass
================= ======================== ======================
'lbfgs' l1_ratio=0 yes
'liblinear' l1_ratio=1 or l1_ratio=0 no
'newton-cg' l1_ratio=0 yes
'newton-cholesky' l1_ratio=0 yes
'sag' l1_ratio=0 yes
'saga' 0<=l1_ratio<=1 yes
================= ======================== ======================

.. note::
'sag' and 'saga' fast convergence is only guaranteed on features
with approximately the same scale. You can preprocess the data with
a scaler from :mod:`sklearn.preprocessing`.

.. seealso::
Refer to the :ref:`User Guide ` for more
information regarding :class:`LogisticRegression` and more specifically the
:ref:`Table `
summarizing solver/penalty supports.

.. versionadded:: 0.17
Stochastic Average Gradient (SAG) descent solver. Multinomial support in
version 0.18.
.. versionadded:: 0.19
SAGA solver.
.. versionchanged:: 0.22
The default solver changed from 'liblinear' to 'lbfgs' in 0.22.
.. versionadded:: 1.2
newton-cholesky solver. Multinomial support in version 1.6.
'lbfgs'
max_iter max_iter: int, default=100

Maximum number of iterations taken for the solvers to converge.
2000
verbose verbose: int, default=0

For the liblinear and lbfgs solvers set verbose to any positive
number for verbosity.
0
warm_start warm_start: bool, default=False

When set to True, reuse the solution of the previous call to fit as
initialization, otherwise, just erase the previous solution.
Useless for liblinear solver. See :term:`the Glossary `.

.. versionadded:: 0.17
*warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers.
False
n_jobs n_jobs: int, default=None

Does not have any effect.

.. deprecated:: 1.8
`n_jobs` is deprecated in version 1.8 and will be removed in 1.10.
None
In [2906]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

proba_test_smote = pipeline_smote_logreg.predict_proba(X_test)[:, 1]
y_pred_test_smote = (proba_test_smote >= 0.5).astype(int)

print("=== TEST SMOTE + LogReg ===")
print("Precision:", precision_score(y_test, y_pred_test_smote, zero_division=0))
print("Recall   :", recall_score(y_test, y_pred_test_smote, zero_division=0))
print("F1-score :", f1_score(y_test, y_pred_test_smote, zero_division=0))
print("ROC-AUC  :", roc_auc_score(y_test, proba_test_smote))

print("\n", classification_report(y_test, y_pred_test_smote, zero_division=0))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred_test_smote))
=== TEST SMOTE + LogReg ===
Precision: 0.38823529411764707
Recall   : 0.7021276595744681
F1-score : 0.5
ROC-AUC  : 0.8298733741062968

               precision    recall  f1-score   support

           0       0.93      0.79      0.86       247
           1       0.39      0.70      0.50        47

    accuracy                           0.78       294
   macro avg       0.66      0.75      0.68       294
weighted avg       0.85      0.78      0.80       294


Confusion matrix:
 [[195  52]
 [ 14  33]]
In [2907]:
proba_train_smote = pipeline_smote_logreg.predict_proba(X_train)[:, 1]
thresholds = np.linspace(0.1, 0.9, 81)

best_f1 = 0
best_t = 0

for t in thresholds:
    y_pred_t = (proba_train_smote >= t).astype(int)
    f1 = f1_score(y_train, y_pred_t, zero_division=0)
    if f1 > best_f1:
        best_f1 = f1
        best_t = t

print("Best threshold (train):", best_t)
Best threshold (train): 0.63
In [2908]:
from sklearn.metrics import precision_recall_curve

proba_train_smote = pipeline_smote_logreg.predict_proba(X_train)[:, 1]

# compute the precision-recall curve on the train set
precisions_smote, recalls_smote, thresholds_smote = precision_recall_curve(y_train, proba_train_smote)

# F1 for every threshold (small epsilon avoids division by zero)
f1_scores_smote = 2 * (precisions_smote * recalls_smote) / (precisions_smote + recalls_smote + 1e-10)

# best F1 (the last PR point has no associated threshold, hence [:-1])
best_idx_smote = np.argmax(f1_scores_smote[:-1])

best_threshold_smote = thresholds_smote[best_idx_smote]
best_f1_smote = f1_scores_smote[best_idx_smote]

print("Best threshold (PR curve) :", best_threshold_smote)
print("Best F1 :", best_f1_smote)

# plot
plt.figure(figsize=(6,5))
plt.plot(recalls_smote, precisions_smote)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.scatter(recalls_smote[best_idx_smote], precisions_smote[best_idx_smote], color="red")
plt.show()
Best threshold (PR curve) : 0.6314599174980872
Best F1 : 0.6394230768734515
[Figure: precision-recall curve (train), best-F1 point marked in red]
In [2909]:
y_pred_test_smote_2 = (proba_test_smote >= 0.63).astype(int)

print("=== TEST SMOTE + LogReg ===")
print("Precision:", precision_score(y_test, y_pred_test_smote_2, zero_division=0))
print("Recall   :", recall_score(y_test, y_pred_test_smote_2, zero_division=0))
print("F1-score :", f1_score(y_test, y_pred_test_smote_2, zero_division=0))
print("ROC-AUC  :", roc_auc_score(y_test, proba_test_smote))

print("\n", classification_report(y_test, y_pred_test_smote_2, zero_division=0))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred_test_smote_2))
=== TEST SMOTE + LogReg ===
Precision: 0.5081967213114754
Recall   : 0.6595744680851063
F1-score : 0.5740740740740741
ROC-AUC  : 0.8298733741062968

               precision    recall  f1-score   support

           0       0.93      0.88      0.90       247
           1       0.51      0.66      0.57        47

    accuracy                           0.84       294
   macro avg       0.72      0.77      0.74       294
weighted avg       0.86      0.84      0.85       294


Confusion matrix:
 [[217  30]
 [ 16  31]]

Oversampling with SMOTE does not significantly improve overall performance and slightly degrades precision. The class-weighted model therefore remains the retained choice.
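For reference, the retained class-weighting approach keeps all samples and reweights the loss instead of generating synthetic minority examples. Below is a minimal sketch of what `class_weight="balanced"` computes, using the test-set class counts (247 stayed / 47 left) purely for illustration; the actual retained pipeline and its preprocessing are defined earlier in the notebook, and the exact hyperparameters here are assumptions:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

# 'balanced' assigns each class the weight n_samples / (n_classes * count);
# illustrated with the test-set counts (247 stayed, 47 left)
y = np.array([0] * 247 + [1] * 47)
weights = len(y) / (2 * np.bincount(y))
print(weights.round(2))  # each departure weighs ~5x more than a stay

# sketch only: the real retained model sits inside the preprocessing pipeline
clf = LogisticRegression(class_weight="balanced", max_iter=2000, random_state=42)
```

The practical effect is the same as oversampling the minority class in the loss function, without creating synthetic points.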

Section 3: Saving the selected model

Saving¶

In [2922]:
import joblib
from sklearn.base import BaseEstimator, ClassifierMixin

class ThresholdClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model, threshold=0.5):
        self.model = model
        self.threshold = threshold

    def fit(self, X, y=None):
        # the wrapped model is already fitted; no refit here
        return self

    def predict_proba(self, X):
        return self.model.predict_proba(X)

    def predict(self, X):
        proba = self.predict_proba(X)[:, 1]
        return (proba >= self.threshold).astype(int)

# build the final "thresholded" classifier
final_clf = ThresholdClassifier(best_logreg, threshold=float(best_threshold))

# save to disk
joblib.dump(final_clf, "attrition_threshold_model.joblib")
Out[2922]:
['attrition_threshold_model.joblib']
In [2923]:
# Test
final_clf_loaded = joblib.load("attrition_threshold_model.joblib")
final_clf_loaded.predict(X_test.iloc[:5]), final_clf_loaded.predict_proba(X_test.iloc[:5])[:, 1]
Out[2923]:
(array([0, 0, 0, 0, 0]),
 array([0.32824004, 0.06928329, 0.41496117, 0.07372709, 0.62645278]))

Scoring function (demonstration, usage example)¶

In [2924]:
def scoring_attrition(
    X_employes: pd.DataFrame,
    clf,
    threshold: float,
    y_true: pd.Series | None = None,
    top_n: int = 10,
    cols_features: list[str] | None = None
) -> pd.DataFrame:

    proba = clf.predict_proba(X_employes)[:, 1]
    pred = (proba >= threshold).astype(int)

    if cols_features is None:
        # default: keep only the first few columns (adapt as needed)
        cols_features = list(X_employes.columns[:6])
    df = X_employes[cols_features].copy()

    df["proba_depart"] = proba
    df["prediction"] = pred

    # risk buckets
    df["niveau_risque"] = pd.cut(
        df["proba_depart"],
        bins=[-np.inf, 0.30, 0.60, 0.80, np.inf],
        labels=["Faible", "Moyen", "Élevé", "Très élevé"]
    )

    df["decision"] = np.where(df["prediction"] == 1, "A risque (Parti=1)", "Faible risque (Resté=0)")

    if y_true is not None:
        df["y_true"] = y_true.loc[df.index].values
        df["resultat"] = np.where(df["prediction"] == df["y_true"], "✅ OK", "❌ Erreur")

    # sort: highest risk first
    df = df.sort_values("proba_depart", ascending=False).head(top_n)

    df["proba_depart"] = df["proba_depart"].round(3)

    return df
In [2925]:
threshold = final_clf.threshold
# features to display (adapt to the key HR columns)
cols = ["age", "revenu_mensuel", "satisfaction_employee_nature_travail", "annees_dans_l_entreprise", "heures_supplementaires"]

top10 = scoring_attrition(
    X_test, final_clf, threshold,
    y_true=y_test,
    top_n=10,
    cols_features=cols
)

display(top10)
age revenu_mensuel satisfaction_employee_nature_travail annees_dans_l_entreprise heures_supplementaires proba_depart prediction niveau_risque decision y_true resultat
357 21 2174 2 3 1 0.980 1 Très élevé A risque (Parti=1) 1 ✅ OK
911 25 1118 4 1 1 0.973 1 Très élevé A risque (Parti=1) 1 ✅ OK
688 19 2121 2 1 1 0.953 1 Très élevé A risque (Parti=1) 1 ✅ OK
1021 25 4400 1 3 0 0.939 1 Très élevé A risque (Parti=1) 1 ✅ OK
514 33 3348 1 10 1 0.932 1 Très élevé A risque (Parti=1) 1 ✅ OK
182 41 3140 2 4 1 0.898 1 Très élevé A risque (Parti=1) 1 ✅ OK
762 26 2042 1 3 1 0.891 1 Très élevé A risque (Parti=1) 1 ✅ OK
525 24 4577 3 2 0 0.886 1 Très élevé A risque (Parti=1) 1 ✅ OK
293 26 5828 4 8 1 0.881 1 Très élevé A risque (Parti=1) 1 ✅ OK
829 33 8224 1 5 1 0.877 1 Très élevé A risque (Parti=1) 1 ✅ OK

Section 4: Interpretability and Results

Interpreting the metrics¶

  • Overall model performance

The retained logistic regression model achieves a ROC-AUC of 0.83, indicating a good ability to separate employees who will leave from those who will stay.
Concretely, for 83% of randomly drawn leaver/stayer pairs, the model assigns the higher risk score to the employee who actually leaves.
This shows that the model captures relevant signals in the HR data and discriminates at-risk profiles effectively.
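This ranking interpretation of the AUC can be checked on toy values (the scores below are hypothetical, not the model's): the AUC equals the fraction of (leaver, stayer) pairs in which the leaver receives the higher score.

```python
import numpy as np
from sklearn.metrics import roc_auc_score

y = np.array([1, 1, 0, 0, 0])             # 1 = left, 0 = stayed
p = np.array([0.9, 0.4, 0.8, 0.3, 0.2])   # hypothetical risk scores

# fraction of (leaver, stayer) pairs ranked correctly
leavers, stayers = p[y == 1], p[y == 0]
pairwise = np.mean([s1 > s0 for s1 in leavers for s0 in stayers])

print(pairwise, roc_auc_score(y, p))  # both equal 5/6 ≈ 0.833
```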

  • Precision/recall trade-off and business implications

The decision threshold was optimized to 0.64 to maximize the F1-score on the training set, ensuring a balanced precision/recall trade-off in a class-imbalanced setting.

On the test set, the model reaches:
- Recall: 0.60
- Precision: 0.55
- F1-score: 0.58

A recall of 60% means the model detects roughly 6 departures out of 10, which allows most at-risk situations to be anticipated.
A precision of 55% means that slightly more than one alert in two corresponds to an actual departure.

From a business standpoint:
- False negatives (19 employees who left undetected) are the most serious errors, as they correspond to unanticipated departures.
- False positives (25 employees flagged at risk who stayed) may trigger unnecessary preventive actions, but these generally cost far less than an unanticipated departure.

The model is therefore positioned consistently with a prevention logic: better to catch a significant share of potential departures, even at the cost of a number of cautious alerts.
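This asymmetry can be made explicit with a back-of-the-envelope cost comparison. The unit costs below are assumptions chosen for illustration (not figures from the study); only the FN/FP counts come from the evaluation above.

```python
COST_FN = 30_000  # assumed cost of one unanticipated departure (hiring, ramp-up)
COST_FP = 1_000   # assumed cost of one preventive action taken in vain

fn, fp = 19, 25   # counts reported above for the retained model
print(fn * COST_FN + fp * COST_FP)  # 595000 -> dominated by missed departures
```

Under these assumptions, missed departures account for over 95% of the total error cost, which justifies tuning the threshold toward recall.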

Global interpretation:¶

  • Coefficients & odds ratios (method 1)
In [2926]:
# retrieve the fitted logistic regression from the pipeline
logreg_model = best_logreg.named_steps["model"]

# its coefficients (log-odds scale)
coefficients = logreg_model.coef_[0]

# feature names after preprocessing
feature_names = best_logreg.named_steps["preprocessing"].get_feature_names_out()

# assemble into a dataframe
coef_df = pd.DataFrame({
    "feature": feature_names,
    "coefficient": coefficients
})

# odds ratios = exp(coefficient)
coef_df["odds_ratio"] = np.exp(coef_df["coefficient"])

# sort by absolute magnitude
coef_df["abs_coef"] = np.abs(coef_df["coefficient"])
coef_df = coef_df.sort_values("abs_coef", ascending=False)

coef_df.head(15)
Out[2926]:
feature coefficient odds_ratio abs_coef
20 bin__heures_supplementaires 1.197854 3.313001 1.197854
27 cat_nom__poste_Consultant 0.745098 2.106648 0.745098
17 bin__utilisation_pee -0.682155 0.505526 0.682155
24 cat_nom__departement_Consulting -0.643139 0.525640 0.643139
39 cat_ord__frequence_deplacement 0.625726 1.869603 0.625726
19 bin__a_suivi_formation -0.571745 0.564540 0.571745
30 cat_nom__poste_Représentant Commercial 0.504077 1.655457 0.504077
13 num__annees_depuis_la_derniere_promotion 0.424544 1.528893 0.424544
22 cat_nom__statut_marital_Divorcé(e) -0.372895 0.688738 0.372895
14 num__annes_sous_responsable_actuel -0.366427 0.693207 0.366427
2 num__nombre_experiences_precedentes 0.360334 1.433808 0.360334
28 cat_nom__poste_Directeur Technique -0.353107 0.702502 0.353107
5 num__satisfaction_employee_environnement -0.351807 0.703416 0.351807
6 num__satisfaction_employee_nature_travail -0.330780 0.718363 0.330780
16 num__evolution_note 0.313251 1.367865 0.313251

The logistic regression coefficients highlight several decisive attrition factors.

Overtime (heures_supplementaires) is the most influential: for employees working overtime, the odds of leaving are multiplied by roughly 3.3.
This suggests a strong link between work overload and potential departure.

The Consultant role and a high travel frequency are also associated with a significant increase in risk.
These results may reflect the strong external employability of these profiles and the professional constraints that come with them.

Conversely, some factors appear protective. Use of the company savings plan (Plan d'Épargne Entreprise, PEE) roughly halves the odds of leaving, suggesting a longer-term anchoring in the organization. Having followed training, as well as managerial stability, also significantly reduces the risk.

Finally, the number of years since the last promotion increases attrition risk, underlining the importance of career prospects for talent retention.
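The odds ratios in the table are simply the exponential of the coefficients; the two headline numbers above can be reproduced directly from the `coef_df` values:

```python
import numpy as np

# odds ratio = exp(coefficient), coefficients taken from coef_df above
print(round(float(np.exp(1.197854)), 2))   # heures_supplementaires -> 3.31
print(round(float(np.exp(-0.682155)), 2))  # utilisation_pee -> 0.51
```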

In [2927]:
# plot every encoded feature sorted by coefficient (top_n exceeds the
# feature count, so all of them are kept)
top_n = 180
plot_df = coef_df.head(top_n).sort_values("coefficient")

plt.figure(figsize=(10, 8))
sns.barplot(
    data=plot_df,
    x="coefficient",
    y="feature",
    hue="feature",
    legend=False,
    palette=["#d62728" if x > 0 else "#1f77b4" for x in plot_df["coefficient"]]
)

plt.axvline(0, color='black', linestyle='--')
plt.title("Impact global des variables (Régression Logistique)")
plt.xlabel("Coefficient (impact sur le log-odds)")
plt.ylabel("Variables")
plt.tight_layout()
plt.show()
[Figure: horizontal bar chart of coefficients; red bars increase attrition risk, blue bars decrease it]

The coefficient plot highlights the variables with the strongest impact on attrition risk.
Positive coefficients increase the probability of leaving, while negative coefficients play a protective role.

In [2928]:
plot_df = coef_df.head(top_n).sort_values("odds_ratio")

plt.figure(figsize=(10, 8))
sns.barplot(
    data=plot_df,
    x="odds_ratio",
    y="feature"
)

plt.axvline(1, color='black', linestyle='--')
plt.title("Odds Ratios des variables les plus influentes")
plt.xlabel("Odds Ratio")
plt.ylabel("Variables")
plt.tight_layout()
plt.show()
[Figure: odds ratios of the most influential variables; the dashed line at 1 marks no effect]
  • Permutation importance (method 2)

To complement the coefficient-based reading, we compute permutation importance on the test set (metric: F1). The method randomly shuffles one variable at a time and measures the resulting drop in performance: the larger the F1 decrease, the more useful the variable is to the model. Unlike coefficients, this approach is model-agnostic and reflects an actual impact on predictive performance.

In [2929]:
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer, f1_score

clf = best_logreg

preproc = clf.named_steps["preprocessing"]
model = clf.named_steps["model"] 

X_test_t = preproc.transform(X_test)
feature_names = preproc.get_feature_names_out()

perm = permutation_importance(
    estimator=model,
    X=X_test_t,
    y=y_test,
    scoring=make_scorer(f1_score),
    n_repeats=30,
    random_state=42,
    n_jobs=-1
)

pi_df = pd.DataFrame({
    "feature": feature_names,
    "importance_mean": perm.importances_mean,
    "importance_std": perm.importances_std
}).sort_values("importance_mean", ascending=False)

pi_df.head(20)
Out[2929]:
feature importance_mean importance_std
20 bin__heures_supplementaires 0.084507 0.023779
17 bin__utilisation_pee 0.015860 0.016684
2 num__nombre_experiences_precedentes 0.010798 0.020155
13 num__annees_depuis_la_derniere_promotion 0.009291 0.018127
35 cat_nom__domaine_etude_Infra & Cloud 0.008865 0.008706
19 bin__a_suivi_formation 0.007986 0.006205
6 num__satisfaction_employee_nature_travail 0.007729 0.019832
22 cat_nom__statut_marital_Divorcé(e) 0.007163 0.008596
34 cat_nom__domaine_etude_Entrepreunariat 0.006921 0.005205
39 cat_ord__frequence_deplacement 0.005123 0.012924
26 cat_nom__poste_Cadre Commercial 0.002676 0.001477
33 cat_nom__poste_Tech Lead 0.001855 0.002529
12 num__niveau_education 0.000698 0.001396
28 cat_nom__poste_Directeur Technique 0.000582 0.003931
32 cat_nom__poste_Senior Manager 0.000000 0.000000
31 cat_nom__poste_Ressources Humaines 0.000000 0.000000
18 bin__mobilite_interne 0.000000 0.000000
16 num__evolution_note -0.000654 0.018278
25 cat_nom__departement_Ressources Humaines -0.000708 0.001417
29 cat_nom__poste_Manager -0.000964 0.003411
In [2930]:
top_n = 20
pi_top = pi_df.head(top_n).sort_values("importance_mean", ascending=True)

plt.figure(figsize=(10, 6))
plt.barh(pi_top["feature"], pi_top["importance_mean"], xerr=pi_top["importance_std"])
plt.axvline(0, linestyle="--")
plt.title(f"Permutation Importance (TEST) - métrique: F1 (Top {top_n})")
plt.xlabel("Baisse moyenne du F1 après permutation")
plt.ylabel("Feature (après preprocessing)")
plt.tight_layout()
plt.show()
[Figure: permutation importance on the test set (F1 metric), top 20 features with standard-deviation error bars]

The results show that heures_supplementaires is the most decisive variable (largest F1 drop when permuted), followed notably by frequence_deplacement, utilisation_pee and annees_depuis_la_derniere_promotion. Variables with importance close to 0 (or with large uncertainty, as measured by the standard deviation) have a weak or unstable impact on performance.

  • SHAP beeswarm plot (method 3)

SHAP is an explainability method based on Shapley values (game theory) that assigns each variable an additive contribution to the prediction.
The beeswarm plot (SHAP summary plot) aggregates these contributions across all individuals: each dot is one individual, the horizontal axis is the impact (SHAP value) on the attrition prediction, and the color indicates whether the variable takes a high (red) or low (blue) value. Variables are sorted by global importance (mean absolute impact).

In [2931]:
# 1) retrieve preprocessing + model
preprocess = best_logreg.named_steps["preprocessing"]
model = best_logreg.named_steps["model"]

# 2) transform the data
X_train_proc = preprocess.transform(X_train)
X_test_proc  = preprocess.transform(X_test)

# 3) feature names after preprocessing
feature_names = preprocess.get_feature_names_out()

# 4) densify if the output is sparse
if hasattr(X_train_proc, "toarray"):
    X_train_proc = X_train_proc.toarray()
    X_test_proc  = X_test_proc.toarray()
In [2932]:
import shap
# subsample the background data to speed things up
idx = np.random.RandomState(42).choice(X_train_proc.shape[0], size=min(300, X_train_proc.shape[0]), replace=False)
X_background = X_train_proc[idx]

# 1) explainer for the linear model
explainer = shap.LinearExplainer(model, X_background, feature_names=feature_names)

# 2) SHAP values on the test set (could also be done on train)
shap_values = explainer.shap_values(X_test_proc)

# 3) beeswarm plot
plt.figure()
shap.summary_plot(
    shap_values,
    X_test_proc,
    feature_names=feature_names,
    plot_type="dot",
    max_display=20,   # top 20 stays readable
    show=True
)
[Figure: SHAP beeswarm plot, top 20 features by mean absolute SHAP value]

The SHAP beeswarm plot confirms that overtime is the most decisive attrition factor: employees working more overtime show a higher predicted probability of leaving.
Likewise, a large number of years since the last promotion significantly increases the risk.
Conversely, high satisfaction with the work environment and use of the PEE appear to play a protective role.
Demographic variables such as gender have little influence on the model's decisions.


  • Comparing the methods
In [2933]:
# global SHAP importance = mean absolute SHAP value per feature
shap_importance = np.abs(shap_values).mean(axis=0)

shap_df = pd.DataFrame({
    "feature": feature_names,
    "shap_mean_abs": shap_importance
})

shap_df
Out[2933]:
feature shap_mean_abs
0 num__age 0.204468
1 num__revenu_mensuel 0.069687
2 num__nombre_experiences_precedentes 0.288259
3 num__annees_dans_l_entreprise 0.053539
4 num__annees_dans_le_poste_actuel 0.208085
5 num__satisfaction_employee_environnement 0.313867
6 num__satisfaction_employee_nature_travail 0.277726
7 num__satisfaction_employee_equipe 0.180584
8 num__satisfaction_employee_equilibre_pro_perso 0.136182
9 num__note_evaluation_actuelle 0.021943
10 num__augmentation_salaire_precedente_pct 0.135751
11 num__distance_domicile_travail 0.239198
12 num__niveau_education 0.016701
13 num__annees_depuis_la_derniere_promotion 0.287549
14 num__annes_sous_responsable_actuel 0.300234
15 num__annee_experience_avant_entreprise 0.206260
16 num__evolution_note 0.265692
17 bin__utilisation_pee 0.331379
18 bin__mobilite_interne 0.001754
19 bin__a_suivi_formation 0.046129
20 bin__heures_supplementaires 0.471645
21 cat_nom__genre_M 0.094788
22 cat_nom__statut_marital_Divorcé(e) 0.127495
23 cat_nom__statut_marital_Marié(e) 0.103460
24 cat_nom__departement_Consulting 0.304069
25 cat_nom__departement_Ressources Humaines 0.009558
26 cat_nom__poste_Cadre Commercial 0.018428
27 cat_nom__poste_Consultant 0.174160
28 cat_nom__poste_Directeur Technique 0.042277
29 cat_nom__poste_Manager 0.046656
30 cat_nom__poste_Représentant Commercial 0.073451
31 cat_nom__poste_Ressources Humaines 0.000527
32 cat_nom__poste_Senior Manager 0.000842
33 cat_nom__poste_Tech Lead 0.012200
34 cat_nom__domaine_etude_Entrepreunariat 0.030303
35 cat_nom__domaine_etude_Infra & Cloud 0.056299
36 cat_nom__domaine_etude_Marketing 0.036949
37 cat_nom__domaine_etude_Ressources Humaines 0.011634
38 cat_nom__domaine_etude_Transformation Digitale 0.081099
39 cat_ord__frequence_deplacement 0.236882
In [2934]:
coef_clean = coef_df[["feature", "abs_coef"]].copy()
coef_clean
Out[2934]:
feature abs_coef
20 bin__heures_supplementaires 1.197854
27 cat_nom__poste_Consultant 0.745098
17 bin__utilisation_pee 0.682155
24 cat_nom__departement_Consulting 0.643139
39 cat_ord__frequence_deplacement 0.625726
19 bin__a_suivi_formation 0.571745
30 cat_nom__poste_Représentant Commercial 0.504077
13 num__annees_depuis_la_derniere_promotion 0.424544
22 cat_nom__statut_marital_Divorcé(e) 0.372895
14 num__annes_sous_responsable_actuel 0.366427
2 num__nombre_experiences_precedentes 0.360334
28 cat_nom__poste_Directeur Technique 0.353107
5 num__satisfaction_employee_environnement 0.351807
6 num__satisfaction_employee_nature_travail 0.330780
16 num__evolution_note 0.313251
15 num__annee_experience_avant_entreprise 0.302017
29 cat_nom__poste_Manager 0.286966
11 num__distance_domicile_travail 0.280539
0 num__age 0.255843
4 num__annees_dans_le_poste_actuel 0.239477
34 cat_nom__domaine_etude_Entrepreunariat 0.220524
7 num__satisfaction_employee_equipe 0.215866
23 cat_nom__statut_marital_Marié(e) 0.207314
8 num__satisfaction_employee_equilibre_pro_perso 0.198640
21 cat_nom__genre_M 0.197925
38 cat_nom__domaine_etude_Transformation Digitale 0.194542
37 cat_nom__domaine_etude_Ressources Humaines 0.178146
10 num__augmentation_salaire_precedente_pct 0.171016
36 cat_nom__domaine_etude_Marketing 0.157616
35 cat_nom__domaine_etude_Infra & Cloud 0.115103
1 num__revenu_mensuel 0.081559
33 cat_nom__poste_Tech Lead 0.078317
25 cat_nom__departement_Ressources Humaines 0.077798
3 num__annees_dans_l_entreprise 0.070561
26 cat_nom__poste_Cadre Commercial 0.057418
9 num__note_evaluation_actuelle 0.034260
12 num__niveau_education 0.022654
31 cat_nom__poste_Ressources Humaines 0.005676
18 bin__mobilite_interne 0.005610
32 cat_nom__poste_Senior Manager 0.005248
In [2935]:
pi_clean = pi_df[["feature", "importance_mean"]].copy()
pi_clean
Out[2935]:
feature importance_mean
20 bin__heures_supplementaires 0.084507
17 bin__utilisation_pee 0.015860
2 num__nombre_experiences_precedentes 0.010798
13 num__annees_depuis_la_derniere_promotion 0.009291
35 cat_nom__domaine_etude_Infra & Cloud 0.008865
19 bin__a_suivi_formation 0.007986
6 num__satisfaction_employee_nature_travail 0.007729
22 cat_nom__statut_marital_Divorcé(e) 0.007163
34 cat_nom__domaine_etude_Entrepreunariat 0.006921
39 cat_ord__frequence_deplacement 0.005123
26 cat_nom__poste_Cadre Commercial 0.002676
33 cat_nom__poste_Tech Lead 0.001855
12 num__niveau_education 0.000698
28 cat_nom__poste_Directeur Technique 0.000582
32 cat_nom__poste_Senior Manager 0.000000
31 cat_nom__poste_Ressources Humaines 0.000000
18 bin__mobilite_interne 0.000000
16 num__evolution_note -0.000654
25 cat_nom__departement_Ressources Humaines -0.000708
29 cat_nom__poste_Manager -0.000964
1 num__revenu_mensuel -0.002532
37 cat_nom__domaine_etude_Ressources Humaines -0.003689
36 cat_nom__domaine_etude_Marketing -0.005150
27 cat_nom__poste_Consultant -0.006671
30 cat_nom__poste_Représentant Commercial -0.007316
3 num__annees_dans_l_entreprise -0.008506
7 num__satisfaction_employee_equipe -0.008992
10 num__augmentation_salaire_precedente_pct -0.009147
23 cat_nom__statut_marital_Marié(e) -0.009496
8 num__satisfaction_employee_equilibre_pro_perso -0.010085
9 num__note_evaluation_actuelle -0.010231
24 cat_nom__departement_Consulting -0.013538
15 num__annee_experience_avant_entreprise -0.013620
5 num__satisfaction_employee_environnement -0.013727
21 cat_nom__genre_M -0.015436
38 cat_nom__domaine_etude_Transformation Digitale -0.018495
0 num__age -0.026022
11 num__distance_domicile_travail -0.031450
4 num__annees_dans_le_poste_actuel -0.034472
14 num__annes_sous_responsable_actuel -0.036763
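Several features at the bottom of this table have a permutation importance at or below zero: shuffling them leaves the score unchanged, or even improves it slightly, which means the model does not actually rely on them. A minimal, self-contained sketch with scikit-learn (toy data, not the project's pipeline) showing how such values arise:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance

rng = np.random.RandomState(0)
# Toy data: the first feature drives the target, the second is pure noise.
X = rng.normal(size=(500, 2))
y = (X[:, 0] + 0.1 * rng.normal(size=500) > 0).astype(int)

model = LogisticRegression().fit(X, y)
result = permutation_importance(model, X, y, n_repeats=30, random_state=0)

# The informative feature has a clearly positive importance_mean;
# the noise feature hovers around zero and can dip slightly below it.
print(result.importances_mean)
```

Slightly negative means are sampling noise around zero, not evidence that the feature actively hurts the model.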
In [2936]:
compare_df = coef_clean.merge(pi_clean, on="feature")
compare_df = compare_df.merge(shap_df, on="feature")

compare_df.head()
Out[2936]:
feature abs_coef importance_mean shap_mean_abs
0 bin__heures_supplementaires 1.197854 0.084507 0.471645
1 cat_nom__poste_Consultant 0.745098 -0.006671 0.174160
2 bin__utilisation_pee 0.682155 0.015860 0.331379
3 cat_nom__departement_Consulting 0.643139 -0.013538 0.304069
4 cat_ord__frequence_deplacement 0.625726 0.005123 0.236882
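One caveat worth keeping in mind: `merge` defaults to an inner join, so any feature missing from one of the tables would silently disappear from `compare_df`. Here the three tables cover the same features, but a toy example makes the behavior explicit:

```python
import pandas as pd

a = pd.DataFrame({"feature": ["x", "y"], "abs_coef": [1.0, 0.5]})
b = pd.DataFrame({"feature": ["y", "z"], "importance_mean": [0.2, 0.1]})

# merge() defaults to an inner join on the key:
# features present in only one table ("x", "z") are silently dropped.
merged = a.merge(b, on="feature")
print(merged)
```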
In [2937]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

compare_df[["coef_scaled",
            "perm_scaled",
            "shap_scaled"]] = scaler.fit_transform(
    compare_df[["abs_coef",
                "importance_mean",
                "shap_mean_abs"]]
)
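Min-max scaling is applied column by column, so each method's scores are mapped to [0, 1] independently; only the ordering within each method is preserved, which is exactly what the comparison needs. A quick self-contained check (toy values, not the actual importances):

```python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

toy = pd.DataFrame({
    "abs_coef": [1.2, 0.7, 0.1],           # always >= 0
    "importance_mean": [0.08, -0.01, 0.0]  # can be negative
})

# Each column is rescaled independently: its max maps to 1, its min to 0,
# so a negative permutation importance simply becomes that column's 0.
scaled = MinMaxScaler().fit_transform(toy)
print(scaled)
```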
In [2938]:
top_n = 10

compare_df["mean_global"] = compare_df[["coef_scaled",
                                        "perm_scaled",
                                        "shap_scaled"]].mean(axis=1)

top_features = compare_df.sort_values("mean_global",
                                      ascending=False).head(top_n)

plt.figure(figsize=(10,6))

plt.plot(top_features["feature"],
         top_features["coef_scaled"],
         marker="o", label="Coefficient")

plt.plot(top_features["feature"],
         top_features["perm_scaled"],
         marker="o", label="Permutation")

plt.plot(top_features["feature"],
         top_features["shap_scaled"],
         marker="o", label="SHAP")

plt.xticks(rotation=45, ha="right")
plt.ylabel("Importance normalisée")
plt.title("Comparaison des méthodes d'importance globale")
plt.legend()
plt.tight_layout()
plt.show()

In [2939]:
heatmap_df = top_features.set_index("feature")[[
    "coef_scaled",
    "perm_scaled",
    "shap_scaled"
]]

plt.figure(figsize=(8,6))
sns.heatmap(heatmap_df,
            annot=True,
            cmap="Blues")
plt.title("Convergence des méthodes d'importance")
plt.show()
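Beyond the visual heatmap, the agreement between the three methods can be quantified with a Spearman rank correlation, which only compares feature orderings. A hedged sketch on toy columns (the real computation would use the scaled columns of `compare_df`):

```python
import pandas as pd

# Toy importance scores from three hypothetical methods that rank features identically.
toy = pd.DataFrame({
    "coef_scaled": [1.0, 0.6, 0.3, 0.1],
    "perm_scaled": [0.9, 0.7, 0.2, 0.0],
    "shap_scaled": [1.0, 0.5, 0.4, 0.0],
})

# Spearman correlation of 1.0 means the two methods order the features identically.
rank_corr = toy.corr(method="spearman")
print(rank_corr.round(2))
```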

SHAP scatter plots (feature/impact relationship)¶

SHAP scatter plots show the relationship between a feature's value and its impact on the model's prediction,
while highlighting possible interactions with a second feature encoded by the dot color.

In [2955]:
type(shap_values), getattr(shap_values,"shape",None)
Out[2955]:
(numpy.ndarray, (294, 40))
In [2999]:
shap_exp = shap.Explanation(
    values=shap_values,
    base_values=np.repeat(explainer.expected_value, shap_values.shape[0]),
    data=X_test_proc,                 # transformed features (after preprocessing)
    feature_names=feature_names
)

shap.plots.scatter(
    shap_exp[:, "num__augmentation_salaire_precedente_pct"],
    color=shap_exp[:, "num__note_evaluation_actuelle"],
    alpha=0.6,
    dot_size=30
)

The largest salary increases help reduce the predicted probability of leaving.
Coloring by the current evaluation score makes it possible to explore a potential interaction between performance and salary progression.

In [3000]:
shap.plots.scatter(
    shap_exp[:, "num__annees_depuis_la_derniere_promotion"],
    color=shap_exp[:, "num__revenu_mensuel"],
    alpha=0.6,
    dot_size=30
)

The more years that have passed since the last promotion, the more positive the SHAP contribution becomes, i.e. the higher the predicted departure risk.

Local interpretation (SHAP): waterfall plots¶

To illustrate how the model behaves at the individual level, a local analysis was carried out with SHAP values for two contrasting profiles:

- An employee at high risk of leaving

- An employee at low risk of leaving

Waterfall plots visualize the contribution of each feature to the individual prediction.
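A waterfall plot relies on the additivity of SHAP values: the base value plus the sum of the individual contributions equals the model output (in log-odds here, since the model is a logistic regression). For a linear model, and assuming independent features as SHAP's LinearExplainer does, this decomposition can be reproduced by hand with contribution_j = coef_j × (x_j − mean(x_j)). A self-contained sketch on toy data (not the project's model):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 3))
y = (X @ np.array([1.0, -0.5, 0.0]) > 0).astype(int)

model = LogisticRegression().fit(X, y)

x = X[0]
mean = X.mean(axis=0)

# Base value: expected model output (in log-odds) over the dataset.
base_value = model.intercept_[0] + model.coef_[0] @ mean
# Per-feature contributions for this observation (linear-model "SHAP" values).
contributions = model.coef_[0] * (x - mean)
# Additivity check: base value + contributions recover the individual log-odds.
log_odds = model.intercept_[0] + model.coef_[0] @ x
assert np.isclose(base_value + contributions.sum(), log_odds)
```

The waterfall plot simply displays these contributions sorted by magnitude, starting from the base value and ending at the individual prediction.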

In [2940]:
probas = best_logreg.predict_proba(X_test)[:, 1]

idx_high = probas.argmax()   # highest predicted risk
idx_low = probas.argmin()    # lowest predicted risk

print("Proba max :", probas[idx_high])
print("Proba min :", probas[idx_low])
Proba max : 0.9795496244411475
Proba min : 0.01617427793522368
In [2941]:
if isinstance(shap_values, list):
    shap_vals_class1 = shap_values[1]  # some explainers return one array per class
else:
    shap_vals_class1 = shap_values     # here: a single array for the positive class

CASE 1: employee at high risk of leaving

In [2942]:
print("Probabilité prédite :", probas[idx_high])
shap.plots.waterfall(
    shap.Explanation(
        values=shap_vals_class1[idx_high],
        base_values=explainer.expected_value,
        data=X_test_proc[idx_high],
        feature_names=feature_names
    ),
    max_display=10
)
Probabilité prédite : 0.9795496244411475

Here the model identifies a profile potentially exposed to work overload, high mobility, and degraded satisfaction.
The combination of these factors leads to a high estimated attrition risk.
This illustrates the multifactorial nature of departures: it is not one variable in isolation, but the accumulation of several signals, that drives the prediction.

CASE 2: employee at low risk of leaving

In [2943]:
print("Probabilité prédite :", probas[idx_low])
shap.plots.waterfall(
    shap.Explanation(
        values=shap_vals_class1[idx_low],
        base_values=explainer.expected_value,
        data=X_test_proc[idx_low],
        feature_names=feature_names
    ),
    max_display=10
)
Probabilité prédite : 0.01617427793522368

This profile corresponds to a relatively settled employee, with a controlled workload and a good level of satisfaction.
The model identifies a favorable organizational balance, which explains the low predicted probability of leaving.

Local interpretation (borderline cases)¶

In [2944]:
pred_custom = (probas >= best_threshold).astype(int)

analysis_df = pd.DataFrame({
    "proba": probas,
    "prediction": pred_custom,
    "true_value": y_test.values
})
In [2945]:
lower = 0.59
upper = 0.69

CASE 3: borderline case, correctly classified

In [2946]:
border_correct = analysis_df[
    (analysis_df["proba"].between(lower, upper)) &
    (analysis_df["prediction"] == analysis_df["true_value"])
]

idx_border_correct = border_correct.index[0]
analysis_df.loc[idx_border_correct]
Out[2946]:
proba         0.603298
prediction    0.000000
true_value    0.000000
Name: 18, dtype: float64
In [2947]:
print("Probabilité prédite :", probas[idx_border_correct])
shap.plots.waterfall(
    shap.Explanation(
        values=shap_vals_class1[idx_border_correct],
        base_values=explainer.expected_value,
        data=X_test_proc[idx_border_correct],
        feature_names=feature_names
    ),
    max_display=10
)
Probabilité prédite : 0.6032981544390328

CASE 4: borderline case, misclassified

In [2948]:
border_incorrect = analysis_df[
    (analysis_df["proba"].between(lower, upper)) &
    (analysis_df["prediction"] != analysis_df["true_value"])
]

idx_border_incorrect = border_incorrect.index[0]
analysis_df.loc[idx_border_incorrect]
Out[2948]:
proba         0.626453
prediction    0.000000
true_value    1.000000
Name: 4, dtype: float64
In [2949]:
print("Probabilité prédite :", probas[idx_border_incorrect])
shap.plots.waterfall(
    shap.Explanation(
        values=shap_vals_class1[idx_border_incorrect],
        base_values=explainer.expected_value,
        data=X_test_proc[idx_border_incorrect],
        feature_names=feature_names
    ),
    max_display=10
)
Probabilité prédite : 0.6264527802865946

This employee has an estimated departure probability of about 0.63, slightly below the optimized decision threshold of 0.64. The model therefore predicts that they stay, while the observed outcome is a departure (true value = 1).

The SHAP analysis highlights a strong contribution of overtime to the departure risk, along with a lack of training and limited tenure in the current role. However, some protective factors (environment satisfaction, PEE usage, prior experience) lower the overall score.

The proximity to the threshold shows that the error comes from the model's zone of uncertainty: had the default threshold of 0.5 been kept, this prediction would have been correct. The case thus illustrates the impact of the choice of decision threshold and the trade-off inherent in optimizing it.
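The threshold effect on this case reduces to a two-line check (probability taken from the output above, with the 0.64 optimized threshold used in this notebook):

```python
proba = 0.6264527802865946  # predicted probability from the cell above

# Optimized threshold: the model predicts "stays" (0), which is wrong here.
pred_optimized = int(proba >= 0.64)
# Default threshold: the model would predict "leaves" (1), the observed outcome.
pred_default = int(proba >= 0.5)

print(pred_optimized, pred_default)
```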