import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np

from pathlib import Path


pd.set_option("display.max_columns",200)
pd.set_option("display.max_rows",200)

df = pd.read_csv("../data/processed/heart_cleaning.csv")

df

X = df.drop(columns=["HeartDisease"]) 
y = df["HeartDisease"]

#définir les colonnes numériques vs catégorielles
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

numeric_features = [
    'Age',
    'RestingBP',
    'Cholesterol',
    'MaxHR',
    'Oldpeak'
]

binary_features = [
    'FastingBS',
    'Cholesterol_missing'
]

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# --- Préprocesseur pour modèle linéaire (scaling utile) ---
preprocessor_scaled = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("bin", "passthrough", binary_features),
        ("cat_nom", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features),
    ],
    remainder="drop"
)

# --- Préprocesseur pour modèles arbres (pas de scaling) ---
preprocessor_noscale = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("bin", "passthrough", binary_features),
        ("cat_nom", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    remainder="drop"
)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y 
)

X_test.shape

(184, 12)

from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

dummy_pipe = Pipeline(steps=[
    ("preprocessing", preprocessor_noscale),
    ("model", DummyClassifier(strategy="most_frequent", random_state=42)) 
])

logreg_pipe = Pipeline([
    ("preprocessing", preprocessor_scaled),
    ("model", LogisticRegression(
        max_iter=2000, 
        random_state=42))
])

logreg_balanced_pipe = Pipeline([
    ("preprocessing", preprocessor_scaled),
    ("model", LogisticRegression(
        max_iter=2000, 
        class_weight="balanced",
        random_state=42))
])

svc_pipe = Pipeline([
    ("preprocessing", preprocessor_scaled),
    ("model", SVC(
        kernel="linear",
        probability=True,  # IMPORTANT pour ROC-AUC (sinon predict_proba() ne marche pas et donc pas de ROC_AUC)
        random_state=42
    ))
])

svc_balanced_pipe = Pipeline([
    ("preprocessing", preprocessor_scaled),
    ("model", SVC(
        kernel="linear",
        class_weight="balanced",
        probability=True,
        random_state=42
    ))
])

rf_pipe = Pipeline([
    ("preprocessing", preprocessor_noscale),
    ("model", RandomForestClassifier(
        n_estimators=500,
        random_state=42
    ))
])

rf_balanced_pipe = Pipeline([
    ("preprocessing", preprocessor_noscale),
    ("model", RandomForestClassifier(
        n_estimators=500,
        random_state=42,
        class_weight="balanced"
    ))
])

xgb_pipe = Pipeline([
    ("preprocessing", preprocessor_noscale),
    ("model", XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        use_label_encoder=False,
        eval_metric="logloss"
    ))
])

n_neg = np.sum(y_train == 0)
n_pos = np.sum(y_train == 1)

ratio = n_neg / n_pos

print(ratio)

0.8078817733990148

xgb_balanced_pipe = Pipeline([
    ("preprocessing", preprocessor_noscale),
    ("model", XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        use_label_encoder=False,
        eval_metric="logloss",
        scale_pos_weight=ratio,
    ))
])

import mlflow
import mlflow.sklearn

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    fbeta_score,
    roc_auc_score,
    make_scorer,
    confusion_matrix,
    ConfusionMatrixDisplay,
    RocCurveDisplay
)

import matplotlib.pyplot as plt

MLRUNS_DIR = Path("mlruns")
MLRUNS_DIR.mkdir(exist_ok=True)

mlflow.set_tracking_uri(MLRUNS_DIR.resolve().as_uri())
mlflow.set_experiment("heart_disease_classification")

print("Tracking URI:", mlflow.get_tracking_uri())

c:\Users\bouzo\Desktop\Projet_6_IA_13042026\.venv\Lib\site-packages\mlflow\tracking\_tracking_service\utils.py:184: FutureWarning: The filesystem tracking backend (e.g., './mlruns') is deprecated as of February 2026. Consider transitioning to a database backend (e.g., 'sqlite:///mlflow.db') to take advantage of the latest MLflow features. See https://mlflow.org/docs/latest/self-hosting/migrate-from-file-store for migration guidance.
  return FileStore(store_uri, store_uri)
2026/04/25 15:31:18 INFO mlflow.tracking.fluent: Experiment with name 'heart_disease_classification' does not exist. Creating a new experiment.

Tracking URI: file:///C:/Users/bouzo/Desktop/Projet_6_IA_13042026/notebooks/mlruns

models = {
    "Dummy": dummy_pipe,
    "LogisticRegression": logreg_pipe,
    "LogisticRegression_Balanced": logreg_balanced_pipe,
    "SVC": svc_pipe,
    "SVC_Balanced": svc_balanced_pipe,
    "RandomForest": rf_pipe,
    "RandomForest_Balanced": rf_balanced_pipe,
    "XGBoost": xgb_pipe,
    "XGBoost_Balanced": xgb_balanced_pipe
}

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

scoring = {
    "accuracy": "accuracy",
    "precision": "precision",
    "recall": "recall",
    "f1": "f1",
    "f2": make_scorer(fbeta_score, beta=2),
    "roc_auc": "roc_auc"
}

mlflow.set_experiment("heart_disease_classification")

results = []

for model_name, pipeline in models.items():
    
    with mlflow.start_run(run_name=model_name):
        
        # =========================
        # Validation croisée
        # =========================
        cv_results = cross_validate(
            pipeline,
            X_train,
            y_train,
            cv=cv,
            scoring=scoring,
            n_jobs=-1
        )
        
        cv_accuracy = cv_results["test_accuracy"].mean()
        cv_precision = cv_results["test_precision"].mean()
        cv_recall = cv_results["test_recall"].mean()
        cv_f1 = cv_results["test_f1"].mean()
        cv_f2 = cv_results["test_f2"].mean()
        cv_roc_auc = cv_results["test_roc_auc"].mean()
        
        # =========================
        # Entraînement final sur train
        # =========================
        pipeline.fit(X_train, y_train)
        
        # =========================
        # Évaluation sur test
        # =========================
        y_pred = pipeline.predict(X_test)
        
        y_proba = pipeline.predict_proba(X_test)[:, 1]
        
        test_accuracy = accuracy_score(y_test, y_pred)
        test_precision = precision_score(y_test, y_pred, zero_division=0)
        test_recall = recall_score(y_test, y_pred, zero_division=0)
        test_f1 = f1_score(y_test, y_pred, zero_division=0)
        test_f2 = fbeta_score(y_test, y_pred, beta=2, zero_division=0)
        test_roc_auc = roc_auc_score(y_test, y_proba)
        
        # =========================
        # Log MLflow : métriques utiles
        # =========================
        mlflow.log_param("model_name", model_name)
        
        mlflow.log_metric("cv_accuracy", cv_accuracy)
        mlflow.log_metric("cv_precision", cv_precision)
        mlflow.log_metric("cv_recall", cv_recall)
        mlflow.log_metric("cv_f1", cv_f1)
        mlflow.log_metric("cv_f2", cv_f2)
        mlflow.log_metric("cv_roc_auc", cv_roc_auc)
        
        mlflow.log_metric("test_accuracy", test_accuracy)
        mlflow.log_metric("test_precision", test_precision)
        mlflow.log_metric("test_recall", test_recall)
        mlflow.log_metric("test_f1", test_f1)
        mlflow.log_metric("test_f2", test_f2)
        mlflow.log_metric("test_roc_auc", test_roc_auc)
        
        # =========================
        # Log artefacts : matrice confusion + ROC
        # =========================
        fig, ax = plt.subplots()
        ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
        plt.title(f"Confusion Matrix - {model_name}")
        cm_path = f"confusion_matrix_{model_name}.png"
        plt.savefig(cm_path, bbox_inches="tight")
        mlflow.log_artifact(cm_path)
        plt.close()
        
        fig, ax = plt.subplots()
        RocCurveDisplay.from_predictions(y_test, y_proba, ax=ax)
        plt.title(f"ROC Curve - {model_name}")
        roc_path = f"roc_curve_{model_name}.png"
        plt.savefig(roc_path, bbox_inches="tight")
        mlflow.log_artifact(roc_path)
        plt.close()
        
        # =========================
        # Log modèle
        # =========================
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="model"
        )
        
        # =========================
        # Résultats tableau
        # =========================
        results.append({
            "model": model_name,
            "cv_recall": cv_recall,
            "cv_f2": cv_f2,
            "cv_f1": cv_f1,
            "cv_roc_auc": cv_roc_auc,
            "test_recall": test_recall,
            "test_f2": test_f2,
            "test_f1": test_f1,
            "test_roc_auc": test_roc_auc,
            "test_precision": test_precision,
            "test_accuracy": test_accuracy
        })

2026/04/25 15:31:25 WARNING mlflow.models.model: `artifact_path` is deprecated. Please use `name` instead.
2026/04/25 15:31:25 WARNING mlflow.sklearn: Saving scikit-learn models in the pickle or cloudpickle format requires exercising caution because these formats rely on Python's object serialization mechanism, which can execute arbitrary code during deserialization. The recommended safe alternative is the 'skops' format. For more information, see: https://scikit-learn.org/stable/model_persistence.html
2026/04/25 15:31:38 WARNING mlflow.models.model: `artifact_path` is deprecated. Please use `name` instead.
2026/04/25 15:31:38 WARNING mlflow.sklearn: Saving scikit-learn models in the pickle or cloudpickle format requires exercising caution because these formats rely on Python's object serialization mechanism, which can execute arbitrary code during deserialization. The recommended safe alternative is the 'skops' format. For more information, see: https://scikit-learn.org/stable/model_persistence.html
2026/04/25 15:31:49 WARNING mlflow.models.model: `artifact_path` is deprecated. Please use `name` instead.
2026/04/25 15:31:49 WARNING mlflow.sklearn: Saving scikit-learn models in the pickle or cloudpickle format requires exercising caution because these formats rely on Python's object serialization mechanism, which can execute arbitrary code during deserialization. The recommended safe alternative is the 'skops' format. For more information, see: https://scikit-learn.org/stable/model_persistence.html
2026/04/25 15:31:56 WARNING mlflow.models.model: `artifact_path` is deprecated. Please use `name` instead.
2026/04/25 15:31:56 WARNING mlflow.sklearn: Saving scikit-learn models in the pickle or cloudpickle format requires exercising caution because these formats rely on Python's object serialization mechanism, which can execute arbitrary code during deserialization. The recommended safe alternative is the 'skops' format. For more information, see: https://scikit-learn.org/stable/model_persistence.html
2026/04/25 15:32:03 WARNING mlflow.models.model: `artifact_path` is deprecated. Please use `name` instead.
2026/04/25 15:32:03 WARNING mlflow.sklearn: Saving scikit-learn models in the pickle or cloudpickle format requires exercising caution because these formats rely on Python's object serialization mechanism, which can execute arbitrary code during deserialization. The recommended safe alternative is the 'skops' format. For more information, see: https://scikit-learn.org/stable/model_persistence.html
2026/04/25 15:32:13 WARNING mlflow.models.model: `artifact_path` is deprecated. Please use `name` instead.
2026/04/25 15:32:13 WARNING mlflow.sklearn: Saving scikit-learn models in the pickle or cloudpickle format requires exercising caution because these formats rely on Python's object serialization mechanism, which can execute arbitrary code during deserialization. The recommended safe alternative is the 'skops' format. For more information, see: https://scikit-learn.org/stable/model_persistence.html
2026/04/25 15:32:25 WARNING mlflow.models.model: `artifact_path` is deprecated. Please use `name` instead.
2026/04/25 15:32:25 WARNING mlflow.sklearn: Saving scikit-learn models in the pickle or cloudpickle format requires exercising caution because these formats rely on Python's object serialization mechanism, which can execute arbitrary code during deserialization. The recommended safe alternative is the 'skops' format. For more information, see: https://scikit-learn.org/stable/model_persistence.html
c:\Users\bouzo\Desktop\Projet_6_IA_13042026\.venv\Lib\site-packages\xgboost\training.py:200: UserWarning: [15:32:31] WARNING: C:\actions-runner\_work\xgboost\xgboost\src\learner.cc:782: 
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
2026/04/25 15:32:32 WARNING mlflow.models.model: `artifact_path` is deprecated. Please use `name` instead.
2026/04/25 15:32:32 WARNING mlflow.sklearn: Saving scikit-learn models in the pickle or cloudpickle format requires exercising caution because these formats rely on Python's object serialization mechanism, which can execute arbitrary code during deserialization. The recommended safe alternative is the 'skops' format. For more information, see: https://scikit-learn.org/stable/model_persistence.html
c:\Users\bouzo\Desktop\Projet_6_IA_13042026\.venv\Lib\site-packages\xgboost\training.py:200: UserWarning: [15:32:38] WARNING: C:\actions-runner\_work\xgboost\xgboost\src\learner.cc:782: 
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
2026/04/25 15:32:39 WARNING mlflow.models.model: `artifact_path` is deprecated. Please use `name` instead.
2026/04/25 15:32:39 WARNING mlflow.sklearn: Saving scikit-learn models in the pickle or cloudpickle format requires exercising caution because these formats rely on Python's object serialization mechanism, which can execute arbitrary code during deserialization. The recommended safe alternative is the 'skops' format. For more information, see: https://scikit-learn.org/stable/model_persistence.html

import os

print("cwd :", os.getcwd())
print("tracking_uri :", mlflow.get_tracking_uri())
print("experiments :")
print(mlflow.search_experiments())

cwd : c:\Users\bouzo\Desktop\Projet_6_IA_13042026\notebooks
tracking_uri : file:///C:/Users/bouzo/Desktop/Projet_6_IA_13042026/notebooks/mlruns
experiments :
[<Experiment: artifact_location='file:///C:/Users/bouzo/Desktop/Projet_6_IA_13042026/notebooks/mlruns/212082661121986186', creation_time=1777123878877, experiment_id='212082661121986186', last_update_time=1777123878877, lifecycle_stage='active', name='heart_disease_classification', tags={}, trace_location=None, workspace='default'>]

runs = mlflow.search_runs()
runs

results_df = pd.DataFrame(results)

results_df = results_df.sort_values(
    by=["test_f2", "test_recall", "test_roc_auc"],
    ascending=False
)

results_df

results_df.to_csv("../reports/model_comparison_results.csv", index=False)

from sklearn.model_selection import GridSearchCV

f2_scorer = make_scorer(fbeta_score, beta=2)

param_grid = {
    "model__n_estimators": [300, 500],
    "model__max_depth": [None, 5, 10],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__class_weight": ["balanced"]
}

grid_search = GridSearchCV(
    estimator=rf_balanced_pipe,
    param_grid=param_grid,
    scoring=f2_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
print("Best CV F2:", grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best params: {'model__class_weight': 'balanced', 'model__max_depth': None, 'model__min_samples_leaf': 4, 'model__min_samples_split': 2, 'model__n_estimators': 300}
Best CV F2: 0.8948992578648551

best_rf_model = grid_search.best_estimator_

from sklearn.metrics import classification_report

y_pred = best_rf_model.predict(X_test)
y_proba = best_rf_model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))

ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.title("Confusion Matrix - Tuned RandomForest")
plt.show()

RocCurveDisplay.from_predictions(y_test, y_proba)
plt.title("ROC Curve - Tuned RandomForest")
plt.show()

              precision    recall  f1-score   support

           0       0.91      0.85      0.88        82
           1       0.89      0.93      0.91       102

    accuracy                           0.90       184
   macro avg       0.90      0.89      0.89       184
weighted avg       0.90      0.90      0.90       184

from sklearn.metrics import precision_recall_curve

# probabilités
y_proba = best_rf_model.predict_proba(X_test)[:, 1]

# PR curve
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

# calcul F2
beta = 2
f2_scores = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall + 1e-10)

# meilleur seuil
best_idx = np.argmax(f2_scores)
best_threshold = thresholds[best_idx]

print("Best threshold:", best_threshold)
print("Best F2:", f2_scores[best_idx])

Best threshold: 0.477421727458697
Best F2: 0.9302325581188479

plt.figure(figsize=(6,4))
plt.plot(recall, precision, label="PR Curve")
plt.scatter(recall[best_idx], precision[best_idx], color="red", label="Best F2 point")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.show()

y_pred_opt = (y_proba >= best_threshold).astype(int)

y_pred_opt = best_rf_model.predict(X_test)
y_proba = best_rf_model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred_opt))

ConfusionMatrixDisplay.from_predictions(y_test, y_pred_opt)
plt.title("Confusion Matrix - Tuned RandomForest")
plt.show()

RocCurveDisplay.from_predictions(y_test, y_proba)
plt.title("ROC Curve - Tuned RandomForest")
plt.show()

              precision    recall  f1-score   support

           0       0.91      0.85      0.88        82
           1       0.89      0.93      0.91       102

    accuracy                           0.90       184
   macro avg       0.90      0.89      0.89       184
weighted avg       0.90      0.90      0.90       184

import joblib
from sklearn.base import BaseEstimator, ClassifierMixin

class ThresholdClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model, threshold=0.5):
        self.model = model
        self.threshold = threshold

    def fit(self, X, y=None):
        # modèle déjà entraîné (on ne refit pas)
        return self

    def predict_proba(self, X):
        return self.model.predict_proba(X)

    def predict(self, X):
        proba = self.predict_proba(X)[:, 1]
        return (proba >= self.threshold).astype(int)

# Création du classifieur final “avec seuil”
final_clf = ThresholdClassifier(best_rf_model, threshold=float(best_threshold))

# Sauvegarde
joblib.dump(final_clf, "heart_threshold_model.joblib")

['heart_threshold_model.joblib']

# Test
final_clf_loaded = joblib.load("heart_threshold_model.joblib")
final_clf_loaded.predict(X_test.iloc[:5]), final_clf_loaded.predict_proba(X_test.iloc[:5])[:, 1]

(array([1, 0, 1, 1, 0]),
 array([0.95844599, 0.21552239, 0.93021092, 0.58731918, 0.38139272]))

def scoring_heart(
    X_patient: pd.DataFrame,
    clf,
    threshold: float,
    y_true: pd.Series | None = None,
    top_n: int = 10,
    cols_features: list[str] | None = None
) -> pd.DataFrame:

    proba = clf.predict_proba(X_patient)[:, 1]
    pred = (proba >= threshold).astype(int)

    df = pd.DataFrame(index=X_patient.index)
    if cols_features is None:
        # par défaut: on garde juste quelques colonnes (à adapter)
        cols_features = list(X_patient.columns[:6])  
    df = X_patient[cols_features].copy()

    df["proba_maladie"] = proba
    df["prediction"] = pred

    # Catégories de risque 
    df["niveau_risque"] = pd.cut(
        df["proba_maladie"],
        bins=[-np.inf, 0.30, 0.60, 0.80, np.inf],
        labels=["Faible", "Moyen", "Élevé", "Très élevé"]
    )

    df["decision"] = np.where(df["prediction"] == 1, "A risque (Malade=1)", "Faible risque (Sain=0)")
    
    if y_true is not None:
        df["y_true"] = y_true.loc[df.index].values
        df["resultat"] = np.where(df["prediction"] == df["y_true"], "✅ OK", "❌ Erreur")

    # tri: les plus à risque en haut
    df = df.sort_values("proba_maladie", ascending=False).head(top_n)

    df["proba_maladie"] = df["proba_maladie"].round(3)

    return df

threshold = final_clf.threshold  
# Features à afficher 
cols = ["Age", "Sex", "ChestPainType", "RestingBP", "Cholesterol","FastingBS","RestingECG","MaxHR","ExerciseAngina","Oldpeak","ST_Slope","Cholesterol_missing"]  # à adapter feature clés 

top10 = scoring_heart(
    X_test, final_clf, threshold,
    y_true=y_test,
    top_n=10,
    cols_features=cols
)

display(top10)

import shap

# 1) Récupérer le préprocessing + modèle final
preprocess = best_rf_model.named_steps["preprocessing"]
model = best_rf_model.named_steps["model"]

# 2) Transformer les données
X_train_proc = preprocess.transform(X_train)
X_test_proc = preprocess.transform(X_test)

# 3) Convertir en dense si besoin
if hasattr(X_train_proc, "toarray"):
    X_train_proc = X_train_proc.toarray()
    X_test_proc = X_test_proc.toarray()

# 4) Récupérer les noms des variables transformées
feature_names = preprocess.get_feature_names_out()

# 5) SHAP TreeExplainer pour Random Forest
explainer = shap.TreeExplainer(model)

# 6) Valeurs SHAP sur le test
shap_values = explainer.shap_values(X_test_proc)

# Pour classification binaire, on récupère la classe 1
shap_values_class_1 = shap_values[:, :, 1]

# 7) Beeeswarm plot
shap.summary_plot(
    shap_values_class_1,
    X_test_proc,
    feature_names=feature_names,
    max_display=20
)

type(shap_values), getattr(shap_values,"shape",None)

(numpy.ndarray, (184, 21, 2))

shap_class1 = shap.Explanation(
    values=shap_values[:, :, 1],
    data=X_test_proc,
    feature_names=feature_names
)

shap.plots.scatter(
    shap_class1[:, "num__MaxHR"],
    color=shap_class1[:, "num__Age"],
    alpha=0.6,
    dot_size=30,
    show=False
)

plt.title("Impact de MaxHR sur la prédiction, coloré par l'âge")
plt.show()

shap.plots.scatter(
    shap_class1[:, "num__Oldpeak"],
    color=shap_class1[:, "cat_nom__ST_Slope_Flat"],
    alpha=0.6,
    dot_size=30,
    show=False
)

plt.title("Impact de Oldpeak sur la prédiction, coloré par ST_Slope_Flat")
plt.show()

probas = best_rf_model.predict_proba(X_test)[:, 1]

idx_high = probas.argmax()   # plus fort risque
idx_low = probas.argmin()    # plus faible risque

print("Proba max :", probas[idx_high])
print("Proba min :", probas[idx_low])

Proba max : 0.9819412239707195
Proba min : 0.005985911159452335

if isinstance(shap_values, list):
    shap_vals_class1 = shap_values[1]
else:
    shap_vals_class1 = shap_values

print("Probabilité prédite :", probas[idx_high])

shap.plots.waterfall(
    shap.Explanation(
        values=shap_values[idx_high, :, 1],
        base_values=explainer.expected_value[1],
        data=X_test_proc[idx_high],
        feature_names=feature_names
    ),
    max_display=10
)

Probabilité prédite : 0.9819412239707195

print("Probabilité prédite :", probas[idx_low])

shap.plots.waterfall(
    shap.Explanation(
        values=shap_values[idx_low, :, 1],
        base_values=explainer.expected_value[1],
        data=X_test_proc[idx_low],
        feature_names=feature_names
    ),
    max_display=10
)

Probabilité prédite : 0.005985911159452335

probas = model.predict_proba(X_test_proc)[:,1]

idx_border = np.argmin(np.abs(probas - best_threshold))

proba_border = probas[idx_border]

pred_border = int(proba_border >= best_threshold)

print("Probabilité :", proba_border)
print("Seuil :", best_threshold)
print("Vraie classe :", y_test.iloc[idx_border])
print("Classe prédite :", pred_border)

Probabilité : 0.477421727458697
Seuil : 0.477421727458697
Vraie classe : 1
Classe prédite : 1

shap.plots.waterfall(
    shap.Explanation(
        values=shap_values[idx_border, :, 1],
        base_values=explainer.expected_value[1],
        data=X_test_proc[idx_border],
        feature_names=feature_names
    ),
    max_display=10
)

probas = model.predict_proba(X_test_proc)[:, 1]

preds = (probas >= best_threshold).astype(int)

# masque des erreurs
errors = preds != y_test.values

# parmi les erreurs, prendre celle la plus proche du seuil
idx_border_wrong = np.where(errors)[0][
    np.argmin(np.abs(probas[errors] - best_threshold))
]

proba_border_wrong = probas[idx_border_wrong]
pred_border_wrong = preds[idx_border_wrong]

print("Probabilité :", proba_border_wrong)
print("Seuil :", best_threshold)
print("Vraie classe :", y_test.iloc[idx_border_wrong])
print("Classe prédite :", pred_border_wrong)
print("Distance au seuil :", proba_border_wrong - best_threshold)

Probabilité : 0.5235982302650032
Seuil : 0.477421727458697
Vraie classe : 0
Classe prédite : 1
Distance au seuil : 0.0461765028063062

shap.plots.waterfall(
    shap.Explanation(
        values=shap_values[idx_border_wrong, :, 1],
        base_values=explainer.expected_value[1],
        data=X_test_proc[idx_border_wrong],
        feature_names=feature_names
    ),
    max_display=10
)

	Age	Sex	ChestPainType	RestingBP	Cholesterol	FastingBS	RestingECG	MaxHR	ExerciseAngina	Oldpeak	ST_Slope	HeartDisease	Cholesterol_missing
0	40	M	ATA	140.0	289.0	0	Normal	172	N	0.0	Up	0	0
1	49	F	NAP	160.0	180.0	0	Normal	156	N	1.0	Flat	1	0
2	37	M	ATA	130.0	283.0	0	ST	98	N	0.0	Up	0	0
3	48	F	ASY	138.0	214.0	0	Normal	108	Y	1.5	Flat	1	0
4	54	M	NAP	150.0	195.0	0	Normal	122	N	0.0	Up	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...
913	45	M	TA	110.0	264.0	0	Normal	132	N	1.2	Flat	1	0
914	68	M	ASY	144.0	193.0	1	Normal	141	N	3.4	Flat	1	0
915	57	M	ASY	130.0	131.0	0	Normal	115	Y	1.2	Flat	1	0
916	57	F	ATA	130.0	236.0	0	LVH	174	N	0.0	Flat	1	0
917	38	M	NAP	138.0	175.0	0	Normal	173	N	0.0	Up	0	0

	run_id	experiment_id	status	artifact_uri	start_time	end_time	metrics.test_recall	metrics.test_f1	metrics.cv_accuracy	metrics.cv_precision	metrics.test_accuracy	metrics.test_precision	metrics.cv_roc_auc	metrics.test_roc_auc	metrics.cv_f1	metrics.cv_recall	metrics.test_f2	metrics.cv_f2	params.model_name	tags.mlflow.user	tags.mlflow.source.name	tags.mlflow.source.type	tags.mlflow.runName
0	59341c8111df4990a70bb8e3a245720f	212082661121986186	FINISHED	file:///C:/Users/bouzo/Desktop/Projet_6_IA_130...	2026-04-25 13:32:38.529000+00:00	2026-04-25 13:32:45.323000+00:00	0.872549	0.881188	0.867859	0.884873	0.869565	0.890000	0.925658	0.922286	0.880368	0.876874	0.875984	0.878151	XGBoost_Balanced	bouzo	notebook_modelisation.ipynb	NOTEBOOK	XGBoost_Balanced
1	92b797a9718446daba60558faf4aaf8a	212082661121986186	FINISHED	file:///C:/Users/bouzo/Desktop/Projet_6_IA_130...	2026-04-25 13:32:31.471000+00:00	2026-04-25 13:32:38.521000+00:00	0.872549	0.881188	0.865138	0.878205	0.869565	0.890000	0.926564	0.922884	0.878470	0.879344	0.875984	0.878919	XGBoost	bouzo	notebook_modelisation.ipynb	NOTEBOOK	XGBoost
2	83718a7f033f47598815ec7215b2d58c	212082661121986186	FINISHED	file:///C:/Users/bouzo/Desktop/Projet_6_IA_130...	2026-04-25 13:32:21.574000+00:00	2026-04-25 13:32:31.464000+00:00	0.941176	0.918660	0.859715	0.858237	0.907609	0.897196	0.929142	0.938187	0.876495	0.899036	0.932039	0.889472	RandomForest_Balanced	bouzo	notebook_modelisation.ipynb	NOTEBOOK	RandomForest_Balanced
3	1fcebb0b6796425084e6927846e9d430	212082661121986186	FINISHED	file:///C:/Users/bouzo/Desktop/Projet_6_IA_130...	2026-04-25 13:32:09.569000+00:00	2026-04-25 13:32:21.567000+00:00	0.941176	0.918660	0.856994	0.858815	0.907609	0.897196	0.928061	0.937231	0.873288	0.891629	0.932039	0.883809	RandomForest	bouzo	notebook_modelisation.ipynb	NOTEBOOK	RandomForest
4	0653d983b8414d919e479ea3ee90d280	212082661121986186	FINISHED	file:///C:/Users/bouzo/Desktop/Projet_6_IA_130...	2026-04-25 13:32:02.625000+00:00	2026-04-25 13:32:09.559000+00:00	0.911765	0.889952	0.854273	0.862344	0.875000	0.869159	0.924334	0.935916	0.870187	0.879344	0.902913	0.875514	SVC_Balanced	bouzo	notebook_modelisation.ipynb	NOTEBOOK	SVC_Balanced
5	f9ef6aabe48e4ba7a8a39a877d430913	212082661121986186	FINISHED	file:///C:/Users/bouzo/Desktop/Projet_6_IA_130...	2026-04-25 13:31:55.670000+00:00	2026-04-25 13:32:02.613000+00:00	0.921569	0.895238	0.852903	0.853150	0.880435	0.870370	0.925610	0.936872	0.871116	0.891689	0.910853	0.883113	SVC	bouzo	notebook_modelisation.ipynb	NOTEBOOK	SVC
6	51da173c4dec4a0699510f20ab4708b5	212082661121986186	FINISHED	file:///C:/Users/bouzo/Desktop/Projet_6_IA_130...	2026-04-25 13:31:45.230000+00:00	2026-04-25 13:31:55.655000+00:00	0.892157	0.900990	0.852912	0.863726	0.891304	0.910000	0.927447	0.933883	0.868538	0.874466	0.895669	0.871952	LogisticRegression_Balanced	bouzo	notebook_modelisation.ipynb	NOTEBOOK	LogisticRegression_Balanced
7	78bfff622322450bad57be4cad22dcff	212082661121986186	FINISHED	file:///C:/Users/bouzo/Desktop/Projet_6_IA_130...	2026-04-25 13:31:32.400000+00:00	2026-04-25 13:31:45.220000+00:00	0.921569	0.908213	0.854254	0.859422	0.896739	0.895238	0.927750	0.934242	0.870910	0.884312	0.916179	0.878699	LogisticRegression	bouzo	notebook_modelisation.ipynb	NOTEBOOK	LogisticRegression
8	da5f6c258e3b48fab70a85c2c2486114	212082661121986186	FINISHED	file:///C:/Users/bouzo/Desktop/Projet_6_IA_130...	2026-04-25 13:31:18.971000+00:00	2026-04-25 13:31:32.388000+00:00	1.000000	0.713287	0.553136	0.553136	0.554348	0.554348	0.500000	0.500000	0.712279	1.000000	0.861486	0.860896	Dummy	bouzo	notebook_modelisation.ipynb	NOTEBOOK	Dummy

	model	cv_recall	cv_f2	cv_f1	cv_roc_auc	test_recall	test_f2	test_f1	test_roc_auc	test_precision	test_accuracy
6	RandomForest_Balanced	0.899036	0.889472	0.876495	0.929142	0.941176	0.932039	0.918660	0.938187	0.897196	0.907609
5	RandomForest	0.891629	0.883809	0.873288	0.928061	0.941176	0.932039	0.918660	0.937231	0.897196	0.907609
1	LogisticRegression	0.884312	0.878699	0.870910	0.927750	0.921569	0.916179	0.908213	0.934242	0.895238	0.896739
3	SVC	0.891689	0.883113	0.871116	0.925610	0.921569	0.910853	0.895238	0.936872	0.870370	0.880435
4	SVC_Balanced	0.879344	0.875514	0.870187	0.924334	0.911765	0.902913	0.889952	0.935916	0.869159	0.875000
2	LogisticRegression_Balanced	0.874466	0.871952	0.868538	0.927447	0.892157	0.895669	0.900990	0.933883	0.910000	0.891304
7	XGBoost	0.879344	0.878919	0.878470	0.926564	0.872549	0.875984	0.881188	0.922884	0.890000	0.869565
8	XGBoost_Balanced	0.876874	0.878151	0.880368	0.925658	0.872549	0.875984	0.881188	0.922286	0.890000	0.869565
0	Dummy	1.000000	0.860896	0.712279	0.500000	1.000000	0.861486	0.713287	0.500000	0.554348	0.554348

Projet : Système d'aide au dépistage précoce du risque de maladie cardiaque

Notebook 2 : Modelisation

Section 1 : Préparatation des donnnées en vu de la modélisation

Importation dataset¶

Séparation en X/y¶

Préprocessing¶

Split train/test (stratifié)¶

Pipeline pour chaque modèle¶

Section 2 : Modélisation

Imports métriques + MLflow¶

Dictionnaire des modèles¶

Scoring validation croisée¶

Fonction pour évaluer + logger dans MLflow¶

Tableau comparatif + sauvegarde du tableau¶

Modèle retenu : RandomForest_Balanced¶

Optimisation des hyperparamètres par GridSearchCV¶

Section 3 : Sauvegarde du modèle

Fonction de scoring (démonstration, exemple d'usage)¶

Section 4 : Interprétabilité & Résultats

Interprétabilité des métriques¶

Performance globale du modèle¶

Compromis précision/recall et implications métier¶

Choix du F2-score comme métrique principale¶

Interprétation globale :¶

SHAP Beeswarm plot¶

Scatter SHAP (relation feature/impact)¶

Interprétation locale (SHAP) : Waterfall plot¶

	Age	Sex	ChestPainType	RestingBP	Cholesterol	FastingBS	RestingECG	MaxHR	ExerciseAngina	Oldpeak	ST_Slope	Cholesterol_missing	proba_maladie	prediction	niveau_risque	decision	y_true	resultat
575	56	M	ASY	137.0	282.0	1	Normal	126	Y	1.2	Flat	0	0.982	1	Très élevé	A risque (Malade=1)	1	✅ OK
535	56	M	ASY	130.0	237.0	0	LVH	122	Y	1.0	Flat	1	0.980	1	Très élevé	A risque (Malade=1)	1	✅ OK
442	51	M	ASY	128.0	237.0	1	ST	125	Y	1.2	Flat	1	0.977	1	Très élevé	A risque (Malade=1)	1	✅ OK
460	57	M	ASY	139.0	277.0	1	ST	118	Y	1.9	Flat	0	0.976	1	Très élevé	A risque (Malade=1)	1	✅ OK
412	56	M	ASY	125.0	237.0	1	Normal	103	Y	1.0	Flat	1	0.975	1	Très élevé	A risque (Malade=1)	1	✅ OK
484	65	M	ASY	134.0	237.0	0	Normal	112	Y	1.1	Flat	1	0.973	1	Très élevé	A risque (Malade=1)	1	✅ OK
347	48	M	ASY	115.0	237.0	1	Normal	128	N	0.0	Flat	1	0.971	1	Très élevé	A risque (Malade=1)	1	✅ OK
353	58	M	ASY	130.0	237.0	0	ST	100	Y	1.0	Flat	1	0.970	1	Très élevé	A risque (Malade=1)	1	✅ OK
414	54	M	ASY	130.0	237.0	1	Normal	110	Y	3.0	Flat	1	0.967	1	Très élevé	A risque (Malade=1)	1	✅ OK
454	58	M	ASY	136.0	203.0	1	Normal	123	Y	1.2	Flat	0	0.962	1	Très élevé	A risque (Malade=1)	1	✅ OK