Notebook 2: Modeling¶
Importing the dataset¶
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
pd.set_option("display.max_columns",200)
pd.set_option("display.max_rows",200)
building_df = pd.read_csv("../data/processed/building_df_cleaned.csv")
building_df
|  | OSEBuildingID | Neighborhood | NumberofBuildings | PropertyGFAParking | PropertyGFABuilding(s) | LargestPropertyUseType | LargestPropertyUseTypeGFA | SiteEnergyUse(kBtu) | TotalGHGEmissions | has_secondary_use | BuildingAge |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | DOWNTOWN | 1.0 | 0 | 88434 | Hotel | 88434.0 | 7.226362e+06 | 249.98 | 0 | 89 |
| 1 | 2 | DOWNTOWN | 1.0 | 15064 | 88502 | Hotel | 83880.0 | 8.387933e+06 | 295.86 | 1 | 20 |
| 2 | 3 | DOWNTOWN | 1.0 | 196718 | 759392 | Hotel | 756493.0 | 7.258702e+07 | 2089.28 | 0 | 47 |
| 3 | 5 | DOWNTOWN | 1.0 | 0 | 61320 | Hotel | 61320.0 | 6.794584e+06 | 286.43 | 0 | 90 |
| 4 | 8 | DOWNTOWN | 1.0 | 62000 | 113580 | Hotel | 123445.0 | 1.417261e+07 | 505.01 | 1 | 36 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1609 | 50222 | GREATER DUWAMISH | 1.0 | 0 | 12294 | Office | 12294.0 | 8.497457e+05 | 20.94 | 0 | 26 |
| 1610 | 50223 | DOWNTOWN | 1.0 | 0 | 16000 | Other - Recreation | 16000.0 | 9.502762e+05 | 32.17 | 0 | 12 |
| 1611 | 50224 | MAGNOLIA / QUEEN ANNE | 1.0 | 0 | 13157 | Other - Recreation | 7583.0 | 5.765898e+06 | 223.54 | 1 | 42 |
| 1612 | 50225 | GREATER DUWAMISH | 1.0 | 0 | 14101 | Other - Recreation | 6601.0 | 7.194712e+05 | 22.11 | 1 | 27 |
| 1613 | 50226 | GREATER DUWAMISH | 1.0 | 0 | 18258 | Other - Recreation | 8271.0 | 1.152896e+06 | 41.27 | 1 | 78 |
1614 rows × 11 columns
# Descriptive statistics of the target variables
building_df[["SiteEnergyUse(kBtu)", "TotalGHGEmissions"]].describe()
|  | SiteEnergyUse(kBtu) | TotalGHGEmissions |
|---|---|---|
| count | 1.614000e+03 | 1614.000000 |
| mean | 6.418514e+06 | 131.319907 |
| std | 1.008937e+07 | 237.571748 |
| min | 5.713320e+04 | 0.000000 |
| 25% | 1.261785e+06 | 20.645000 |
| 50% | 2.563995e+06 | 49.675000 |
| 75% | 6.796881e+06 | 134.815000 |
| max | 7.258702e+07 | 2573.750000 |
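For both targets the mean sits well above the median and the maximum is an order of magnitude beyond the 75th percentile, indicating strong right skew; this is what motivates the log1p transform applied to the targets throughout the modeling below. A quick check (minimal sketch, output not recorded here):
# Skewness of the raw targets; values well above 1 confirm heavy right skew
building_df[["SiteEnergyUse(kBtu)", "TotalGHGEmissions"]].skew()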
Module imports¶
# Model selection and evaluation
from sklearn.model_selection import (
train_test_split,
GridSearchCV,
cross_validate,
KFold
)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.inspection import permutation_importance
# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler  # LabelEncoder (not used here)
from sklearn.pipeline import Pipeline
# Models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
Splitting the dataset into X and y¶
In this phase, the goal is to build predictive models to estimate:
the annual energy consumption of non-residential buildings (SiteEnergyUse(kBtu)),
as well as their total greenhouse gas emissions (TotalGHGEmissions).
These two target variables are modeled separately, since they represent distinct, albeit correlated, phenomena.
# X/y split
# Define the target variables
y_energy = building_df["SiteEnergyUse(kBtu)"]
y_ghg = building_df["TotalGHGEmissions"]
# Define the explanatory variables (features), dropping the targets and the ID
X = building_df.drop(
columns=[
"SiteEnergyUse(kBtu)",
"TotalGHGEmissions",
"OSEBuildingID"
]
)
print("Shape de X :", X.shape)
print("Shape de y_energy :", y_energy.shape)
print("Shape de y_ghg :", y_ghg.shape)
Shape of X: (1614, 8)
Shape of y_energy: (1614,)
Shape of y_ghg: (1614,)
X.info()
<class 'pandas.DataFrame'> RangeIndex: 1614 entries, 0 to 1613 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Neighborhood 1614 non-null str 1 NumberofBuildings 1614 non-null float64 2 PropertyGFAParking 1614 non-null int64 3 PropertyGFABuilding(s) 1614 non-null int64 4 LargestPropertyUseType 1614 non-null str 5 LargestPropertyUseTypeGFA 1614 non-null float64 6 has_secondary_use 1614 non-null int64 7 BuildingAge 1614 non-null int64 dtypes: float64(2), int64(4), str(2) memory usage: 101.0 KB
Preparing the features for modeling¶
Define numeric vs. categorical columns
categorical_features = ["Neighborhood", "LargestPropertyUseType"]
numeric_features = [
"NumberofBuildings",
"PropertyGFAParking",
"PropertyGFABuilding(s)",
"LargestPropertyUseTypeGFA",
"has_secondary_use",
"BuildingAge"
]
X["Neighborhood"].nunique(), X["LargestPropertyUseType"].nunique()
(19, 55)
X["LargestPropertyUseType"].value_counts().tail(10)
LargestPropertyUseType
Other - Utility                                         2
Pre-school/Daycare                                      2
Police Station                                          1
Courthouse                                              1
Wholesale Club/Supercenter                              1
Fire Station                                            1
Residential Care Facility                               1
Food Service                                            1
Movie Theater                                           1
Personal Services (Health/Beauty, Dry Cleaning, etc)    1
Name: count, dtype: int64
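Several use types appear only once or twice, which is why the encoders below use handle_unknown="ignore": a category absent from a training fold would otherwise break scoring. If these rare levels proved noisy, recent scikit-learn versions can also pool them directly in the encoder (a sketch, not applied here; the min_frequency threshold of 5 is an arbitrary choice):
# Hypothetical alternative: bucket categories seen fewer than 5 times into one "infrequent" column
encoder_rare = OneHotEncoder(handle_unknown="infrequent_if_exist", min_frequency=5)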
Generic cross-validation (CV) evaluation function¶
import time
def evaluate_model_cv(pipeline, X, y, n_splits=5):
    """
    pipeline : an already-built sklearn Pipeline
    X : features
    y : target (NOT log-transformed)
    """
    # Log-transform the target; metrics are reported both in log and real scale
    y_log = np.log1p(y)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    r2_log_scores = []
    mae_real_scores = []
    rmse_real_scores = []
    start_time = time.time()
    for train_idx, test_idx in cv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train_log = y_log.iloc[train_idx]
        y_test_log = y_log.iloc[test_idx]
        pipeline.fit(X_train, y_train_log)
        y_pred_log = pipeline.predict(X_test)
        # R² on the log scale
        r2_log_scores.append(r2_score(y_test_log, y_pred_log))
        # Back to the real scale
        y_test_real = np.expm1(y_test_log)
        y_pred_real = np.expm1(y_pred_log)
        mae_real_scores.append(mean_absolute_error(y_test_real, y_pred_real))
        rmse_real_scores.append(np.sqrt(mean_squared_error(y_test_real, y_pred_real)))
    total_time = time.time() - start_time
    return {
        "R2_log_mean": np.mean(r2_log_scores),
        "R2_log_std": np.std(r2_log_scores),
        "MAE_real_mean": np.mean(mae_real_scores),
        "RMSE_real_mean": np.mean(rmse_real_scores),
        "training_time_sec": total_time
    }
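As an aside, the manual log1p/expm1 round trip above could be delegated to scikit-learn's TransformedTargetRegressor, which applies the transform at fit time and inverts it at predict time (a minimal sketch, using the lr_pipeline defined in the next section as an example; the custom loop is kept here to control the per-fold metrics):
from sklearn.compose import TransformedTargetRegressor
# Wraps any pipeline: fits on log1p(y) and returns expm1 predictions automatically
log_target_model = TransformedTargetRegressor(
    regressor=lr_pipeline,
    func=np.log1p,
    inverse_func=np.expm1
)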
Modeling¶
Multivariate Linear Regression (baseline model)¶
preprocessor_lr = ColumnTransformer(
transformers=[
("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
("num", StandardScaler(), numeric_features),
]
)
lr_pipeline = Pipeline([
("preprocessor", preprocessor_lr),
("model", LinearRegression())
])
# Energy evaluation
lr_results_energy = evaluate_model_cv(lr_pipeline, X, y_energy)
print("Results for energy:")
print(lr_results_energy)
# CO2 evaluation
lr_results_ghg = evaluate_model_cv(lr_pipeline, X, y_ghg)
print("Results for CO2:")
print(lr_results_ghg)
Results for energy:
{'R2_log_mean': np.float64(0.5452413607780278), 'R2_log_std': np.float64(0.039186207785478454), 'MAE_real_mean': np.float64(7817846.566596624), 'RMSE_real_mean': np.float64(65608264.40706185), 'training_time_sec': 0.23400044441223145}
Results for CO2:
{'R2_log_mean': np.float64(0.38849787528459423), 'R2_log_std': np.float64(0.032650862582695596), 'MAE_real_mean': np.float64(137.42047815700948), 'RMSE_real_mean': np.float64(809.5965539599918), 'training_time_sec': 0.23362016677856445}
Random Forest¶
preprocessor = ColumnTransformer(
transformers=[
("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
("num", "passthrough", numeric_features),
]
)
rf_pipeline = Pipeline([
("preprocessor", preprocessor),
("model", RandomForestRegressor(
n_estimators=300,
random_state=42,
n_jobs=-1
))
])
# Energy evaluation
rf_results_energy = evaluate_model_cv(rf_pipeline, X, y_energy)
print("Results for energy:")
print(rf_results_energy)
# CO2 evaluation
rf_results_ghg = evaluate_model_cv(rf_pipeline, X, y_ghg)
print("Results for CO2:")
print(rf_results_ghg)
Results for energy:
{'R2_log_mean': np.float64(0.6863872485084938), 'R2_log_std': np.float64(0.036281169909893794), 'MAE_real_mean': np.float64(2671515.690922323), 'RMSE_real_mean': np.float64(5968867.91934103), 'training_time_sec': 8.432215213775635}
Results for CO2:
{'R2_log_mean': np.float64(0.4742906147792091), 'R2_log_std': np.float64(0.04401749391477097), 'MAE_real_mean': np.float64(78.70998326675934), 'RMSE_real_mean': np.float64(183.67314062913835), 'training_time_sec': 8.10239863395691}
Gradient Boosting¶
preprocessor_gb = ColumnTransformer(
transformers=[
("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
("num", "passthrough", numeric_features),
]
)
gb_pipeline = Pipeline([
("preprocessor", preprocessor_gb),
("model", GradientBoostingRegressor(
n_estimators=300,
learning_rate=0.05,
max_depth=3,
random_state=42
))
])
# Energy evaluation
gb_results_energy = evaluate_model_cv(gb_pipeline, X, y_energy)
print("Results for energy:")
print(gb_results_energy)
# CO2 evaluation
gb_results_ghg = evaluate_model_cv(gb_pipeline, X, y_ghg)
print("Results for CO2:")
print(gb_results_ghg)
Results for energy:
{'R2_log_mean': np.float64(0.7080175138072093), 'R2_log_std': np.float64(0.033466060166330115), 'MAE_real_mean': np.float64(2611702.8176943255), 'RMSE_real_mean': np.float64(5902730.207168619), 'training_time_sec': 8.930659532546997}
Results for CO2:
{'R2_log_mean': np.float64(0.5074585498106733), 'R2_log_std': np.float64(0.04280350230559401), 'MAE_real_mean': np.float64(76.66323446465908), 'RMSE_real_mean': np.float64(175.27504936415713), 'training_time_sec': 7.926228284835815}
SVM¶
preprocessor_svm = ColumnTransformer(
transformers=[
("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
("num", StandardScaler(), numeric_features),
]
)
svm_pipeline = Pipeline([
("preprocessor", preprocessor_svm),
("model", SVR(
kernel="rbf",
C=10,
epsilon=0.1
))
])
# Energy evaluation
svm_results_energy = evaluate_model_cv(svm_pipeline, X, y_energy)
print("Results for energy:")
print(svm_results_energy)
# CO2 evaluation
svm_results_ghg = evaluate_model_cv(svm_pipeline, X, y_ghg)
print("Results for CO2:")
print(svm_results_ghg)
Results for energy:
{'R2_log_mean': np.float64(0.6724536642221711), 'R2_log_std': np.float64(0.021602514841200928), 'MAE_real_mean': np.float64(3017177.4815097847), 'RMSE_real_mean': np.float64(6827361.388652851), 'training_time_sec': 2.090670347213745}
Results for CO2:
{'R2_log_mean': np.float64(0.46177980098751226), 'R2_log_std': np.float64(0.02074722185073048), 'MAE_real_mean': np.float64(84.15247598892276), 'RMSE_real_mean': np.float64(191.29799499042284), 'training_time_sec': 2.2301695346832275}
Comparison of the supervised models¶
results_df_energy = pd.DataFrame({
"LinearRegression": lr_results_energy,
"RandomForest": rf_results_energy,
"GradientBoosting": gb_results_energy,
"SVM": svm_results_energy
}).T
results_df_energy
|  | R2_log_mean | R2_log_std | MAE_real_mean | RMSE_real_mean | training_time_sec |
|---|---|---|---|---|---|
| LinearRegression | 0.545241 | 0.039186 | 7.817847e+06 | 6.560826e+07 | 0.234000 |
| RandomForest | 0.686387 | 0.036281 | 2.671516e+06 | 5.968868e+06 | 8.432215 |
| GradientBoosting | 0.708018 | 0.033466 | 2.611703e+06 | 5.902730e+06 | 8.930660 |
| SVM | 0.672454 | 0.021603 | 3.017177e+06 | 6.827361e+06 | 2.090670 |
results_df_ghg = pd.DataFrame({
"LinearRegression": lr_results_ghg,
"RandomForest": rf_results_ghg,
"GradientBoosting": gb_results_ghg,
"SVM": svm_results_ghg
}).T
results_df_ghg
|  | R2_log_mean | R2_log_std | MAE_real_mean | RMSE_real_mean | training_time_sec |
|---|---|---|---|---|---|
| LinearRegression | 0.388498 | 0.032651 | 137.420478 | 809.596554 | 0.233620 |
| RandomForest | 0.474291 | 0.044017 | 78.709983 | 183.673141 | 8.102399 |
| GradientBoosting | 0.507459 | 0.042804 | 76.663234 | 175.275049 | 7.926228 |
| SVM | 0.461780 | 0.020747 | 84.152476 | 191.297995 | 2.230170 |
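A quick visual side-by-side of the two tables (a minimal sketch using the DataFrames built above):
# Compare mean CV R² (log scale) across models for both targets
fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
results_df_energy["R2_log_mean"].plot(kind="bar", ax=axes[0], title="Energy - R² (log)")
results_df_ghg["R2_log_mean"].plot(kind="bar", ax=axes[1], title="CO2 - R² (log)")
plt.tight_layout()
plt.show()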
Model optimization and interpretation: Gradient Boosting for energy¶
# Train / Test split
X_train, X_test, y_train_energy, y_test_energy = train_test_split(
X, y_energy, test_size=0.2, random_state=42
)
# Log-transform the target y_energy
y_train_log = np.log1p(y_train_energy)
# Inner CV on the training set
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# =========================
# Hyperparameter grid
param_grid_gb = {
"model__n_estimators": [200, 300, 500],
"model__learning_rate": [0.03, 0.05, 0.1],
"model__max_depth": [1, 2, 3],
"model__min_samples_leaf": [1, 3],
"model__subsample": [0.7, 0.8]
}
# GridSearchCV
grid_gb = GridSearchCV(
estimator=gb_pipeline,
param_grid=param_grid_gb,
scoring="r2", # optimisation en log via y_train_log
cv=cv,
n_jobs=-1,
verbose=2,
refit=True
)
grid_gb.fit(X_train, y_train_log)
print(" Best CV R² (log):", grid_gb.best_score_)
print(" Best params:", grid_gb.best_params_)
best_model_energy = grid_gb.best_estimator_
# =========================
# Final evaluation on the test set (real scale)
y_pred_log = best_model_energy.predict(X_test)
# R² on the log scale
y_test_log = np.log1p(y_test_energy)
r2_log_test = r2_score(y_test_log, y_pred_log)
# Back to the real scale
y_pred_energy = np.expm1(y_pred_log)
rmse = np.sqrt(mean_squared_error(y_test_energy, y_pred_energy))
mae = mean_absolute_error(y_test_energy, y_pred_energy)
r2_real = r2_score(y_test_energy, y_pred_energy)
print("\n=== Final Test Metrics (REAL scale) pour l'énergie ===")
print(f"R² log : {r2_log_test:.3f}")
print(f"R² réel : {r2_real:.3f}")
print(f"RMSE réel: {rmse:.2f}")
print(f"MAE réel : {mae:.2f}")
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best CV R² (log): 0.7102370489427285
Best params: {'model__learning_rate': 0.1, 'model__max_depth': 2, 'model__min_samples_leaf': 3, 'model__n_estimators': 200, 'model__subsample': 0.8}
=== Final test metrics (real scale) for energy ===
R² (log): 0.739
R² (real): 0.587
RMSE (real): 6310232.29
MAE (real): 2671917.56
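Beyond the single best candidate, grid_gb.cv_results_ can be inspected to see how flat the optimum is (a sketch, output not recorded here):
# Top 5 hyperparameter combinations by mean CV R² on the log target
cv_results = pd.DataFrame(grid_gb.cv_results_)
cv_results.sort_values("rank_test_score")[["params", "mean_test_score", "std_test_score"]].head()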
Interpretation
mae_relative = mae / y_energy.mean()
rmse_relative = rmse / y_energy.mean()
print("=== Analyse des performances du modèle (énergie) ===")
print(f"R² : {r2_real:.3f}")
print(f"Erreur moyenne absolue : {mae:,.0f} kBtu ({mae_relative:.1%} de la moyenne)")
print(f"Erreur quadratique moyenne : {rmse:,.0f} kBtu ({rmse_relative:.1%} de la moyenne)")
=== Model performance analysis (energy) ===
R²: 0.587
Mean absolute error: 2,671,918 kBtu (41.6% of the mean)
Root mean squared error: 6,310,232 kBtu (98.3% of the mean)
Feature importance
# Retrieve the best pipeline
best_model_energy = grid_gb.best_estimator_
# Retrieve the GB model
gb = best_model_energy.named_steps["model"]
# Retrieve the preprocessor
preprocessor = best_model_energy.named_steps["preprocessor"]
# Retrieve the feature names after one-hot encoding
feature_names = preprocessor.get_feature_names_out()
# Retrieve the importances
importances = gb.feature_importances_
# Build a DataFrame
feat_imp = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
})
# Sort by importance
feat_imp = feat_imp.sort_values(by="importance", ascending=False)
feat_imp.head(15)
|  | feature | importance |
|---|---|---|
| 72 | num__PropertyGFABuilding(s) | 0.433674 |
| 73 | num__LargestPropertyUseTypeGFA | 0.261422 |
| 38 | cat__LargestPropertyUseType_Non-Refrigerated W... | 0.070972 |
| 66 | cat__LargestPropertyUseType_Supermarket/Grocer... | 0.041725 |
| 62 | cat__LargestPropertyUseType_Self-Storage Facility | 0.033848 |
| 24 | cat__LargestPropertyUseType_Distribution Center | 0.019815 |
| 71 | num__PropertyGFAParking | 0.018367 |
| 75 | num__BuildingAge | 0.015541 |
| 74 | num__has_secondary_use | 0.014761 |
| 31 | cat__LargestPropertyUseType_Laboratory | 0.011906 |
| 69 | cat__LargestPropertyUseType_Worship Facility | 0.011073 |
| 60 | cat__LargestPropertyUseType_Restaurant | 0.010156 |
| 30 | cat__LargestPropertyUseType_K-12 School | 0.009329 |
| 46 | cat__LargestPropertyUseType_Other - Recreation | 0.005418 |
| 8 | cat__Neighborhood_GREATER DUWAMISH | 0.005211 |
plt.figure(figsize=(10,6))
plt.barh(feat_imp["feature"][:15], feat_imp["importance"][:15])
plt.gca().invert_yaxis()
plt.title("Top 15 Feature Importances - Gradient Boosting")
plt.show()
feat_imp["group"] = feat_imp["feature"].apply(
lambda x: "LargestPropertyUseType"
if "LargestPropertyUseType_" in x else
"Neighborhood"
if "Neighborhood_" in x else
"Numeric"
)
grouped_imp = feat_imp.groupby("group")["importance"].sum().sort_values(ascending=False)
grouped_imp
group
Numeric                   0.746283
LargestPropertyUseType    0.236422
Neighborhood              0.017295
Name: importance, dtype: float64
plt.figure(figsize=(6,4))
grouped_imp.plot(kind="bar")
plt.title("Importance par groupe de variables")
plt.ylabel("Importance totale")
plt.show()
def group_feature(feature):
    if feature.startswith("num__PropertyGFABuilding"):
        return "Building area"
    elif feature.startswith("num__LargestPropertyUseTypeGFA"):
        return "Primary use area"
    elif feature.startswith("num__PropertyGFAParking"):
        return "Parking area"
    elif feature.startswith("num__BuildingAge"):
        return "Building age"
    elif feature.startswith("num__has_secondary_use"):
        return "Secondary use"
    elif "LargestPropertyUseType_" in feature:
        return "Primary use type"
    elif "Neighborhood_" in feature:
        return "Neighborhood"
    else:
        return "Number of buildings"
feat_imp["group_clean"] = feat_imp["feature"].apply(group_feature)
grouped_imp_clean = (
feat_imp.groupby("group_clean")["importance"]
.sum()
.sort_values(ascending=False)
)
grouped_imp_clean
group_clean
Building area          0.433674
Primary use area       0.261422
Primary use type       0.236422
Parking area           0.018367
Neighborhood           0.017295
Building age           0.015541
Secondary use          0.014761
Number of buildings    0.002519
Name: importance, dtype: float64
plt.figure(figsize=(8,5))
grouped_imp_clean.plot(kind="bar")
plt.title("Importance par variable métier")
plt.ylabel("Importance totale")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
The importance analysis shows that building floor area is the dominant factor in predicting energy consumption and emissions.
The primary use type also plays a significant role.
Location and secondary-use variables contribute only marginally, suggesting that a building's size and function explain most of the observed variance.
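Impurity-based importances can overstate continuous or high-cardinality features; permutation_importance (imported at the top but not used so far) offers a model-agnostic cross-check. A minimal sketch on the held-out test set, scoring against the log target the pipeline was fit on:
# Permutation importance on the raw test features (the pipeline handles the encoding);
# the model was fit on the log target, so score against log1p(y)
perm = permutation_importance(
    best_model_energy,
    X_test,
    np.log1p(y_test_energy),
    n_repeats=10,
    random_state=42,
    n_jobs=-1
)
perm_imp = pd.Series(perm.importances_mean, index=X_test.columns).sort_values(ascending=False)
perm_imp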
def predict_real_from_log_model(pipeline, X):
    """
    pipeline : sklearn Pipeline trained on y_log = log1p(y)
    Returns : (y_pred on the real scale via expm1, y_pred on the log scale)
    """
    y_pred_log = pipeline.predict(X)
    y_pred_real = np.expm1(y_pred_log)
    # Guard against small negative values caused by numerical noise
    y_pred_real = np.clip(y_pred_real, 0, None)
    return y_pred_real, y_pred_log
def plot_pred_vs_real(pipeline, X_test, y_test_real, title="Predicted vs. actual"):
    y_pred_real, y_pred_log = predict_real_from_log_model(pipeline, X_test)
    # --- Plot on the real scale ---
    plt.figure(figsize=(7,7))
    plt.scatter(y_test_real, y_pred_real, alpha=0.5)
    minv = min(y_test_real.min(), y_pred_real.min())
    maxv = max(y_test_real.max(), y_pred_real.max())
    plt.plot([minv, maxv], [minv, maxv])  # y = x diagonal
    plt.xlabel("Actual values")
    plt.ylabel("Predicted values")
    plt.title(title + " (real scale)")
    plt.tight_layout()
    plt.show()
    # --- Plot on the log scale ---
    y_test_log = np.log1p(y_test_real)
    plt.figure(figsize=(7,7))
    plt.scatter(y_test_log, y_pred_log, alpha=0.5)
    minv = min(y_test_log.min(), y_pred_log.min())
    maxv = max(y_test_log.max(), y_pred_log.max())
    plt.plot([minv, maxv], [minv, maxv])
    plt.xlabel("Actual (log1p)")
    plt.ylabel("Predicted (log1p)")
    plt.title(title + " (log scale)")
    plt.tight_layout()
    plt.show()
# Example (best_model_energy is the tuned pipeline from the grid search above):
plot_pred_vs_real(best_model_energy, X_test, y_test_energy, title="Energy")
Model optimization and interpretation: Gradient Boosting for CO2¶
# Train / Test split
X_train, X_test, y_train_ghg, y_test_ghg = train_test_split(
X, y_ghg, test_size=0.2, random_state=42
)
# Log-transform the target y_ghg
y_train_log_ghg = np.log1p(y_train_ghg)
# Inner CV on the training set
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# =========================
# Hyperparameter grid
param_grid_gb = {
"model__n_estimators": [200, 300, 500],
"model__learning_rate": [0.03, 0.05, 0.1],
"model__max_depth": [1, 2, 3],
"model__min_samples_leaf": [1, 3],
"model__subsample": [0.7, 0.8]
}
# GridSearchCV
grid_gb_ghg = GridSearchCV(
estimator=gb_pipeline,
param_grid=param_grid_gb,
scoring="r2", # optimisation en log via y_train_log_ghg
cv=cv,
n_jobs=-1,
verbose=2,
refit=True
)
grid_gb_ghg.fit(X_train, y_train_log_ghg)
print(" Best CV R² (log):", grid_gb_ghg.best_score_)
print(" Best params:", grid_gb_ghg.best_params_)
best_model_ghg = grid_gb_ghg.best_estimator_
# =========================
# Final evaluation on the test set (real scale)
y_pred_log_ghg = best_model_ghg.predict(X_test)
# R² on the log scale
y_test_log_ghg = np.log1p(y_test_ghg)
r2_log_test_ghg = r2_score(y_test_log_ghg, y_pred_log_ghg)
# Back to the real scale
y_pred_ghg = np.expm1(y_pred_log_ghg)
rmse_ghg = np.sqrt(mean_squared_error(y_test_ghg, y_pred_ghg))
mae_ghg = mean_absolute_error(y_test_ghg, y_pred_ghg)
r2_real_ghg = r2_score(y_test_ghg, y_pred_ghg)
print("\n=== Final Test Metrics (REAL scale) pour le CO2 ===")
print(f"R² log : {r2_log_test_ghg:.3f}")
print(f"R² réel : {r2_real_ghg:.3f}")
print(f"RMSE réel: {rmse_ghg:.2f}")
print(f"MAE réel : {mae_ghg:.2f}")
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best CV R² (log): 0.5141917086637996
Best params: {'model__learning_rate': 0.05, 'model__max_depth': 2, 'model__min_samples_leaf': 3, 'model__n_estimators': 300, 'model__subsample': 0.7}
=== Final test metrics (real scale) for CO2 ===
R² (log): 0.539
R² (real): 0.428
RMSE (real): 155.83
MAE (real): 71.12
mae_relative = mae_ghg / y_ghg.mean()
rmse_relative = rmse_ghg / y_ghg.mean()
print("=== Analyse des performances du modèle (CO2) ===")
print(f"R² : {r2_real_ghg:.3f}")
print(f"Erreur moyenne absolue : {mae_ghg:,.2f} ({mae_relative:.1%} de la moyenne)")
print(f"Erreur quadratique moyenne : {rmse_ghg:,.2f} ({rmse_relative:.1%} de la moyenne)")
=== Model performance analysis (CO2) ===
R²: 0.428
Mean absolute error: 71.12 (54.2% of the mean)
Root mean squared error: 155.83 (118.7% of the mean)
Feature importance
# Retrieve the best pipeline
best_model_ghg = grid_gb_ghg.best_estimator_
# Retrieve the GB model
gb_ghg = best_model_ghg.named_steps["model"]
# Retrieve the preprocessor
preprocessor_ghg = best_model_ghg.named_steps["preprocessor"]
# Retrieve the feature names after one-hot encoding
feature_names_ghg = preprocessor_ghg.get_feature_names_out()
# Retrieve the importances
importances_ghg = gb_ghg.feature_importances_
# Build a DataFrame
feat_imp_ghg = pd.DataFrame({
    "feature": feature_names_ghg,
    "importance": importances_ghg
})
# Sort by importance
feat_imp_ghg = feat_imp_ghg.sort_values(by="importance", ascending=False)
feat_imp_ghg.head(15)
|  | feature | importance |
|---|---|---|
| 72 | num__PropertyGFABuilding(s) | 0.424251 |
| 73 | num__LargestPropertyUseTypeGFA | 0.156068 |
| 38 | cat__LargestPropertyUseType_Non-Refrigerated W... | 0.083520 |
| 66 | cat__LargestPropertyUseType_Supermarket/Grocer... | 0.040964 |
| 39 | cat__LargestPropertyUseType_Office | 0.035649 |
| 75 | num__BuildingAge | 0.034694 |
| 62 | cat__LargestPropertyUseType_Self-Storage Facility | 0.032032 |
| 29 | cat__LargestPropertyUseType_Hotel | 0.027230 |
| 31 | cat__LargestPropertyUseType_Laboratory | 0.019771 |
| 60 | cat__LargestPropertyUseType_Restaurant | 0.018523 |
| 24 | cat__LargestPropertyUseType_Distribution Center | 0.018216 |
| 46 | cat__LargestPropertyUseType_Other - Recreation | 0.013956 |
| 74 | num__has_secondary_use | 0.012040 |
| 63 | cat__LargestPropertyUseType_Senior Care Community | 0.008663 |
| 70 | num__NumberofBuildings | 0.007123 |
plt.figure(figsize=(10,6))
plt.barh(feat_imp_ghg["feature"][:15], feat_imp_ghg["importance"][:15])
plt.gca().invert_yaxis()
plt.title("Top 15 Feature Importances - Gradient Boosting")
plt.show()
feat_imp_ghg["group"] = feat_imp_ghg["feature"].apply(
lambda x: "LargestPropertyUseType"
if "LargestPropertyUseType_" in x else
"Neighborhood"
if "Neighborhood_" in x else
"Numeric"
)
grouped_imp_ghg = feat_imp_ghg.groupby("group")["importance"].sum().sort_values(ascending=False)
grouped_imp_ghg
group
Numeric                   0.640692
LargestPropertyUseType    0.331804
Neighborhood              0.027504
Name: importance, dtype: float64
plt.figure(figsize=(6,4))
grouped_imp_ghg.plot(kind="bar")
plt.title("Importance par groupe de variables")
plt.ylabel("Importance totale")
plt.show()
feat_imp_ghg["group_clean"] = feat_imp_ghg["feature"].apply(group_feature)
grouped_imp_clean_ghg = (
feat_imp_ghg.groupby("group_clean")["importance"]
.sum()
.sort_values(ascending=False)
)
grouped_imp_clean_ghg
group_clean
Building area          0.424251
Primary use type       0.331804
Primary use area       0.156068
Building age           0.034694
Neighborhood           0.027504
Secondary use          0.012040
Number of buildings    0.007123
Parking area           0.006517
Name: importance, dtype: float64
plt.figure(figsize=(8,5))
grouped_imp_clean_ghg.plot(kind="bar")
plt.title("Importance par variable métier")
plt.ylabel("Importance totale")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
plot_pred_vs_real(best_model_ghg, X_test, y_test_ghg, title="GHG emissions")
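Finally, the two tuned pipelines could be persisted for reuse outside this notebook (a sketch; the ../models/ paths are a hypothetical choice):
import joblib
# Persist the tuned pipelines (hypothetical output paths)
joblib.dump(best_model_energy, "../models/gb_energy.joblib")
joblib.dump(best_model_ghg, "../models/gb_ghg.joblib")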