import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

#importation et affichage

billets = pd.read_csv('billets.csv', sep=";")
billets.head()

billets.info()
billets.describe().T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   is_genuine    1500 non-null   bool   
 1   diagonal      1500 non-null   float64
 2   height_left   1500 non-null   float64
 3   height_right  1500 non-null   float64
 4   margin_low    1463 non-null   float64
 5   margin_up     1500 non-null   float64
 6   length        1500 non-null   float64
dtypes: bool(1), float64(6)
memory usage: 71.9 KB

def doublons(df):
    """Print the number of rows of ``df`` that duplicate an earlier row.

    A row counts as a duplicate when all of its column values match a row
    seen before it. The count is written to stdout followed by the word
    'doublons'; nothing is returned.
    """
    nb_duplicated = int(df.duplicated().sum())
    print(nb_duplicated, 'doublons')

doublons(billets)

0 doublons

billets.isna().sum()

is_genuine       0
diagonal         0
height_left      0
height_right     0
margin_low      37
margin_up        0
length           0
dtype: int64

(billets["margin_low"].isna().sum() / len(billets))*100

2.466666666666667

corr=billets.drop(columns='is_genuine').corr() ##on garde seulement les variables numériques
sns.heatmap(corr, annot=True, cmap="Blues", fmt=".2f",vmin= -1, vmax=1, center=0)
plt.savefig("matriceCorr.png",dpi=300, bbox_inches="tight")
plt.show()

sns.pairplot(billets, y_vars='margin_low', x_vars=['diagonal','height_left','height_right','margin_up','length'])
plt.savefig("pairplot.png",dpi=300, bbox_inches="tight")
plt.show()

sns.histplot(billets['margin_low'], kde=True, color='blue')
plt.title("Distribution de margin_low avant imputation")
plt.xlabel("margin_low (mm)")
plt.ylabel("Fréquence")

plt.savefig("histo.png",dpi=300, bbox_inches="tight")
plt.show()

#corr=billets.drop(columns='is_genuine').corr(method='spearman')

#on retire les NaN (on apprend uniquement sur les données complètes)
# Drop the rows where margin_low is missing: the model is fitted on complete rows only
billets_sans_NaN = billets.dropna(subset=['margin_low'])

# Explanatory variables
X = billets_sans_NaN[['length','margin_up']]
# Target variable
y = billets_sans_NaN['margin_low']

# Create and fit the linear regression
model= LinearRegression()
model.fit(X,y)

# Read the fitted coefficients
# Slopes (b1, b2), one row per column of X
coefs= pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])
print(coefs)
# Intercept (b0)
print("Intercept:", model.intercept_)

# Goodness of fit: R² computed on the same rows used for fitting (in-sample score)
print("Score R² :", model.score(X,y))

           Coefficient
length       -0.461099
margin_up     0.331252
Intercept: 55.39568940242489
Score R² : 0.454277286393189

y_pred = model.predict(X) #application du modele 
residus = y - y_pred      #calcul des résidus

#Méthode 1 : déjà effectué 

#Méthode 2
sns.scatterplot(x=y_pred, y=residus, color="royalblue")
plt.axhline(y=0, color='red', linestyle='--')
plt.xlabel("Valeurs prédites")
plt.ylabel("Résidus")
plt.title("Analyse des résidus du modèle linéaire")
plt.savefig("ScatterplotResidus.png",dpi=300, bbox_inches="tight")
plt.show()

# Multicollinearity check: variance inflation factor (VIF) of the regressors.
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Design matrix = the two predictors plus the constant column added by add_constant.
X = sm.add_constant(billets_sans_NaN[['margin_up','length']])
# One VIF per column of X, constant included, so the printed list holds three
# values. add_constant prepends the constant by default, so the first value
# belongs to the constant and is not a predictor VIF.
vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif)

[25645.730848214487, 1.3728452237329334, 1.3728452237329336]

from statsmodels.stats.stattools import durbin_watson

dw = durbin_watson(residus)
print("Durbin–Watson :", dw)

Durbin–Watson : 1.8747486051663327

#Méthode 1 : déjà effectuée.

#Méthode 2: 
from statsmodels.stats.diagnostic import het_breuschpagan

X = sm.add_constant(billets_sans_NaN[['length','margin_up']])  # X : variables explicatives (avec constante)
y = billets_sans_NaN['margin_low'] # variable cible

# Modèle de régression linéaire
model = sm.OLS(y, X).fit()

# Test de Breusch–Pagan
bp_test = het_breuschpagan(model.resid, model.model.exog)

# Résultats
labels = ['Statistique LM', 'p-value LM', 'Statistique F', 'p-value F'] 
print(dict(zip(labels, bp_test)))

{'Statistique LM': 69.18185711374537, 'p-value LM': 9.49184426089401e-16, 'Statistique F': 36.23338952128672, 'p-value F': 4.3863636385048543e-16}

#option 1 : model_robust = model_ols.get_robustcov_results(cov_type='HC3')
#model_robust.summary()
#option2 : billets_sans_nan['log_margin_low'] = np.log(billets_sans_nan['margin_low'])

#methode 1 
sns.histplot(residus, kde=True, color="blue")
plt.title("Distribution des résidus")
plt.xlabel("Résidus")
plt.ylabel("Fréquence")
plt.savefig("histoResidus.png",dpi=300, bbox_inches="tight")
plt.show()

sm.qqplot(residus,line='s')
plt.title("QQ-plot des résidus")
plt.savefig("QQ-Residus.png",dpi=300, bbox_inches="tight")
plt.show()

#methode 2
from scipy.stats import shapiro
shapiro_test = shapiro(residus) 
print("Statistique de Shapiro-Wilk :", shapiro_test.statistic)
print("p-value :", shapiro_test.pvalue)

Statistique de Shapiro-Wilk : 0.9835276366015211
p-value : 7.061289084592733e-12

# Vérifier qu'il n'y a pas de zéro ou de valeur négative pour appliquer la transformation logarithmique
print((billets_sans_NaN['margin_low'] <= 0).sum())

0

billets_sans_NaN = billets_sans_NaN.copy()
billets_sans_NaN['log_margin_low'] = np.log(billets_sans_NaN['margin_low'])

#Variables explicatives 
X = billets_sans_NaN[['length','margin_up']]
#Variable cible 
y = billets_sans_NaN['log_margin_low']

#création et entraînement du modèle
model= LinearRegression()
model.fit(X,y)

#Lecture des coefficients
#coefficients (b1, b2)
coefs= pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])
print(coefs)
#Intercept (b0)
print("Intercept:", model.intercept_)

#Vérification de la qualité du model
print("Score R² :", model.score(X,y))

           Coefficient
length       -0.097883
margin_up     0.071936
Intercept: 12.292772567680142
Score R² : 0.44951167035465944

# Calcul des résidus
y_pred_log = model.predict(X)
residus_log = y - y_pred_log  # résidus du modèle avec log(Y)

#Pour la normalité:
# Test de Shapiro
shapiro_log = shapiro(residus_log)
print("Statistique de Shapiro-Wilk :", shapiro_log.statistic)
print("p-value :", shapiro_log.pvalue)

# Histogramme
sns.histplot(residus_log, kde=True,color="blue")
plt.title("Distribution des résidus (Y en log)")
plt.xlabel("Résidus")
plt.ylabel("Fréquence")
plt.savefig("histo2residus.png",dpi=300, bbox_inches="tight")
plt.show()

# QQ-plot
sm.qqplot(residus_log, line='s')
plt.title("QQ-plot des résidus (Y en log)")
plt.savefig("QQ-Residus2.png",dpi=300, bbox_inches="tight")
plt.show()

#Pour l'Homoscedasticité:
from statsmodels.stats.diagnostic import het_breuschpagan

X = sm.add_constant(billets_sans_NaN[['length','margin_up']])  # X : variables explicatives (avec constante)
y = billets_sans_NaN['log_margin_low'] # variable cible

# Modèle de régression linéaire
model = sm.OLS(y, X).fit()

# Test de Breusch–Pagan
bp_test = het_breuschpagan(model.resid, model.model.exog)

# Résultats
labels = ['Statistique LM', 'p-value LM', 'Statistique F', 'p-value F'] 
print(dict(zip(labels, bp_test)))

Statistique de Shapiro-Wilk : 0.9964320268816002
p-value : 0.0018019276108824324

{'Statistique LM': 28.423456686187922, 'p-value LM': 6.728601410461491e-07, 'Statistique F': 14.463587514812993, 'p-value F': 6.02506162387871e-07}

#selection des lignes avec valeurs manquantes 
# Select the rows whose margin_low is missing
mask_nan= billets['margin_low'].isna()

# Predict the missing values on the log scale, then map them back with exp.
# `model` must be the statsmodels OLS fitted on log(margin_low) with the
# columns (const, length, margin_up); X_new is built in that same order.
# NOTE(review): exp() of a log-scale prediction is not the conditional mean on
# the original scale (retransformation bias) — confirm this is acceptable here.
if mask_nan.sum() > 0:
    X_new=billets.loc[mask_nan, ['length', 'margin_up']]
    # has_constant='add' forces a constant column to be added
    X_new=sm.add_constant(X_new, has_constant='add')
    
    y_pred_log = model.predict(X_new)
    # Write the imputed values in place into the original DataFrame
    billets.loc[mask_nan, 'margin_low'] = np.exp(y_pred_log)
    print("Imputation effectuée après transformation logarithmique.")
else:
    print("Aucune valeur manquante à imputer.")

Imputation effectuée après transformation logarithmique.

#Vérification
billets.isna().sum()

is_genuine      0
diagonal        0
height_left     0
height_right    0
margin_low      0
margin_up       0
length          0
dtype: int64

sns.histplot(billets['margin_low'], kde=True, color='blue')
plt.title("Distribution de margin_low après imputation")
plt.xlabel("margin_low (mm)")
plt.ylabel("Fréquence")
plt.savefig("histo_apreslog.png",dpi=300, bbox_inches="tight")
plt.show()

# stats avant/après
print("Avant  | mean, median, std :", billets_sans_NaN['margin_low'].mean(),
      billets_sans_NaN['margin_low'].median(),
      billets_sans_NaN['margin_low'].std())

print("Après  | mean, median, std :", billets['margin_low'].mean(),
      billets['margin_low'].median(),
      billets['margin_low'].std())

Avant  | mean, median, std : 4.485967190704033 4.31 0.6638126241773387
Après  | mean, median, std : 4.483173433424014 4.31 0.6590884493182334

num_cols = billets.select_dtypes(include=[np.number]).columns.tolist()

for col in num_cols:
    fig, ax= plt.subplots(1,2,figsize=(10,3))
    #histogramme + kde 
    sns.histplot(billets[col],kde=True, color="blue", ax=ax[0])
    ax[0].set_title(f"Histogramme - {col}")
    ax[0].set_xlabel(col);
    ax[0].set_ylabel("Fréquence")
    #boxplot
    sns.boxplot(x=billets[col],color="royalblue", ax=ax[1])
    ax[1].set_title(f"Boxplot - {col}")
    ax[1].set_xlabel(col)
    plt.tight_layout()
    plt.show()

fig, ax= plt.subplots(figsize=(8,4+0.3*len(num_cols)))
sns.boxplot(data=billets[num_cols], orient="h", ax=ax)
ax.set_title("Répartition globale des dimensions (boxplots)")
plt.tight_layout()
plt.savefig("Analysedescriptive2.png",dpi=300, bbox_inches="tight")
plt.show()

counts = billets['is_genuine'].value_counts().rename({True:"Vrai", False:"Faux"})
props = (counts/counts.sum()*100).round(1)
display(pd.DataFrame({"Effectif": counts, "Pourcentage (%)": props}))



fig, ax= plt.subplots(figsize=(4,4))
ax.pie( 
    counts,
    labels=[f"{lab} ({p}%)" for lab, p in zip(counts.index, props)],
    startangle=90,
    colors=["royalblue","crimson"],
    wedgeprops={"edgecolor":"white","linewidth":1}
)
ax.set_title("Répartition des billets (Vrai/Faux)")
plt.savefig("Analysedescriptive3.png",dpi=300, bbox_inches="tight")
plt.show()

from sklearn.preprocessing import StandardScaler
vars_box = ['diagonal','height_left','height_right','margin_low','margin_up','length']

# 1) Colonne propre pour la légende
billets_plot = billets.copy()
billets_plot['Classe'] = billets_plot['is_genuine'].map({True: 'Vrai', False: 'Faux'})

# 2) Standardisation (z-score) uniquement des variables numériques
scaler = StandardScaler()
billets_plot[vars_box] = scaler.fit_transform(billets_plot[vars_box])

# 3) Long format pour seaborn
dfm = billets_plot.melt(id_vars='Classe', value_vars=vars_box,
                        var_name='Variable', value_name='Valeur')

# Boxplot 
plt.figure(figsize=(10,6))
ax = sns.boxplot(
    data=dfm,
    x='Variable', y='Valeur',
    hue='Classe',
    hue_order=['Faux','Vrai'],                    # ordre voulu
    palette={'Faux':'crimson','Vrai':'royalblue'} 
)

plt.title("Boxplots après standardisation (Vrai = Bleu / Faux = Rouge)")
plt.xlabel("")
plt.ylabel("Valeurs normalisées (z-score)")

# 5) Légende propre (on réutilise les handles générés pour garder les couleurs)
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, title="Billet")

plt.tight_layout()
plt.savefig("Analyseexploratoire1.png",dpi=300, bbox_inches="tight")
plt.show()

vars_pair = ['diagonal','height_left','height_right','margin_low','margin_up','length']

sns.pairplot(
    billets[vars_pair + ['is_genuine']],
    hue='is_genuine',
    diag_kind='kde',
    corner=True,
    palette={True: "royalblue", False: "crimson"}
)
plt.suptitle("Relations entre variables (Vrai = Bleu / Faux = Rouge)", y=1.02)
plt.savefig("Analysexploratoire2.png",dpi=300, bbox_inches="tight")
plt.show()

corr=billets.drop(columns='is_genuine').corr() ##on garde seulement les variables numériques
sns.heatmap(corr, annot=True, cmap="Blues", fmt=".2f",vmin= -1, vmax=1, center=0)
plt.title("Matrice de corrélation des dimensions des billets")
plt.show()

g_true  = billets[billets['is_genuine']==True].drop(columns=['is_genuine'])
g_false = billets[billets['is_genuine']==False].drop(columns=['is_genuine'])

fig, axes = plt.subplots(1, 2, figsize=(10,4))
sns.heatmap(g_true.corr(numeric_only=True), vmin=-1, vmax=1, cmap="Blues", center=0,fmt=".2f", annot=True, ax=axes[0])
axes[0].set_title("Corrélations — Billets VRAIS")
sns.heatmap(g_false.corr(numeric_only=True), vmin=-1, vmax=1, cmap="Blues", center=0,fmt=".2f", annot=True, ax=axes[1])
axes[1].set_title("Corrélations — Billets FAUX")
plt.tight_layout();
plt.savefig("Analyseexploratoire3.png",dpi=300, bbox_inches="tight")
plt.show()

#corr=billets.drop(columns='is_genuine').corr(method='spearman')

import os
# Limit OpenMP to one thread (the technical cause behind the KMeans warning below).
# NOTE(review): numpy and scikit-learn are already imported earlier in this file,
# so this may be set too late to affect an OpenMP runtime that is already
# loaded — confirm, or move it before the first imports.
os.environ["OMP_NUM_THREADS"]="1"
import warnings
# Hide the KMeans memory-leak warning message
warnings.filterwarnings("ignore", message="Kmeans is known to have a memory leak")

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, adjusted_rand_score, normalized_mutual_info_score
from sklearn.decomposition import PCA

# K-means is unsupervised, so the true/false label column is removed before
# the features are standardised.
X = billets.drop(columns=["is_genuine"])
# Use a scaler created in this cell rather than the `scaler` object left over
# from the earlier box-plot cell: the cell no longer depends on that cell
# having been run first, and the fitted result is the same.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#il faut définir le nombre de cluster k avant de lancer l'algorithme
#plage de k à tester 
k_values = range(2,10)

inertias = []
sil_scores = []

for k in k_values:
    km= KMeans(n_clusters=k, n_init = 20, random_state=0)
    labels = km.fit_predict(X_scaled)
    inertias.append(km.inertia_)                                         #coude
    sil_scores.append(silhouette_score(X_scaled, labels))                #silhouette ([-1,1], + élevé = mieux)

#graph 
fig, axes = plt.subplots(1, 2, figsize=(16,4))

axes[0].plot(list(k_values), inertias, marker='o')
axes[0].set_title("Méthode du coude (Inertie)")
axes[0].set_xlabel("k")
axes[0].set_ylabel("Inertie")
axes[0].grid(True, linestyle=':')

axes[1].plot(list(k_values), sil_scores, marker='o')
axes[1].set_title("Indice de silhouette")
axes[1].set_xlabel("k")
axes[1].set_ylabel("Silhouette")
axes[1].grid(True, linestyle=':')


plt.tight_layout()
plt.savefig("MethodeKM.png",dpi=300, bbox_inches="tight")
plt.show()

#tableau recap 
scores_df = pd.DataFrame({
    "k": list(k_values),
    "inertie": inertias, 
    "silhouette": sil_scores, })
scores_df.head(5)

# Fit the final K-means with k=2 on the standardised features.
# NOTE(review): n_init is not set here, whereas the k-selection loop uses
# n_init=20; the library default therefore applies and it differs between
# scikit-learn versions — confirm which value is intended.
kmeans =  KMeans(n_clusters=2, random_state=0)
clusters_km = kmeans.fit_predict(X_scaled)

# Attach the cluster labels to a copy of the data (billets itself is not modified)
res_kmeans = billets.copy() 
res_kmeans["Cluster_kmeans"] = clusters_km

# Cluster sizes
print(res_kmeans["Cluster_kmeans"].value_counts())

Cluster_kmeans
0    1003
1     497
Name: count, dtype: int64

pd.crosstab(billets["is_genuine"], res_kmeans["Cluster_kmeans"], rownames=["Vrai/Faux"], colnames=["Cluster"])

# True labels (boolean is_genuine) and the K-means cluster labels (0/1)
y_true = billets["is_genuine"]
y_pred = res_kmeans["Cluster_kmeans"]

# K-means numbers its clusters arbitrarily (0 and 1 may be swapped relative to
# the true labels), so accuracy is computed for both possible mappings and the
# better one is kept.
acc1 = accuracy_score(y_true, y_pred)
acc2 = accuracy_score(y_true, 1 - y_pred)
accuracy = max(acc1, acc2)

# ARI and NMI are computed once on the raw labels: they do not depend on how
# the clusters are numbered.
ari = adjusted_rand_score(y_true, y_pred)
nmi = normalized_mutual_info_score(y_true, y_pred)

print(f"Accuracy (ajustée au bon sens) : {accuracy:.3f}")
print(f"ARI (Adjusted Rand Index) : {ari:.3f}")
print(f"NMI (Normalized Mutual Information) : {nmi:.3f}")

Accuracy (ajustée au bon sens) : 0.985
ARI (Adjusted Rand Index) : 0.939
NMI (Normalized Mutual Information) : 0.877

# ACP sur les données standardisées
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Visualisation selon les clusters trouvés
plt.figure(figsize=(6,5))
plt.scatter(X_pca[clusters_km == 0, 0], X_pca[clusters_km == 0, 1],
            c='blue', s=20, label='Cluster 0')
plt.scatter(X_pca[clusters_km == 1, 0], X_pca[clusters_km == 1, 1],
            c='red', s=20, label='Cluster 1')
plt.xlabel('Composante principale 1')
plt.ylabel('Composante principale 2')
plt.title('Projection PCA des clusters K-means (k=2)')
plt.legend()
plt.grid(True)
plt.savefig("acp1.png",dpi=300, bbox_inches="tight")
plt.show()

 
#Visualisation sur données réelles
plt.figure(figsize=(6,5))
plt.scatter(X_pca[y_true == True, 0], X_pca[y_true == True, 1],
            c='blue', s=20, label='Vrais billets')
plt.scatter(X_pca[y_true == False, 0], X_pca[y_true == False, 1],
            c='red', s=20, label='Faux billets')
plt.xlabel('Composante principale 1')
plt.ylabel('Composante principale 2')
plt.title('Projection PCA selon les étiquettes réelles')
plt.legend()
plt.grid(True)
plt.savefig("acp2.png",dpi=300, bbox_inches="tight")
plt.show()


#cercle de Corrélation
# Récupération des loadings (corrélations entre variables et composantes)
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

# PC1 et PC2
xs = loadings[:, 0]
ys = loadings[:, 1]

# Création du cercle
plt.figure(figsize=(6,6))
plt.axhline(0, color='grey', lw=1)
plt.axvline(0, color='grey', lw=1)
circle = plt.Circle((0, 0), 1, color='blue', fill=False, linestyle='--')
plt.gca().add_artist(circle)

# Tracé des flèches pour chaque variable
for i, var in enumerate(billets.drop(columns=['is_genuine']).columns):
    plt.arrow(0, 0, xs[i], ys[i], color='gray', alpha=0.7, head_width=0.03)
    plt.text(xs[i]*1.07, ys[i]*1.07, var, fontsize=9, ha='center', va='center')

plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Cercle de corrélation (PC1-PC2)")
plt.xlim(-1.1, 1.1)
plt.ylim(-1.1, 1.1)
plt.grid(False)
plt.savefig("cercleAcp.png",dpi=300, bbox_inches="tight")
plt.show()

# Calcul des loadings (= contributions des variables à chaque axe)
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

# Création d’un DataFrame clair
loadings_df = pd.DataFrame(
    loadings,
    index=billets.drop(columns=["is_genuine"]).columns,  
    columns=[f"PC{i+1}" for i in range(pca.n_components_)]
)

# Affichage : top variables par composante
for col in loadings_df.columns[:2]:  # on limite aux 2 premières composantes
    print(f"\n Top contributions pour {col} :")
    top_vars = (
        loadings_df[col]
        .abs()
        .sort_values(ascending=False)
        .head(8)
    )
    display(top_vars)

for i in range(2):  # PC1 et PC2
    loadings_df[f"PC{i+1}"].abs().sort_values(ascending=False).head(10).plot.bar(figsize=(6,3))
    plt.title(f"Top 10 variables contributrices à PC{i+1}")
    plt.ylabel("Importance (|loading|)")
    plt.show()

 Top contributions pour PC1 :

length          0.850266
margin_low      0.817110
margin_up       0.709068
height_right    0.633099
height_left     0.532577
diagonal        0.135821
Name: PC1, dtype: float64

 Top contributions pour PC2 :

diagonal        0.949650
height_left     0.310367
height_right    0.109732
margin_low      0.072820
length          0.049624
margin_up       0.005766
Name: PC2, dtype: float64

# Récupération des centroïdes du modèle (normalisé puis avec valeurs remise à l'échelle d'origine)
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
display(centroids)

profils_km= res_kmeans.groupby("Cluster_kmeans").mean().round(2)
display(profils_km)



# Comparaison avec les moyennes observées selon la nature du billet
moyennes_reelles = billets.groupby('is_genuine')[['diagonal','height_left','height_right','margin_low','margin_up','length']].mean()
display(moyennes_reelles)

# Importation des modules nécessaires
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Copie du DataFrame
df_Rlogistique = billets.copy()

# Convertion en 0/1
# ici, 0 = vrai billet ("True"), 1 = faux billet ("False")
df_Rlogistique['is_genuine'] = df_Rlogistique['is_genuine'].map({True: 0, False: 1})

#Verification
df_Rlogistique['is_genuine']

0       0
1       0
2       0
3       0
4       0
       ..
1495    1
1496    1
1497    1
1498    1
1499    1
Name: is_genuine, Length: 1500, dtype: int64

#variables explicatives et variable cible 
feature_cols = [ 'diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up','length']
target_col = 'is_genuine'   # 0 = vrai, 1 = faux

X = df_Rlogistique[feature_cols]
y = df_Rlogistique[target_col]

# Séparation du jeu de données :
# - X_train, y_train : pour entraîner le modèle (apprentissage)
# - X_test, y_test   : pour évaluer le modèle sur des données jamais vues
# On utilise stratify=y pour garder la même proportion de vrais/faux billets.

# Découpage du jeu de données
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,        # 20 % des données pour le test
    stratify=y,           # garde la même proportion de vrais/faux billets
    random_state=0       # pour que ce soit reproductible
)

# Vérification
print("Taille jeu d'entraînement :", X_train.shape)
print("Taille jeu de test :", X_test.shape)

Taille jeu d'entraînement : (1200, 6)
Taille jeu de test : (300, 6)

# Création du pipeline :
# Le pipeline enchaîne automatiquement plusieurs étapes :
# 1. Imputation : remplace les valeurs manquantes (sécurité, même s'il n'y en a pas actuellement)
# 2. Standardisation : met toutes les variables sur la même échelle
# 3. Régression logistique : apprentissage du modèle
# Cela garantit une exécution propre, cohérente et reproductible.


# Build the pipeline: imputation -> standardisation -> logistic regression
pipeline_logistic = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # 1) fills missing values with the column median
    ('scaler', StandardScaler()),                   # 2) puts every feature on the same scale
    ('model', LogisticRegression(                   # 3) the logistic regression classifier
        max_iter=2000,            # generous iteration budget so the solver can converge
        class_weight='balanced',  # reweights the classes so genuine and fake notes count equally despite different sizes
        random_state=0            
    ))
])

# Fit every step on the training split only
pipeline_logistic.fit(X_train, y_train) 
print("Modèle de régression logistique entraîné avec succès")

Modèle de régression logistique entraîné avec succès

from sklearn.metrics import accuracy_score, confusion_matrix

# Prédictions sur le jeu de test
y_pred = pipeline_logistic.predict(X_test)

# 1️) Accuracy : pourcentage de bonnes prédictions
acc = accuracy_score(y_test, y_pred)

# 2️) Matrice de confusion : montre les erreurs et bonnes réponses
cm = confusion_matrix(y_test, y_pred)

print("Taux de bonne classification (accuracy) :", round(acc, 3))
print("\nMatrice de confusion :\n", cm)


# Création d'une figure
plt.figure(figsize=(5,4))

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Prédit: Vrai', 'Prédit: Faux'],
            yticklabels=['Réel: Vrai', 'Réel: Faux'])

plt.title("Matrice de confusion – Régression logistique")
plt.ylabel("Valeur réelle")
plt.xlabel("Valeur prédite")

plt.savefig("matriceConfusionRL.png",dpi=300, bbox_inches="tight")
plt.show()

Taux de bonne classification (accuracy) : 0.987

Matrice de confusion :
 [[200   0]
 [  4  96]]

# 1) Récupérer le modèle logistique entraîné à l'intérieur du pipeline
clf = pipeline_logistic.named_steps['model']   # 'model' = le nom que nous avons donné dans le Pipeline

# 2) Coefficients (β) associés aux features
betas = pd.Series(clf.coef_.ravel(), index=X_train.columns)

# 3) Petite table lisible : coef, importance (|coef|), odds ratio = exp(coef)
coef_table = (
    pd.DataFrame({
        'coef_beta': betas,
        'importance_abs': betas.abs(),
        'odds_ratio': np.exp(betas)
    })
    .sort_values('importance_abs', ascending=False)
)

# 4) Intercept (β0)
beta0 = float(clf.intercept_[0])

print("Intercept (β0):", round(beta0, 4))
coef_table

Intercept (β0): -1.6966

# Trier les coefficients du plus négatif au plus positif
coef_sorted = coef_table.sort_values("coef_beta")

plt.figure(figsize=(8, 5))
plt.barh(coef_sorted.index, coef_sorted["coef_beta"], color=["Royalblue" if c < 0 else "red" for c in coef_sorted["coef_beta"]])
plt.axvline(0, color='gray', linewidth=1)

plt.title("Coefficients de la régression logistique", fontsize=14)
plt.xlabel("Valeur du coefficient β (effet sur la probabilité d'être Faux)", fontsize=11)
plt.ylabel("Variables explicatives", fontsize=11)
plt.grid(axis='x', linestyle='--', alpha=0.5)


plt.savefig("coeffRL.png",dpi=300, bbox_inches="tight")
plt.show()

from sklearn.metrics import roc_curve, roc_auc_score

# Probabilité de la classe positive = FAUX (colonne 1)
y_proba_faux = pipeline_logistic.predict_proba(X_test)[:, 1]

# Courbe ROC (positive = FAUX)
fpr, tpr, thresholds = roc_curve(y_test, y_proba_faux)   # y_test: 1=FAUX, 0=VRAI
auc = roc_auc_score(y_test, y_proba_faux)

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, color='blue', label=f'ROC (AUC = {auc:.3f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Hasard (0.5)')
plt.xlabel('Taux de faux positifs (Proportion de vrais billets accusés à tort )')
plt.ylabel('Taux de vrais positifs (Proportion de faux billets détectés)')
plt.title("Courbe ROC – Classe positive = FAUX")
plt.legend()
plt.grid(alpha=0.3)

plt.savefig("rocAucRL.png",dpi=300, bbox_inches="tight")
plt.show()

print("AUC (classe positive = FAUX) :", round(auc, 3))

AUC (classe positive = FAUX) : 1.0

from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_test, y_proba_faux)
for i in range(0, len(thresholds), max(1, len(thresholds)//10)):
    print(f"Seuil : {thresholds[i]:.2f} --> FPR={fpr[i]:.2f}, TPR={tpr[i]:.2f}")

Seuil : inf --> FPR=0.00, TPR=0.00
Seuil : 1.00 --> FPR=0.00, TPR=0.01
Seuil : 0.23 --> FPR=0.00, TPR=0.99
Seuil : 0.08 --> FPR=0.04, TPR=0.99
Seuil : 0.08 --> FPR=0.04, TPR=1.00
Seuil : 0.00 --> FPR=1.00, TPR=1.00

# Search for the decision threshold that maximises the F1-score of the
# positive class (1 = fake note).
# NOTE(review): the threshold is tuned on y_test / y_proba_faux, i.e. the same
# test split that is used afterwards to report the metrics at that threshold,
# so those metrics are likely optimistic — consider tuning on a validation
# split or by cross-validation on the training data.
from sklearn.metrics import f1_score

seuils = np.arange(0, 1.01, 0.01) # np.arange(start, stop, step): 101 candidate thresholds from 0.00 to 1.00
f1_scores = []

for s in seuils:
    # Predict "fake" (1) when the predicted probability reaches the threshold
    y_pred_temp = (y_proba_faux >= s).astype(int)
    f1_scores.append(f1_score(y_test, y_pred_temp, pos_label=1))

# np.argmax returns the first maximum, so ties resolve to the lowest threshold
best_seuil = seuils[np.argmax(f1_scores)]
best_f1 = max(f1_scores)

print(f"Seuil optimal (F1) : {best_seuil:.2f}, F1-score : {best_f1:.3f}")

Seuil optimal (F1) : 0.22, F1-score : 0.995

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# 1️) Récupération des probabilités prédites pour la classe FAUX (1)
y_proba_faux = pipeline_logistic.predict_proba(X_test)[:, 1]

# 2️) Application du seuil optimal
seuil_optimal = 0.22
y_pred_seuil = (y_proba_faux >= seuil_optimal).astype(int)

# 3️) Matrice de confusion et score
cm = confusion_matrix(y_test, y_pred_seuil)
acc = accuracy_score(y_test, y_pred_seuil)

print(f"Taux de bonne classification (accuracy) : {acc:.3f}")
print("Matrice de confusion :\n", cm)

# 4️) Visualisation
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Prédit : Vrai', 'Prédit : Faux'],
            yticklabels=['Réel : Vrai', 'Réel : Faux'])
plt.title("Matrice de confusion – Seuil 0.22 (classe FAUX)")
plt.ylabel("Valeur réelle")
plt.xlabel("Valeur prédite")

plt.savefig("matriceconfRLOptimisation.png",dpi=300, bbox_inches="tight")
plt.show()

# 5️) Rapport complémentaire
print("\nRapport de classification :\n")
print(classification_report(y_test, y_pred_seuil, target_names=['Vrai', 'Faux']))

Taux de bonne classification (accuracy) : 0.997
Matrice de confusion :
 [[200   0]
 [  1  99]]

Rapport de classification :

              precision    recall  f1-score   support

        Vrai       1.00      1.00      1.00       200
        Faux       1.00      0.99      0.99       100

    accuracy                           1.00       300
   macro avg       1.00      0.99      1.00       300
weighted avg       1.00      1.00      1.00       300

# 1) Rebuild X and y from the encoded DataFrame
feature_cols = [ 'diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up','length']
target_col = 'is_genuine'   # 0 = genuine, 1 = fake

X = df_Rlogistique[feature_cols]
y = df_Rlogistique[target_col]

# 2) Robustness check: refit and score the model on 10 different stratified splits

accuracies = []
for i in range(10):
    # A different random_state per iteration gives a different train/test split.
    # X_train, X_test, y_train and y_test are overwritten at module level and
    # keep the values of the last iteration after the loop.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=i
    )
    # This pipeline has no imputation step, unlike pipeline_logistic
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('logreg', LogisticRegression(max_iter=2000, class_weight='balanced', random_state=0))
    ])
    pipeline.fit(X_train, y_train)
    # Pipeline.score returns the accuracy on the held-out split (default 0.5 threshold)
    accuracies.append(pipeline.score(X_test, y_test))

print("Accuracy moyenne :", round(np.mean(accuracies), 3))
print("Écart-type        :", round(np.std(accuracies), 4))

Accuracy moyenne : 0.988
Écart-type        : 0.0052

#importation et affichage
billets_production = pd.read_csv('billets_production.csv')
billets_production

#pour avoir l'ordre des features
expected_features = list(X_train.columns)  
expected_features

['diagonal',
 'height_left',
 'height_right',
 'margin_low',
 'margin_up',
 'length']

# Persist the fitted pipeline and the settings needed to reuse it.
import os, json, joblib, sklearn
os.makedirs("artifacts", exist_ok=True)

# Serialise the fitted pipeline (imputer + scaler + logistic regression)
joblib.dump(pipeline_logistic, "artifacts/model.joblib")

# The config stores what predict_file needs besides the model: the decision
# threshold and the ordered list of feature columns. The scikit-learn version
# is recorded alongside for traceability.
config = {
    "threshold": 0.22, # decision threshold, hard-coded to 0.22
    "expected_features": expected_features,
    "sklearn_version": sklearn.__version__,
    "random_state": 0,
    "notes": "pipeline: imputer(median) pour NaN -> scaler -> logistic_regression(class_weight=balanced)"
}
with open("artifacts/config.json", "w", encoding="utf-8") as f:
    json.dump(config, f, ensure_ascii=False, indent=2)

print("Sauvegardé: artifacts/model.joblib + artifacts/config.json")

Sauvegardé: artifacts/model.joblib + artifacts/config.json

def predict_file(path_csv, id_col=None, *, sep=",",
                 model_path="artifacts/model.joblib",
                 config_path="artifacts/config.json"):
    """Score the banknotes of a CSV file with the saved pipeline.

    Args:
        path_csv: Path of the CSV file to score.
        id_col: Optional name of an identifier column. When it exists in the
            file it is copied as the first column of the result.
        sep: Field separator passed to ``pandas.read_csv`` (default ``","``).
        model_path: Path of the joblib-serialised pipeline.
        config_path: Path of the JSON config providing ``threshold`` and
            ``expected_features``.

    Returns:
        A DataFrame with ``proba_pos`` (second column of ``predict_proba``)
        and ``label_<threshold>`` (1 when ``proba_pos >= threshold``, else 0),
        preceded by ``id_col`` when that column exists in the file.

    Raises:
        ValueError: If none of the expected feature columns is present in the
            file, which usually means the file was read with the wrong
            separator.
    """
    # Load the artefacts.
    # NOTE: joblib.load unpickles arbitrary Python objects; only load model
    # files that come from a trusted source.
    pipe = joblib.load(model_path)
    with open(config_path, "r", encoding="utf-8") as f:
        cfg = json.load(f)

    thr = cfg["threshold"]
    expected = cfg["expected_features"]

    # Read the data
    df = pd.read_csv(path_csv, sep=sep)

    # Schema checks
    missing = [c for c in expected if c not in df.columns]
    extra = [c for c in df.columns if c not in expected and c != id_col]
    if expected and len(missing) == len(expected):
        # Every feature would be imputed: the predictions would be meaningless,
        # so fail loudly instead of returning them.
        raise ValueError(
            f"Aucune des colonnes attendues {expected} n'a été trouvée dans "
            f"{path_csv!r} (colonnes lues: {list(df.columns)}). "
            f"Vérifiez le séparateur (sep={sep!r})."
        )
    if missing:
        print(f"Attention! Colonnes manquantes: {missing} (remplies par NaN → imputées par la pipeline)")
    if extra:
        print(f"Infos! Colonnes ignorées: {extra}")
    if id_col and id_col not in df.columns:
        print(f"Attention! Colonne identifiant '{id_col}' absente du fichier: non reportée dans le résultat")

    # Keep only the expected columns, in the expected order; absent columns
    # are created filled with NaN.
    X = df.reindex(columns=expected)

    # Probability of the positive class, then thresholding
    proba = pipe.predict_proba(X)[:, 1]
    label = (proba >= thr).astype(int)

    out = pd.DataFrame({"proba_pos": proba, f"label_{thr}": label})
    if id_col and id_col in df.columns:
        out.insert(0, id_col, df[id_col].values)
    return out

os.makedirs("outputs", exist_ok=True)

# Score the new file
preds = predict_file("billets_test (4).csv", id_col="id")

# predict_file names the label column after the threshold stored in the config
# ("label_<threshold>"). Find it by its prefix rather than hard-coding
# "label_0.22", so this cell keeps working if the saved threshold changes.
label_col = next(c for c in preds.columns if c.startswith("label_"))

# Human-readable version of the 0/1 prediction
mapping = {0: "vrai_billet", 1: "faux_billet"}
preds["prediction_text"] = preds[label_col].map(mapping)

display(preds.head())  # preview of the result
preds.to_csv("outputs/predictions_production.csv", index=False)  # export the result

print("Fini. Résultats dans outputs/predictions_production.csv")

Fini. Résultats dans outputs/predictions_production.csv

# Decision threshold (hard-coded copy of the value used for the predictions)
seuil = 0.22

# Half-width of the uncertainty band around the threshold (±0.02, i.e. 0.20 to 0.24)
marge = 0.02

# Keep the notes whose predicted probability falls inside the band (bounds included)
zone_incertaine = preds[(preds["proba_pos"] >= seuil - marge) & (preds["proba_pos"] <= seuil + marge)]

# Display
print("Billets dans la zone d'incertitude :")
display(zone_incertaine)

# Export (the file is written even when the selection is empty)
zone_incertaine.to_csv("outputs/billets_zone_incertaine.csv", index=False)
print("Fichier Annexe. Résultats dans outputs/billets_zone_incertaine.csv")

Billets dans la zone d'incertitude :

Fichier Annexe. Résultats dans outputs/billets_zone_incertaine.csv

	is_genuine	diagonal	height_left	height_right	margin_low	margin_up	length
0	True	171.81	104.86	104.95	4.52	2.89	112.83
1	True	171.46	103.36	103.66	3.77	2.99	113.09
2	True	172.69	104.48	103.50	4.40	2.94	113.16
3	True	171.36	103.91	103.94	3.62	3.01	113.51
4	True	171.73	104.28	103.46	4.04	3.48	112.54

	count	mean	std	min	25%	50%	75%	max
diagonal	1500.0	171.958440	0.305195	171.04	171.750	171.96	172.17	173.01
height_left	1500.0	104.029533	0.299462	103.14	103.820	104.04	104.23	104.88
height_right	1500.0	103.920307	0.325627	102.82	103.710	103.92	104.15	104.95
margin_low	1463.0	4.485967	0.663813	2.98	4.015	4.31	4.87	6.90
margin_up	1500.0	3.151473	0.231813	2.27	2.990	3.14	3.31	3.91
length	1500.0	112.678500	0.872730	109.49	112.030	112.96	113.34	114.44

	k	inertie	silhouette
0	2	5807.303515	0.342399
1	3	5093.394746	0.210272
2	4	4677.506603	0.196132
3	5	4361.532560	0.156490
4	6	4109.013540	0.149951

	diagonal	height_left	height_right	margin_low	margin_up	length
0	0.096124	-0.281762	-0.352065	-0.552229	-0.428505	0.594584
1	-0.193988	0.568627	0.710506	1.114458	0.864769	-1.199935

	is_genuine	diagonal	height_left	height_right	margin_low	margin_up	length
Cluster_kmeans
0	0.99	171.99	103.95	103.81	4.12	3.05	113.20
1	0.02	171.90	104.20	104.15	5.22	3.35	111.63

Projet 10 : Détecter des faux billets avec Python

Contexte :

Objectif :

6 caractéristiques géométriques d'un billet :

Partie 1 : Importation, Nettoyage et Analyse exploratoire

Importation :

Nettoyage :

Aperçu initial : comprendre ce que contient le jeu de données

Analyse et traitement des valeurs manquantes : quantifier les NaN, puis imputer les valeurs manquantes

Le jeu de données étant désormais propre et complet, nous pouvons entamer la phase d'analyse exploratoire afin d'approfondir l'étude des caractéristiques des billets.

Analyse exploratoire :

Après avoir décrit et exploré les données, nous pouvons passer à l’étape suivante, qui consiste à construire et évaluer les modèles de détection automatique des faux billets.

Partie 2 : Construction de l'algorithme

K-Means :

Pour affiner la séparation et apprendre une frontière de décision plus précise, il est donc pertinent de passer à une approche supervisée, comme la régression logistique.

Régression logistique classique :

La prochaine étape consiste à sauvegarder le modèle final, puis à l’appliquer sur un nouveau fichier (billets_production.csv) afin de tester son comportement en conditions réelles.

Partie 3 : Modèle final

Sauvegarde du modèle final + configuration (seuil 0.22) :

Fonction pour prédire un fichier csv :

Test de l'algorithme avec un nouveau jeu de données :

	diagonal	height_left	height_right	margin_low	margin_up	length
is_genuine
False	171.90116	104.19034	104.14362	5.21270	3.35016	111.63064
True	171.98708	103.94913	103.80865	4.11841	3.05213	113.20243

	coef_beta	importance_abs	odds_ratio
length	-3.331755	3.331755	0.035730
margin_low	2.692955	2.692955	14.775272
margin_up	1.937791	1.937791	6.943399
height_right	0.744308	0.744308	2.104985
height_left	0.525643	0.525643	1.691546
diagonal	-0.192549	0.192549	0.824854

	diagonal	height_left	height_right	margin_low	margin_up	length	id
0	171.76	104.01	103.54	5.21	3.30	111.42	A_1
1	171.87	104.17	104.13	6.00	3.31	112.09	A_2
2	172.00	104.58	104.29	4.99	3.39	111.57	A_3
3	172.49	104.55	104.34	4.44	3.03	113.20	A_4
4	171.65	103.63	103.56	3.77	3.16	113.33	A_5

	id	proba_pos	label_0.22	prediction_text
0	B_1	0.005538	0	vrai_billet
1	B_2	0.996194	1	faux_billet
2	B_3	0.000850	0	vrai_billet
3	B_4	0.999959	1	faux_billet
4	B_5	0.988443	1	faux_billet

	Effectif	Pourcentage (%)
is_genuine
Vrai	1000	66.7
Faux	500	33.3

Cluster	0	1
Vrai/Faux
False	13	487
True	990	10