import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np

from pathlib import Path


# Raise the display limits so frames with up to 200 columns/rows print in full.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

# Load the CSV file

DATA_PATH = Path("..", "data", "raw")
heart = pd.read_csv(DATA_PATH.joinpath("heart.csv"))

# Quick dataset check: size, dtypes, NaN
def quick_profile(df, name, top_n=15):
    """Print a short structural summary of ``df``.

    The summary consists of a banner with ``name``, the frame's shape, the
    dtype of every column, and the columns with the highest percentage of
    missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    name : str
        Label shown in the banner line.
    top_n : int, default 15
        Number of columns listed in the missing-value ranking.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(f"\n=== {name} ===")
    print("shape:", df.shape)
    print("\nDtypes:\n", df.dtypes)
    # Mean of the boolean NaN mask = share of missing cells per column;
    # expressed as a percentage, rounded to one decimal, worst columns first.
    na = (df.isna().mean().sort_values(ascending=False) * 100).round(1)
    print(f"\n% NA (top {top_n}):\n", na.head(top_n))

# Print the shape, dtypes and missing-value percentages of the loaded frame.
quick_profile(heart, "HEART")

=== HEART ===
shape: (918, 12)

Dtypes:
 Age                 int64
Sex                   str
ChestPainType         str
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG            str
MaxHR               int64
ExerciseAngina        str
Oldpeak           float64
ST_Slope              str
HeartDisease        int64
dtype: object

% NA (top 15):
 Age               0.0
Sex               0.0
ChestPainType     0.0
RestingBP         0.0
Cholesterol       0.0
FastingBS         0.0
RestingECG        0.0
MaxHR             0.0
ExerciseAngina    0.0
Oldpeak           0.0
ST_Slope          0.0
HeartDisease      0.0
dtype: float64

# Count the rows that exactly repeat an earlier row.
heart.duplicated().sum()

np.int64(0)

# Preview the first rows of the dataset.
heart.head()

# Text columns are treated as the categorical features. Both "object" and
# "string" are listed explicitly so the selection works whether the text
# columns are stored as object (pandas 2) or as the dedicated str dtype
# (pandas 3). Passing "object" alone only matches str columns through a
# deprecated backward-compatibility fallback that emits a warning.
categorical_cols = heart.select_dtypes(include=["object", "string"]).columns

# Show how many rows fall into each category, one column at a time.
for col in categorical_cols:
    print(f"\n--- {col} ---")
    print(heart[col].value_counts())

--- Sex ---
Sex
M    725
F    193
Name: count, dtype: int64

--- ChestPainType ---
ChestPainType
ASY    496
NAP    203
ATA    173
TA      46
Name: count, dtype: int64

--- RestingECG ---
RestingECG
Normal    552
LVH       188
ST        178
Name: count, dtype: int64

--- ExerciseAngina ---
ExerciseAngina
N    547
Y    371
Name: count, dtype: int64

--- ST_Slope ---
ST_Slope
Flat    460
Up      395
Down     63
Name: count, dtype: int64

C:\Users\bouzo\AppData\Local\Temp\ipykernel_16836\4107697228.py:1: Pandas4Warning: For backward compatibility, 'str' dtypes are included by select_dtypes when 'object' dtype is specified. This behavior is deprecated and will be removed in a future version. Explicitly pass 'str' to `include` to select them, or to `exclude` to remove them and silence this warning.
See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_cols = heart.select_dtypes(include="object").columns

# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
heart.describe()

# Number of distinct values in each column.
heart.nunique()

Age                50
Sex                 2
ChestPainType       4
RestingBP          67
Cholesterol       222
FastingBS           2
RestingECG          3
MaxHR             119
ExerciseAngina      2
Oldpeak            53
ST_Slope            3
HeartDisease        2
dtype: int64

# Treat a value of 0 in these two measurements as missing by converting it to NaN.
# NOTE(review): presumably 0 is a placeholder for "not measured" — confirm with the data source.
heart["RestingBP"] = heart["RestingBP"].replace({0: np.nan})
heart["Cholesterol"] = heart["Cholesterol"].replace({0: np.nan})

# Missing-value count per column after the conversion.
heart.isna().sum()

Age                 0
Sex                 0
ChestPainType       0
RestingBP           1
Cholesterol       172
FastingBS           0
RestingECG          0
MaxHR               0
ExerciseAngina      0
Oldpeak             0
ST_Slope            0
HeartDisease        0
dtype: int64

# Fill the missing RestingBP entries with the column median.
heart["RestingBP"] = heart["RestingBP"].fillna(value=heart["RestingBP"].median())

# Create a flag (1 = cholesterol was missing) before imputing, so that the
# information is not lost once the gaps are filled.
heart["Cholesterol_missing"] = heart["Cholesterol"].isna().astype(int)

# Imputation: fill the missing cholesterol entries with the column median.
heart["Cholesterol"] = heart["Cholesterol"].fillna(value=heart["Cholesterol"].median())

# Number of rows in each class of the target.
heart["HeartDisease"].value_counts()

HeartDisease
1    508
0    410
Name: count, dtype: int64

# Class balance of the target as relative frequencies instead of raw counts.
heart['HeartDisease'].value_counts(normalize=True)

HeartDisease
1    0.553377
0    0.446623
Name: proportion, dtype: float64

# Bar chart of the number of rows in each class of the target.
sns.countplot(x="HeartDisease", data=heart).set_title("Distribution de la variable cible")
plt.show()

# One histogram per numeric column.
heart.hist(figsize=(12, 10))

array([[<Axes: title={'center': 'Age'}>,
        <Axes: title={'center': 'RestingBP'}>,
        <Axes: title={'center': 'Cholesterol'}>],
       [<Axes: title={'center': 'FastingBS'}>,
        <Axes: title={'center': 'MaxHR'}>,
        <Axes: title={'center': 'Oldpeak'}>],
       [<Axes: title={'center': 'HeartDisease'}>,
        <Axes: title={'center': 'Cholesterol_missing'}>, <Axes: >]],
      dtype=object)

# One box plot per listed numeric column, each shown in its own figure.
for col in ("Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak"):
    sns.boxplot(x=heart[col]).set_title(col)
    plt.show()

# Annotated correlation matrix restricted to the numeric columns.
plt.figure(figsize=(10, 8))
sns.heatmap(data=heart.corr(numeric_only=True), cmap="coolwarm", annot=True)

<Axes: >

# Frequency bar chart for every categorical column, one figure per column.
for col in categorical_cols:
    sns.countplot(x=col, data=heart)
    # Tilt the tick labels slightly.
    plt.xticks(rotation=10)
    plt.show()

# Numeric columns, identified by their int64 / float64 dtype.
num_cols = heart.select_dtypes(include=["int64", "float64"]).columns

# Keep the numeric columns with more than two distinct values (i.e. drop the
# binary ones) and leave the target itself out.
num_continue_cols = [
    name
    for name, n_distinct in heart[num_cols].nunique().items()
    if n_distinct > 2 and name != "HeartDisease"
]

# Box plot of each retained column, split by target class.
for col in num_continue_cols:
    sns.boxplot(x="HeartDisease", y=col, data=heart).set_title(col)
    plt.show()

# Row counts per FastingBS value, split by target class.
sns.countplot(x="FastingBS", hue="HeartDisease", data=heart)

<Axes: xlabel='FastingBS', ylabel='count'>

# Row counts per value of the Cholesterol_missing flag, split by target class.
sns.countplot(x="Cholesterol_missing", hue="HeartDisease", data=heart)

<Axes: xlabel='Cholesterol_missing', ylabel='count'>

# For every categorical column, plot the category counts split by target class.
for col in categorical_cols:
    sns.countplot(x=col, hue="HeartDisease", data=heart).set_title(col)
    plt.xticks(rotation=15)
    plt.show()

# Write the cleaned dataset to ../data/processed/heart_cleaning.csv without
# the index column. The path is built with pathlib, like the input path, and
# the target directory is created first because to_csv does not create
# missing parent directories.
PROCESSED_PATH = Path("..") / "data" / "processed"
PROCESSED_PATH.mkdir(parents=True, exist_ok=True)
heart.to_csv(PROCESSED_PATH / "heart_cleaning.csv", index=False)

	Age	RestingBP	Cholesterol	FastingBS	MaxHR	Oldpeak	HeartDisease
count	918.000000	918.000000	918.000000	918.000000	918.000000	918.000000	918.000000
mean	53.510893	132.396514	198.799564	0.233115	136.809368	0.887364	0.553377
std	9.432617	18.514154	109.384145	0.423046	25.460334	1.066570	0.497414
min	28.000000	0.000000	0.000000	0.000000	60.000000	-2.600000	0.000000
25%	47.000000	120.000000	173.250000	0.000000	120.000000	0.000000	0.000000
50%	54.000000	130.000000	223.000000	0.000000	138.000000	0.600000	1.000000
75%	60.000000	140.000000	267.000000	0.000000	156.000000	1.500000	1.000000
max	77.000000	200.000000	603.000000	1.000000	202.000000	6.200000	1.000000

Projet : Système d'aide au dépistage précoce du risque de maladie cardiaque

Notebook 1 : EDA

Section 1 : Importation et compréhension des données

Section 2 : Nettoyage, Préparation et Analyse des données (EDA)

Analyse de la variable target

Analyse des variables numériques

Analyse des variables catégorielles

Relation avec la target

A - Variables numériques vs TARGET

B - Variables catégorielles vs TARGET

Exportation

Préparation à la modélisation (encodage, standardisation...)

	Age	Sex	ChestPainType	RestingBP	Cholesterol	RestingECG	MaxHR	ExerciseAngina	Oldpeak	ST_Slope	HeartDisease
0	40	M	ATA	140	289	Normal	172	N	0.0	Up	0
1	49	F	NAP	160	180	Normal	156	N	1.0	Flat	1
2	37	M	ATA	130	283	ST	98	N	0.0	Up	0
3	48	F	ASY	138	214	Normal	108	Y	1.5	Flat	1
4	54	M	NAP	150	195	Normal	122	N	0.0	Up	0