import os

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# Database connection. The URL (including credentials) is taken from the
# TAXI_DATABASE_URL environment variable so secrets do not have to live in the
# notebook; the local-development URL is kept as the fallback.
engine = create_engine(
    os.environ.get(
        "TAXI_DATABASE_URL",
        "postgresql://postgres:postgres@localhost:5432/taxi",
    )
)

# Load the whole hourly demand mart into memory.
query = """
SELECT *
FROM mart.mart_demand_hourly;
"""

df = pd.read_sql(query, engine)

# First look at the rows and at the frame dimensions.
df.head()

df.shape

(2819948, 7)

# Column dtypes and memory footprint of the loaded mart.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2819948 entries, 0 to 2819947
Data columns (total 7 columns):
 #   Column             Dtype         
---  ------             -----         
 0   hour_ts            datetime64[ns]
 1   pu_location_id     int64         
 2   trips              int64         
 3   avg_trip_distance  float64       
 4   avg_total_amount   float64       
 5   pct_cash           float64       
 6   pct_card           float64       
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 150.6 MB

# Time span covered by the data (first and last hourly timestamp).
df['hour_ts'].min(), df['hour_ts'].max()

(Timestamp('2022-01-01 00:00:00'), Timestamp('2024-12-31 23:00:00'))

# Zone dimension: borough and zone name for each taxi location id.
zones = pd.read_sql("""
SELECT
    location_id AS pu_location_id,
    borough,
    zone
FROM curated.dim_taxi_zone;
""", engine)

zones.head()

# Left join keeps every demand row. validate="many_to_one" makes pandas raise
# a MergeError if pu_location_id is duplicated in `zones`, which would
# otherwise silently multiply demand rows.
df = df.merge(
    zones,
    on="pu_location_id",
    how="left",
    validate="many_to_one",
)

df.head()

# Share of demand rows that found no matching zone.
df['borough'].isna().mean()

0.0

# Share of missing values per column, largest first.
df.isna().mean().sort_values(ascending=False)

avg_total_amount     0.000077
hour_ts              0.000000
pu_location_id       0.000000
trips                0.000000
avg_trip_distance    0.000000
pct_cash             0.000000
pct_card             0.000000
borough              0.000000
zone                 0.000000
dtype: float64

# Summary statistics of the target variable.
df['trips'].describe()

count    2.819948e+06
mean     4.224735e+01
std      7.519024e+01
min      1.000000e+00
25%      1.000000e+00
50%      5.000000e+00
75%      5.000000e+01
max      1.239000e+03
Name: trips, dtype: float64

import matplotlib.pyplot as plt
import seaborn as sns 
# Box plot of the hourly trip counts.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=df['trips'], color='gold', ax=ax)
ax.set_title("Distribution of hourly taxi trips per zone")
plt.show()

# The ten busiest (hour, zone) observations.
df.sort_values(by='trips', ascending=False).iloc[:10]

# Combined share of cash and card payments.
df['pct_sum'] = df['pct_cash'].add(df['pct_card'])

df['pct_sum'].describe()

count    2.819948e+06
mean     8.178977e+01
std      3.035457e+01
min      0.000000e+00
25%      8.000000e+01
50%      9.628099e+01
75%      1.000000e+02
max      1.000000e+02
Name: pct_sum, dtype: float64

# Histogram of the combined cash + card share.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df['pct_sum'], bins=50, color='gold', ax=ax)
ax.set_title("Distribution of pct_cash + pct_card")
plt.show()

# Consistency check: the combined share must lie in [0, 100] for every row.
pct_sum = df['pct_sum']
((pct_sum >= 0) & (pct_sum <= 100)).all()

True

# Number of rows repeating an already-seen (hour_ts, pu_location_id) pair.
df.duplicated(subset=['hour_ts', 'pu_location_id']).sum()

0

# Chronological ordering (ties broken by location id) with a fresh index.
df = (
    df.sort_values(by=['hour_ts', 'pu_location_id'])
      .reset_index(drop=True)
)

df.head()

# Calendar features derived from the hourly timestamp.
hour_ts_parts = df['hour_ts'].dt
df['hour'] = hour_ts_parts.hour
df['weekday'] = hour_ts_parts.dayofweek  # 0 = Monday
df['is_weekend'] = df['weekday'] >= 5    # Saturday (5) or Sunday (6)
df['month'] = hour_ts_parts.month
df['year'] = hour_ts_parts.year

calendar_cols = ['hour_ts', 'hour', 'weekday', 'is_weekend', 'month', 'year']
df[calendar_cols].head()

df['trips'].describe()

count    2.819948e+06
mean     4.224735e+01
std      7.519024e+01
min      1.000000e+00
25%      1.000000e+00
50%      5.000000e+00
75%      5.000000e+01
max      1.239000e+03
Name: trips, dtype: float64

# --- Global distribution of hourly demand
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df['trips'], bins=50, color='gold', ax=ax)
ax.set_title("Distribution of Hourly Taxi Demand per Zone")
ax.set_xlabel("Number of trips")
plt.show()

# --- Mean demand for each hour of the day
mean_trips_by_hour = df.groupby('hour')['trips'].mean()
hourly_pattern = mean_trips_by_hour.reset_index()

fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(
    data=hourly_pattern, x='hour', y='trips',
    marker='o', color='gold', ax=ax,
)
ax.set_title("Average Taxi Demand by Hour of Day")
ax.set_xlabel("Hour of day")
ax.set_ylabel("Average number of trips")

# Save the figure to disk before showing it.
fig.savefig("averageTaxiDemandByHour.png", dpi=300, bbox_inches="tight")
plt.show()

# --- Mean demand on weekdays vs weekends
mean_trips_by_weekend = df.groupby('is_weekend')['trips'].mean()
week_pattern = mean_trips_by_weekend.reset_index()

weekend_labels = {False: 'Weekday', True: 'Weekend'}
week_pattern['is_weekend'] = week_pattern['is_weekend'].map(weekend_labels)

fig, ax = plt.subplots()
sns.barplot(data=week_pattern, x='is_weekend', y='trips', color='gold', ax=ax)
ax.set_title("Average Taxi Demand: Weekday vs Weekend")
ax.set_ylabel("Average number of trips")
plt.show()

# --- Mean demand per borough, highest first
mean_trips_by_borough = df.groupby('borough')['trips'].mean()
borough_demand = (
    mean_trips_by_borough.sort_values(ascending=False).reset_index()
)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=borough_demand, x='trips', y='borough', color='gold', ax=ax)
ax.set_title("Average Taxi Demand by Borough")
ax.set_xlabel("Average number of trips")
plt.show()

# --- The 15 zones with the highest mean demand
mean_trips_by_zone = df.groupby('zone')['trips'].mean()
top_zones = (
    mean_trips_by_zone.sort_values(ascending=False).head(15).reset_index()
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=top_zones, x='trips', y='zone', color='gold', ax=ax)
ax.set_title("Top 15 Pickup Zones by Average Demand")
ax.set_xlabel("Average number of trips")
plt.show()

# --- Total trips per calendar day
calendar_day = df['hour_ts'].dt.date
daily_demand = df.groupby(calendar_day)['trips'].sum().reset_index()

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(daily_demand['hour_ts'], daily_demand['trips'], color='gold')
ax.set_title("Daily Taxi Demand Over Time (2022–2024)")
ax.set_xlabel("Date")
ax.set_ylabel("Total number of trips")
plt.show()

target = 'trips'

# Base features: hour, weekday, is_weekend, month, year, borough, zone

# Within each location the rows end up ordered by hour_ts, which is what the
# shift below relies on.
df = df.sort_values(["zone", "hour_ts"])

# Lags are computed per pickup location id: that is the key the table grain
# and the duplicate check use, whereas zone names are not verified to be
# unique per location.
# NOTE(review): shift(lag) moves by `lag` rows inside a location. It equals
# `lag` hours only if that location has a row for every hour; confirm, or
# reindex to a full hourly grid if exact time lags are required.
for lag in [1, 24, 168]:
    df[f"trips_lag_{lag}h"] = (
        df.groupby("pu_location_id")["trips"]
          .shift(lag)
    )

# Check

df[
    ["zone", "hour_ts", "trips", "trips_lag_1h", "trips_lag_24h", "trips_lag_168h"]
]

# NaN values are not errors: they only mean "not enough history yet".
# They are dropped later (step 4.4).

# Rolling means over the previous 24 and 168 rows of each pickup location.
# shift(1) moves the window so that it ends on the previous row: the current
# row's `trips` is the prediction target and must not enter its own feature
# (target leakage). Grouping uses pu_location_id, the key of the table grain.
for window in (24, 168):
    df[f"trips_roll_mean_{window}h"] = (
        df.groupby("pu_location_id")["trips"]
          .transform(lambda s, w=window: s.shift(1).rolling(window=w).mean())
    )

# Check

df[
    [
        "zone",
        "hour_ts",
        "trips",
        "trips_roll_mean_24h",
        "trips_roll_mean_168h"
    ]
]

# Lag = point-in-time memory; rolling mean = trend.
# The two are complementary rather than redundant.

# Reading the values:
# 24h mean > 168h mean  -> recent activity is stronger than the weekly level
# rolling mean < current value -> hourly peak

temporal_features = ["hour", "weekday", "is_weekend", "month"]
lag_features = [f"trips_lag_{hours}h" for hours in (1, 24, 168)]
rolling_features = [f"trips_roll_mean_{hours}h" for hours in (24, 168)]

feature_cols = [*temporal_features, *lag_features, *rolling_features]

# Keep only the rows where every feature and the target are available.
df_model = df.dropna(subset=[*feature_cols, "trips"])

# Check: remaining missing values per column.
df_model.isna().sum()

hour_ts                   0
pu_location_id            0
trips                     0
avg_trip_distance         0
avg_total_amount        213
pct_cash                  0
pct_card                  0
borough                   0
zone                      0
pct_sum                   0
hour                      0
weekday                   0
is_weekend                0
month                     0
year                      0
trips_lag_1h              0
trips_lag_24h             0
trips_lag_168h            0
trips_roll_mean_24h       0
trips_roll_mean_168h      0
dtype: int64

# Check: dimensions of the modeling frame after dropping incomplete rows.
df_model.shape

(2777118, 20)

# One-hot encode the borough. drop_first could be set to True to remove the
# reference column, since the model does not strictly need it.
df_model = pd.get_dummies(df_model, columns=["borough"], drop_first=False)

# Check: the encoded borough columns.
df_model.filter(like="borough_")

# Target
y = df_model["trips"]

# Features: the temporal, lag and rolling columns listed in feature_cols,
# followed by the encoded borough columns.
borough_dummy_cols = [
    name for name in df_model.columns if name.startswith("borough_")
]
feature_cols_final = [*feature_cols, *borough_dummy_cols]

# Final feature matrix for the model
X = df_model[feature_cols_final]

# Check
X.shape, y.shape

((2777118, 17), (2777118,))

# Check: total number of missing values left in the feature matrix.
X.isna().sum().sum()

0

# Check
X.head()

# Time-aware split: everything before split_date trains the model,
# everything from split_date onwards is held out for testing.
split_date = pd.Timestamp("2024-01-01")

row_timestamps = df_model["hour_ts"]
train_mask = row_timestamps.lt(split_date)
test_mask = row_timestamps.ge(split_date)

X_train = X.loc[train_mask]
X_test = X.loc[test_mask]
y_train = y.loc[train_mask]
y_test = y.loc[test_mask]

# Check
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1716184, 17), (1060934, 17), (1716184,), (1060934,))

# Sanity check: first and last timestamp of the training rows.
df_model.loc[train_mask, "hour_ts"].min(), df_model.loc[train_mask, "hour_ts"].max()

(Timestamp('2022-01-08 00:00:00'), Timestamp('2023-12-31 23:00:00'))

# Sanity check: first and last timestamp of the test rows.
df_model.loc[test_mask, "hour_ts"].min(), df_model.loc[test_mask, "hour_ts"].max()

(Timestamp('2024-01-01 00:00:00'), Timestamp('2024-12-31 23:00:00'))

# Persistence baseline: the forecast is simply the 1-hour lag feature.
y_pred_baseline = X_test["trips_lag_1h"]

# The baseline is scored on the test set only: performance has to be measured
# on unseen future data to reflect real forecasting conditions.

from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

mae_baseline = mean_absolute_error(y_test, y_pred_baseline)
baseline_mse = mean_squared_error(y_test, y_pred_baseline)
rmse_baseline = np.sqrt(baseline_mse)

mae_baseline, rmse_baseline

(9.894434526558674, 23.051931697546312)

y_pred_baseline.isna().sum() # 0 means the baseline has no missing predictions

0

from sklearn.ensemble import RandomForestRegressor

# Hyper-parameters of the main model; n_jobs=-1 uses every available core.
rf_params = {
    "n_estimators": 100,
    "max_depth": 20,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestRegressor(**rf_params)

rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=20, n_jobs=-1, random_state=42)

RandomForestRegressor(max_depth=20, n_jobs=-1, random_state=42)

# Score the random forest on the held-out test period.
y_pred_rf = rf.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(rf_mse)

mae_rf, rmse_rf

(6.356378538826315, 14.664632361972222)

# Side-by-side metrics of the two models, one row per model.
comparison_df = pd.DataFrame(
    [
        ("Baseline (Lag-1)", mae_baseline, rmse_baseline),
        ("Random Forest", mae_rf, rmse_rf),
    ],
    columns=["Model", "MAE", "RMSE"],
)

comparison_df.round(2)

# Importance of each feature in the fitted forest, highest first.
importance_table = pd.DataFrame({
    "feature": X_train.columns,
    "importance": rf.feature_importances_,
})
feature_importance = importance_table.sort_values(
    by="importance", ascending=False
)

feature_importance.head(17)

import matplotlib.pyplot as plt

# Up to 17 ranked features, reversed so the most important bar is drawn at
# the top of the horizontal bar chart.
top_features = feature_importance.head(17)[::-1]

plt.figure(figsize=(10, 6))
plt.barh(
    top_features["feature"],
    top_features["importance"],
    color='gold'
)

# Print the numeric importance at the end of each bar.
for position, importance in enumerate(top_features["importance"]):
    plt.text(importance, position, f"{importance:.3f}", va="center")

# The count in the title is derived from the rows actually plotted, so it
# cannot drift from the number of bars (it previously said 15 for 17 bars).
plt.title(f"Top {len(top_features)} Feature Importances (Random Forest)")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

# Error-analysis table.
# A residual is: observed value - predicted value.

# Residuals
df_errors = X_test.assign(
    y_true=y_test.values,
    y_pred=y_pred_rf,
    error=lambda frame: frame["y_true"] - frame["y_pred"],
    abs_error=lambda frame: frame["error"].abs(),
)

df_errors.head()

# --- Distribution of the absolute errors
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(df_errors["abs_error"], bins=100, color='gold')
ax.set_title("Distribution of Absolute Prediction Errors")
ax.set_xlabel("Absolute Error (|y_true - y_pred|)")
ax.set_ylabel("Frequency")
fig.tight_layout()

plt.show()

# --- Absolute error against the observed demand
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(
    df_errors["y_true"],
    df_errors["abs_error"],
    alpha=0.05,
    color='black',
)
ax.set_xlabel("Observed trips")
ax.set_ylabel("Absolute error")
ax.set_title("Error vs Observed Demand")
fig.tight_layout()

# Dashed reference line at the median absolute error.
median_abs_error = df_errors["abs_error"].median()
ax.axhline(median_abs_error, linestyle="--", color="gold")

plt.show()

# --- Mean absolute error for each hour of the day
error_by_hour = df_errors.groupby("hour")["abs_error"].mean()

plt.figure(figsize=(10, 5))
ax = error_by_hour.plot(color='gold')
ax.set_title("Mean Absolute Error by Hour of Day")
ax.set_xlabel("Hour of day")
ax.set_ylabel("Mean Absolute Error")
ax.grid(True)
plt.tight_layout()

# Shade the hour ranges labelled below.
ax.axvspan(7, 9, alpha=0.15, label="Morning rush", color='pink')
ax.axvspan(16, 19, alpha=0.15, label="Evening rush")
ax.axvspan(20, 23, alpha=0.15, label="Evening social hours", color='blue')
ax.legend()

# Point an arrow at the hour with the largest mean error.
peak_hour = error_by_hour.idxmax()

ax.annotate(
    "Peak error (high volatility)",
    xy=(peak_hour, error_by_hour.loc[peak_hour]),
    xytext=(peak_hour - 6, error_by_hour.max() - 1),
    arrowprops=dict(arrowstyle="->"),
)


plt.show()

# The ten largest individual errors.
df_errors.sort_values(by="abs_error", ascending=False).iloc[:10]

def simulate_zone_forecast_walkforward(
    df_model: pd.DataFrame,
    model,
    X_columns,
    pu_location_id: int,
    start_ts: str,
    horizon: int,
):
    """Forecast hourly trips for one pickup location, one step at a time.

    Starting from ``start_ts``, the function predicts the next ``horizon``
    hours. After each step the prediction is appended to the history buffer,
    so later steps use earlier predictions as their lag / rolling inputs.

    Args:
        df_model: Frame with at least ``pu_location_id``, ``hour_ts`` and
            ``trips``; any ``borough_*`` columns are copied into the features.
        model: Fitted estimator exposing ``predict`` on a one-row DataFrame.
        X_columns: Feature names, in the order the model expects.
        pu_location_id: Pickup location to forecast.
        start_ts: Last observed timestamp; must exist for this location.
        horizon: Number of hourly steps to forecast.

    Returns:
        A tuple ``(forecast_df, y_true_t0)``: one row per forecast step, and
        the observed ``trips`` value at ``start_ts``.

    Raises:
        ValueError: If the location is absent from ``df_model`` or has no row
            at ``start_ts``.

    NOTE(review): the history buffer is the last 168 *rows* of the location up
    to ``start_ts``. Rows are treated as consecutive hours, which holds only
    if the location has a row for every hour -- confirm for sparse locations.
    """
    weekly_window = 168  # one week of hourly rows: longest lag / rolling span
    daily_window = 24

    t0 = pd.Timestamp(start_ts)
    feature_columns = list(X_columns)

    # --- 1) history of this location, ordered in time
    zone_hist = df_model[df_model["pu_location_id"] == pu_location_id]
    if zone_hist.empty:
        raise ValueError("Zone inconnue dans df_model.")

    zone_hist = zone_hist.sort_values("hour_ts").set_index("hour_ts")

    # Index membership avoids materialising a set of every timestamp.
    if t0 not in zone_hist.index:
        raise ValueError("start_ts n'existe pas pour cette zone. Choisis un timestamp existant.")

    # Observed value at t0 (returned for the demo)
    y_true_t0 = float(zone_hist.loc[t0, "trips"])

    # Borough dummies do not change during the walk, so they are read once
    # from the t0 row instead of at every step.
    row_t0 = zone_hist.loc[t0]
    borough_flags = {
        name: bool(row_t0.get(name, False))
        for name in feature_columns
        if name.startswith("borough_")
    }

    # --- 2) history buffer: up to the last 168 rows ending at t0
    hist = zone_hist.loc[:t0, "trips"].tail(weekly_window).copy()

    def build_features(ts, past):
        """Build the one-row feature frame for timestamp ``ts`` from ``past``."""
        n_obs = len(past)
        weekday = ts.weekday()  # 0..6, Monday = 0
        has_day = n_obs >= daily_window
        has_week = n_obs >= weekly_window

        row = {
            "hour": ts.hour,
            "weekday": weekday,
            "is_weekend": weekday >= 5,
            "month": ts.month,
            # Lags and means are NaN when the buffer is too short.
            "trips_lag_1h": past.iloc[-1] if n_obs >= 1 else np.nan,
            "trips_lag_24h": past.iloc[-daily_window] if has_day else np.nan,
            "trips_lag_168h": past.iloc[-weekly_window] if has_week else np.nan,
            "trips_roll_mean_24h": past.tail(daily_window).mean() if has_day else np.nan,
            "trips_roll_mean_168h": past.tail(weekly_window).mean() if has_week else np.nan,
        }
        row.update(borough_flags)

        # Any expected column that is still missing is filled with 0.
        for name in feature_columns:
            row.setdefault(name, 0)

        return pd.DataFrame([row])[feature_columns]

    def value_or_none(cell):
        """Return ``cell`` as a float, or None when it is missing."""
        return None if pd.isna(cell) else float(cell)

    # --- 3) walk-forward prediction
    preds = []
    for step in range(1, horizon + 1):
        ts_next = t0 + pd.Timedelta(hours=step)

        X_next = build_features(ts_next, hist)

        # Clamp at zero: a trip count cannot be negative.
        y_hat = max(0.0, float(model.predict(X_next)[0]))

        preds.append({
            "forecast_for": ts_next,
            "pu_location_id": pu_location_id,
            "pred_trips": y_hat,
            "used_lag_1h": float(X_next["trips_lag_1h"].iloc[0]),
            "used_lag_24h": value_or_none(X_next["trips_lag_24h"].iloc[0]),
            "used_roll_24h": value_or_none(X_next["trips_roll_mean_24h"].iloc[0]),
        })

        # Feed the prediction back as the newest observation, capped at 168.
        hist = pd.concat([hist, pd.Series([y_hat])], ignore_index=True)
        hist = hist.tail(weekly_window)

    return pd.DataFrame(preds), y_true_t0

# Demo run: 24-hour walk-forward forecast for one pickup location.
pu = 237            # pickup location id to forecast
ts = "2024-07-05 18:00:00"   # must be a timestamp that exists for this location

forecast_df, y_true_t0 = simulate_zone_forecast_walkforward(
    df_model=df_model,
    model=rf,
    X_columns=X_train.columns,
    pu_location_id=pu,
    start_ts=ts,
    horizon=24
)

y_true_t0, forecast_df

(163.0,
           forecast_for  pu_location_id  pred_trips  used_lag_1h  used_lag_24h  \
 0  2024-07-05 19:00:00             237  155.331634   163.000000          97.0   
 1  2024-07-05 20:00:00             237  135.194161   155.331634          82.0   
 2  2024-07-05 21:00:00             237  122.112945   135.194161          72.0   
 3  2024-07-05 22:00:00             237   95.154547   122.112945          80.0   
 4  2024-07-05 23:00:00             237   71.640927    95.154547          42.0   
 5  2024-07-06 00:00:00             237   48.417043    71.640927          43.0   
 6  2024-07-06 01:00:00             237   29.749044    48.417043          16.0   
 7  2024-07-06 02:00:00             237   14.740655    29.749044           3.0   
 8  2024-07-06 03:00:00             237    7.712255    14.740655           2.0   
 9  2024-07-06 04:00:00             237    6.008172     7.712255           3.0   
 10 2024-07-06 05:00:00             237    5.849520     6.008172           5.0   
 11 2024-07-06 06:00:00             237   10.628500     5.849520          10.0   
 12 2024-07-06 07:00:00             237   19.451888    10.628500          28.0   
 13 2024-07-06 08:00:00             237   42.179179    19.451888          72.0   
 14 2024-07-06 09:00:00             237   65.190020    42.179179          80.0   
 15 2024-07-06 10:00:00             237   96.089547    65.190020         108.0   
 16 2024-07-06 11:00:00             237  128.687549    96.089547         175.0   
 17 2024-07-06 12:00:00             237  149.872694   128.687549         196.0   
 18 2024-07-06 13:00:00             237  164.386619   149.872694         192.0   
 19 2024-07-06 14:00:00             237  180.989126   164.386619         203.0   
 20 2024-07-06 15:00:00             237  187.273025   180.989126         198.0   
 21 2024-07-06 16:00:00             237  179.578330   187.273025         194.0   
 22 2024-07-06 17:00:00             237  186.537123   179.578330         179.0   
 23 2024-07-06 18:00:00             237  190.761476   186.537123         163.0   
 
     used_roll_24h  
 0       93.458333  
 1       95.888818  
 2       98.105241  
 3      100.193281  
 4      100.824720  
 5      102.059759  
 6      102.285469  
 7      102.858346  
 8      103.347540  
 9      103.585550  
 10     103.710891  
 11     103.746288  
 12     103.772475  
 13     103.416304  
 14     102.173770  
 15     101.556687  
 16     101.060418  
 17      99.130733  
 18      97.208762  
 19      96.058204  
 20      95.141084  
 21      94.694127  
 22      94.093224  
 23      94.407271  )

# Plot the simulated forecast for the chosen location.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(
    forecast_df["forecast_for"],
    forecast_df["pred_trips"],
    marker="o",
    color="gold",
)
ax.set_title(f"Forecast - zone {pu} (next hours)")
ax.set_xlabel("Time")
ax.set_ylabel("Predicted trips")
plt.xticks(rotation=45)
ax.grid(True)
fig.tight_layout()

plt.show()

# Model bundle for the demo: the fitted forest plus its feature order.

import joblib

bundle = dict(model=rf, feature_columns=X_train.columns.tolist())

joblib.dump(bundle, "model_bundle.pkl")

['model_bundle.pkl']

# Lightweight dataset for Streamlit: only the columns the app needs,
# not the whole df_model.

borough_cols = [name for name in df_model.columns if name.startswith("borough_")]

app_columns = ["hour_ts", "pu_location_id", "trips", *borough_cols]
df_app = df_model.loc[:, app_columns].copy()
df_app["hour_ts"] = pd.to_datetime(df_app["hour_ts"])

df_app.to_parquet("df_app.parquet", index=False)

# Smaller model, used only for the Streamlit demo (deployment).
rf_demo_params = {
    "n_estimators": 30,
    "max_depth": 10,
    "random_state": 42,
    "n_jobs": -1,
}
rf_demo = RandomForestRegressor(**rf_demo_params)

rf_demo.fit(X_train, y_train)

bundle_demo = dict(model=rf_demo, feature_columns=X_train.columns.tolist())

joblib.dump(bundle_demo, "model_bundle_demo.pkl")

['model_bundle_demo.pkl']

	hour_ts	pu_location_id	trips	avg_trip_distance	avg_total_amount	pct_cash	pct_card
0	2022-01-26 09:00:00	87	21	4.330952	23.917619	19.047619	71.428571
1	2022-01-26 13:00:00	89	1	5.200000	27.000000	0.000000	100.000000
2	2022-01-27 06:00:00	93	1	0.210000	174.360000	0.000000	100.000000
3	2022-01-26 07:00:00	42	8	2.378750	14.601250	62.500000	37.500000
4	2022-01-24 16:00:00	50	42	2.923333	18.084750	30.952381	59.523810

	pu_location_id	borough	zone
0	1	EWR	Newark Airport
1	2	Queens	Jamaica Bay
2	3	Bronx	Allerton/Pelham Gardens
3	4	Manhattan	Alphabet City
4	5	Staten Island	Arden Heights

	hour_ts	pu_location_id	trips	avg_trip_distance	avg_total_amount	pct_cash	pct_card	borough	zone
0	2022-01-26 09:00:00	87	21	4.330952	23.917619	19.047619	71.428571	Manhattan	Financial District North
1	2022-01-26 13:00:00	89	1	5.200000	27.000000	0.000000	100.000000	Brooklyn	Flatbush/Ditmas Park
2	2022-01-27 06:00:00	93	1	0.210000	174.360000	0.000000	100.000000	Queens	Flushing Meadows-Corona Park
3	2022-01-26 07:00:00	42	8	2.378750	14.601250	62.500000	37.500000	Manhattan	Central Harlem North
4	2022-01-24 16:00:00	50	42	2.923333	18.084750	30.952381	59.523810	Manhattan	Clinton West

	hour_ts	pu_location_id	trips	avg_trip_distance	avg_total_amount	pct_cash	pct_card	borough	zone	pct_sum	hour	weekday	is_weekend	month	year
2639147	2024-11-03 01:00:00	79	1239	2.428515	22.308139	5.649718	84.907183	Manhattan	East Village	90.556901	1	6	True	11	2024
737986	2022-11-06 01:00:00	79	1115	2.931973	18.220752	6.816143	89.955157	Manhattan	East Village	96.771300	1	6	True	11	2022
1615645	2023-11-05 01:00:00	79	1106	2.291646	23.086694	6.871609	81.826401	Manhattan	East Village	88.698011	1	6	True	11	2023
2639183	2024-11-03 01:00:00	148	879	2.713675	23.967356	4.323094	84.414107	Manhattan	Lower East Side	88.737201	1	6	True	11	2024
2639232	2024-11-03 01:00:00	249	862	2.336462	22.369906	6.844548	79.930394	Manhattan	West Village	86.774942	1	6	True	11	2024
2037237	2024-04-14 01:00:00	79	849	2.554723	22.068698	4.240283	64.428740	Manhattan	East Village	68.669022	1	6	True	4	2024
1896413	2024-02-25 01:00:00	79	846	2.239421	21.546734	5.437352	65.602837	Manhattan	East Village	71.040189	1	6	True	2	2024
1615710	2023-11-05 01:00:00	249	837	2.295006	23.515884	4.181601	85.185185	Manhattan	West Village	89.366786	1	6	True	11	2023
2780088	2024-12-18 18:00:00	237	830	1.469518	24.069842	8.795181	82.409639	Manhattan	Upper East Side South	91.204819	18	2	False	12	2024
1896306	2024-02-25 00:00:00	79	803	2.117572	21.978279	5.479452	67.372354	Manhattan	East Village	72.851806	0	6	True	2	2024

	hour_ts	pu_location_id	trips	avg_trip_distance	avg_total_amount	pct_cash	pct_card	borough	zone	pct_sum
0	2022-01-01	4	11	3.200000	18.520909	54.545455	45.454545	Manhattan	Alphabet City	100.000000
1	2022-01-01	7	6	3.531667	15.353333	50.000000	16.666667	Queens	Astoria	66.666667
2	2022-01-01	10	1	7.910000	25.300000	0.000000	100.000000	Queens	Baisley Park	100.000000
3	2022-01-01	12	2	6.640000	29.025000	50.000000	50.000000	Manhattan	Battery Park	100.000000
4	2022-01-01	13	12	4.325000	22.370000	25.000000	66.666667	Manhattan	Battery Park City	91.666667

	zone	hour_ts	trips	trips_lag_1h	trips_lag_24h	trips_lag_168h
404	Allerton/Pelham Gardens	2022-01-01 04:00:00	1	NaN	NaN	NaN
1207	Allerton/Pelham Gardens	2022-01-01 13:00:00	1	1.0	NaN	NaN
9541	Allerton/Pelham Gardens	2022-01-05 12:00:00	1	1.0	NaN	NaN
9649	Allerton/Pelham Gardens	2022-01-05 13:00:00	1	1.0	NaN	NaN
10062	Allerton/Pelham Gardens	2022-01-05 17:00:00	1	1.0	NaN	NaN
...	...	...	...	...	...	...
2819245	Yorkville West	2024-12-31 19:00:00	151	147.0	51.0	91.0
2819395	Yorkville West	2024-12-31 20:00:00	238	151.0	64.0	101.0
2819577	Yorkville West	2024-12-31 21:00:00	252	238.0	78.0	90.0
2819772	Yorkville West	2024-12-31 22:00:00	196	252.0	58.0	91.0
2819945	Yorkville West	2024-12-31 23:00:00	97	196.0	37.0	53.0

	borough_Bronx	borough_Brooklyn	borough_EWR	borough_Manhattan	borough_N/A	borough_Queens	borough_Staten Island	borough_Unknown
293621	True	False	False	False	False	False	False	False
301601	True	False	False	False	False	False	False	False
301836	True	False	False	False	False	False	False	False
304465	True	False	False	False	False	False	False	False
306822	True	False	False	False	False	False	False	False
...	...	...	...	...	...	...	...	...
2819245	False	False	False	True	False	False	False	False
2819395	False	False	False	True	False	False	False	False
2819577	False	False	False	True	False	False	False	False
2819772	False	False	False	True	False	False	False	False
2819945	False	False	False	True	False	False	False	False

	hour	weekday	is_weekend	month	trips_lag_1h	trips_lag_24h	trips_lag_168h	trips_roll_mean_24h	trips_roll_mean_168h	borough_Bronx	borough_Brooklyn	borough_EWR	borough_Manhattan	borough_N/A	borough_Queens	borough_Staten Island	borough_Unknown
293621	23	5	True	5	1.0	1.0	1.0	1.041667	1.071429	True	False	False	False	False	False	False	False
301601	9	2	False	5	1.0	1.0	1.0	1.041667	1.071429	True	False	False	False	False	False	False	False
301836	11	2	False	5	1.0	1.0	1.0	1.041667	1.071429	True	False	False	False	False	False	False	False
304465	13	3	False	5	1.0	1.0	1.0	1.041667	1.071429	True	False	False	False	False	False	False	False
306822	12	4	False	5	1.0	1.0	1.0	1.041667	1.071429	True	False	False	False	False	False	False	False

	feature	importance
4	trips_lag_1h	9.126270e-01
5	trips_lag_24h	3.356686e-02
0	hour	1.605595e-02
6	trips_lag_168h	1.454877e-02
7	trips_roll_mean_24h	9.257865e-03
8	trips_roll_mean_168h	6.347834e-03
1	weekday	3.522569e-03
3	month	2.159529e-03
14	borough_Queens	7.753297e-04
12	borough_Manhattan	6.087721e-04
2	is_weekend	5.031331e-04
16	borough_Unknown	1.863250e-05
13	borough_N/A	4.122380e-06
10	borough_Brooklyn	3.400801e-06
11	borough_EWR	1.267095e-07
9	borough_Bronx	9.415391e-08
15	borough_Staten Island	2.992754e-09

	hour	weekday	is_weekend	month	trips_lag_1h	trips_lag_24h	trips_lag_168h	trips_roll_mean_24h	trips_roll_mean_168h	borough_Bronx	...	borough_EWR	borough_Manhattan	borough_N/A	borough_Queens	borough_Staten Island	borough_Unknown	y_true	y_pred	error	abs_error
1758081	1	0	False	1	1.0	1.0	1.0	1.041667	1.095238	True	...	False	False	False	False	False	False	1	1.013093	-0.013093	0.013093
1761180	6	1	False	1	1.0	1.0	1.0	1.041667	1.095238	True	...	False	False	False	False	False	False	1	1.032762	-0.032762	0.032762
1761310	7	1	False	1	1.0	1.0	1.0	1.041667	1.095238	True	...	False	False	False	False	False	False	1	1.053056	-0.053056	0.053056
1762133	13	1	False	1	1.0	1.0	1.0	1.041667	1.095238	True	...	False	False	False	False	False	False	1	1.032524	-0.032524	0.032524
1762751	18	1	False	1	1.0	1.0	2.0	1.041667	1.089286	True	...	False	False	False	False	False	False	1	1.057114	-0.057114	0.057114

	hour	weekday	is_weekend	month	trips_lag_1h	trips_lag_24h	trips_lag_168h	trips_roll_mean_24h	trips_roll_mean_168h	borough_Bronx	...	borough_EWR	borough_Manhattan	borough_N/A	borough_Queens	borough_Staten Island	borough_Unknown	y_true	y_pred	error	abs_error
2639147	1	6	True	11	673.0	792.0	794.0	255.625000	134.482143	False	...	False	True	False	False	False	False	1239	714.380000	524.620000	524.620000
1768137	21	3	False	1	308.0	154.0	150.0	186.333333	122.726190	False	...	False	True	False	False	False	False	668	230.862550	437.137450	437.137450
2639183	1	6	True	11	397.0	456.0	350.0	138.291667	66.898810	False	...	False	True	False	False	False	False	879	444.199279	434.800721	434.800721
2761827	21	3	False	12	381.0	697.0	318.0	219.083333	204.571429	False	...	False	True	False	False	False	False	771	342.438322	428.561678	428.561678
2134968	22	4	False	5	259.0	285.0	369.0	179.916667	182.202381	False	...	False	True	False	False	False	False	651	262.688156	388.311844	388.311844
2150218	23	2	False	5	648.0	146.0	588.0	192.250000	174.327381	False	...	False	True	False	False	False	False	168	542.303333	-374.303333	374.303333
1931999	22	4	False	3	269.0	424.0	180.0	162.875000	169.613095	False	...	False	True	False	False	False	False	661	290.020369	370.979631	370.979631
2758773	21	2	False	12	389.0	635.0	310.0	223.291667	201.982143	False	...	False	True	False	False	False	False	697	344.958818	352.041182	352.041182
1823095	22	4	False	1	206.0	324.0	156.0	144.500000	146.404762	False	...	False	True	False	False	False	False	547	198.138736	348.861264	348.861264
2735441	0	2	False	12	381.0	514.0	244.0	246.625000	185.690476	False	...	False	False	False	True	False	False	71	411.842734	-340.842734	340.842734

	Model	MAE	RMSE
0	Baseline (Lag-1)	9.89	23.05
1	Random Forest	6.36	14.66

Project : NYC Yellow Taxi Demand Analysis (2022–2024)

Step 1 : Data Import

Objective :¶

Objectif :¶

1.1 – Libraries import¶

1.2 – Database connection¶

1.3 – Load mart table¶

1.4 – First inspection¶

1.5 – Data scope verification¶

1.6 – Enrich data with taxi zone information¶

Step 2 : Data Cleaning & Preprocessing (Analytical level)

Objective :¶

Objectif :¶

2.1 – Missing values (NA check)¶

2.2 Target Variable Validation (trips)¶

2.3 Payment Percentage Consistency Check¶

2.4 Duplicate Records Verification¶

2.5 Chronological Sorting¶

2.6 Temporal Feature Preparation¶

Step 3 : Exploratory Data Analysis (EDA)

Objective :¶

Objectif :¶

3.1 Global Demand Distribution¶

Question¶

3.2 Hourly Demand Pattern¶

Question¶

3.3 Weekday vs Weekend Demand¶

Question¶

3.4 Spatial Analysis – Demand by Borough¶

Question¶

3.5 Top Pickup Zones¶

Question¶

3.6 Demand Evolution Over Time¶

Question¶

3.7 Key EDA Takeaways - EN¶

3.7 Key EDA Takeaways - FR¶

Step 4 : Feature Engineering

Objective :¶

Objectif :¶

4.1 Define Target and Base Features¶

4.2 Lag Features (historical demand)¶

4.3 Rolling Statistics (Moving Averages)¶

4.4 Handle NaN Values¶

4.5 Encoding Categorical Variables¶

4.6 Final Dataset for Modeling¶

Step 5 : Modeling (ML)

Objective:¶

Objectif:¶

5.1 Train/Test Split (Time-aware)¶

5.2 Baseline Model (Naive Forecast)¶

5.3 Random Forest Regressor¶

5.4 Feature Importance¶

5.5 Error analysis (où le modèle se trompe)¶

5.6 Conclusion & limits¶

Step 6 : Mini simulation de prédiction

Démonstration :¶

Step 7 : Streamlit

déployer l'app sur Streamlit Cloud¶