# Hold-out split: part of the data is set aside as a test set.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=..., random_state=...)
# set random_state to an integer value so the split is reproducible

# Template: a transformer is fitted on the training data, then applied to it.
transformer = NomTransformation(parametres)
transformer.fit(X_train)
transformer.transform(X_train)

from sklearn.preprocessing import RobustScaler

# Toy training data: 3 samples x 3 features.
X_train = [[ 1., -2.,  2.],
           [ -2.,  1.,  3.],
           [ 4.,  1., -2.]]
# fit() learns the per-feature statistics (median and interquartile range
# with the default settings) from the training data.
transformer = RobustScaler().fit(X_train)

# transform() applies the learned centering and scaling.
transformer.transform(X_train)

# Expected output (kept as a comment: pasted as code it raises NameError on `array`):
# array([[ 0. , -2. ,  0. ],
#        [-1. ,  0. ,  0.4],
#        [ 1. ,  0. , -1.6]])

# Template: instantiate a model with its parameters and hyperparameters, then train it.
model = NomDuModele(parametres, hyperparametres)
model.fit(X_train, y_train)  # supervised learning
model.fit(X_train)  # unsupervised learning

# Same template with every hyperparameter left at its default value.
model = NomDuModele()
model.fit()

# The halving searches are experimental: this import must run before
# HalvingGridSearchCV can be imported from sklearn.model_selection.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV

# Template grid: each key is a hyperparameter name, each value the list of
# candidate values to try (val1, val2 are placeholders).
param_grid = {
    "param_continu_1": [val1, val2, ...],
    "param_continu_2": [val1, val2, ...],
    "param_discret_1": ("val1", "val2"),
    # ... other hyperparameters
}

# Exhaustive search over the grid with 5-fold cross-validation; refit=True
# retrains the best configuration at the end. Other options may be added.
search = GridSearchCV(model, param_grid, refit=True, cv=5)
search.fit(X_train, y_train)

# Successive-halving search over the same grid. The keyword is `resource`
# (not `ressource`), and `n_candidates` exists only on HalvingRandomSearchCV;
# the grid variant controls the starting budget with `min_resources`.
search = HalvingGridSearchCV(model, param_grid, resource='n_samples', min_resources='exhaust', factor=3)
search.fit(X_train, y_train)

import scipy.stats
# The halving searches are experimental: this import must run before
# HalvingRandomSearchCV can be imported from sklearn.model_selection.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingRandomSearchCV, RandomizedSearchCV

# Template search space: a value is either a distribution to sample from
# or a collection of discrete candidates.
param_grid = {
    # Values are drawn from an exponential distribution starting at 0 with
    # scale (mean) 100; the distribution is unbounded, so draws can exceed 100.
    "param_continu_1": scipy.stats.expon(loc=0, scale=100),
    "param_discret_1": ("val1", "val2"),
    # ... other hyperparameters
}

# Random search with 5-fold cross-validation; random_state makes the draws reproducible.
search = RandomizedSearchCV(model, param_grid, refit=True, cv=5, random_state=42)
search.fit(X_train, y_train)

# Successive-halving variant of the random search.
search = HalvingRandomSearchCV(model, param_grid, resource='n_samples', n_candidates='exhaust', factor=3)
search.fit(X_train, y_train)

# Best hyperparameter combination and its score, as reported by the search object
search.best_params_
search.best_score_

# Estimator configured with the best hyperparameters
search.best_estimator_

model.predict(X_test) # predictions on the test set
model.predict_proba(X_test) # only available for some models
model.score(X_test,y_test) # score computed on the test set

from sklearn.metrics import auc, roc_auc_score

# `auc(x, y)` only integrates a curve given its x/y points; it is not a model
# score and must not be fed features and labels. The ROC AUC of a classifier
# is computed from the true labels and the predicted scores.
# Binary case shown here: column 1 holds the probability of the positive class.
roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

# learning_curve returns, for each training-set size, one score per
# cross-validation fold (rows = sizes, columns = folds).
train_sizes, train_scores, test_scores = learning_curve(model, X, y)
# Average over the folds (axis=1) to get one point per training-set size.
plt.plot(train_sizes, np.mean(train_scores, axis=1), label="Training")
plt.plot(train_sizes, np.mean(test_scores, axis=1), label="Validation")
plt.legend()  # without a legend the labels above are never shown

# Keyword arguments forwarded to LearningCurveDisplay.from_estimator.
params = {
    "X": X_train,
    "y": y_train,
    "train_sizes": np.linspace(0.1, 1.0, 5),
    "cv": ShuffleSplit(n_splits=50, test_size=0.2, random_state=0),
    "score_type": "both",
}

# Named `curve_display` rather than `display` so that it does not shadow the
# IPython `display()` function called later in this file.
# Other keyword options may be appended to the call.
curve_display = LearningCurveDisplay.from_estimator(model, **params)

import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import LearningCurveDisplay, train_test_split
from sklearn.svm import SVC

# Load the digits dataset and hold out 10% of it as a test set.
X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.1
)

# NOTE: the hyperparameters used here are values that were already tuned.
svc = SVC(gamma=0.001, kernel="rbf")

# Learning-curve settings: 5 training sizes from 10% to 100% of the training
# set, 5-fold cross-validation, both train and test scores plotted.
params = dict(
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, num=5),
    cv=5,
    score_type="both",
)

# The trailing semicolon keeps the notebook from echoing the returned object.
LearningCurveDisplay.from_estimator(svc, **params);

# Template: evaluate a model with cross-validation.
cv_results = cross_validate(model, X, y, cv=...)
cv_results['test_score'] # the 'test_score' key stores the score of each cross-validated fit

import numpy as np
import pandas as pd

# Download the insurance dataset and preview its first rows.
data = pd.read_csv(
    "https://filedn.eu/lefeldrXcsSFgCcgc48eaLY/datasets/regression/data_insurance.csv"
)
data.head()

# 'charges' is the target; every other column is kept as a feature.
y = data['charges']
X = data.drop(columns='charges')

# Hold out 33% of the rows as a test set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Two-step pipeline: an imputer followed by a scaler, each step given a name.
pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

# Fit and apply the pipeline on the 'age' column only; show the first three rows.
pipe.fit_transform(X_train[['age']])[0:3]

# access the steps: by position...
pipe[0]

# ...or by name
pipe['imputer']

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Imputation and scaling of the numeric variables
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())])

# Encoding of the categorical features
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Apply "num_transformer" and "cat_transformer" side by side, each on its own columns
preprocessor = ColumnTransformer([
    ('num_tr', num_transformer, ['age','bmi']),
    ('cat_tr', cat_transformer, ['smoker', 'region'])])

# render pipelines as HTML diagrams
from sklearn import set_config; set_config(display='diagram')
preprocessor

X_train_transformed = preprocessor.fit_transform(X_train)

# Compare the raw rows with their transformed counterpart
display(X_train.head(3))
display(pd.DataFrame(X_train_transformed).head(3))

# Names of the input columns seen during fit
preprocessor.feature_names_in_

# Compatibility shim: in scikit-learn 1.0.2 SimpleImputer has no
# get_feature_names_out method yet. Patch it only when it is missing, so the
# real implementation shipped by newer releases is never overridden.
if not hasattr(SimpleImputer, "get_feature_names_out"):
    SimpleImputer.get_feature_names_out = (lambda self, names=None: self.feature_names_in_)

# Names of the columns produced by the fitted preprocessor
preprocessor.get_feature_names_out()

# Pass the names array directly: wrapping it in a list would build a
# MultiIndex instead of plain column labels.
pd.DataFrame(X_train_transformed, columns=preprocessor.get_feature_names_out()).head()

preprocessor

# Same preprocessor, but columns not listed in any transformer are now kept
# untouched (remainder='passthrough') instead of being dropped.
preprocessor = ColumnTransformer([
    ('num_tr', num_transformer, ['age','bmi']),
    ('cat_tr', cat_transformer, ['region','smoker'])],
    remainder='passthrough')
preprocessor

pd.DataFrame(preprocessor.fit_transform(X_train),
             columns=preprocessor.get_feature_names_out()).head(3)

from sklearn.preprocessing import FunctionTransformer

# Stateless transformer rounding every value to 2 decimals.
# feature_names_out="one-to-one" makes get_feature_names_out return the input
# names unchanged. Assigning a lambda on the instance does not work: it is not
# bound to `self`, and it is lost when the transformer is cloned at fit time.
# Passing np.round with kw_args (instead of a lambda) also keeps it picklable.
rounder = FunctionTransformer(np.round, kw_args={"decimals": 2},
                              feature_names_out="one-to-one")

# Numeric branch: impute, scale, then round.
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('rounder', rounder)])

preprocessor = ColumnTransformer([
    ('num_tr', num_transformer, ['bmi', 'age']),
    ('cat_tr', cat_transformer, ['region', 'smoker'])],
    remainder='passthrough')
preprocessor

pd.DataFrame(preprocessor.fit_transform(X_train)).head(3)

from sklearn.pipeline import FeatureUnion

# Build a new feature as the ratio of two existing ones (bmi divided by age)
bmi_age_ratio = FunctionTransformer(lambda df: pd.DataFrame(df["bmi"] / df["age"]))

# Combine the preprocessor and the ratio transformer in a single FeatureUnion
union = FeatureUnion([
    ('preprocess', preprocessor), # output of the preprocessing ColumnTransformer
    ('bmi_age_ratio', bmi_age_ratio) # the extra bmi/age column
])
union

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer

# Pipeline with explicitly named steps...
Pipeline([
    ('my_name_for_imputer', SimpleImputer()),
    ('my_name_for_scaler', StandardScaler())
])

# ...is equivalent to (step names are then generated automatically):
make_pipeline(SimpleImputer(), StandardScaler())

# Rebuild the preprocessing with the make_* shortcuts
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

preproc_basic = make_column_transformer((num_transformer, ['age', 'bmi']),
                                       (cat_transformer, ['smoker', 'region']),
                                       remainder='passthrough')

preproc_full = make_union(preproc_basic, bmi_age_ratio)
preproc_full

# Inspect the column dtypes used by the selectors below
X_train.dtypes

from sklearn.compose import make_column_selector

# Selectors pick columns by dtype instead of by name
# NOTE(review): only float64 columns are selected as numeric; with
# remainder='passthrough' below, columns of any other numeric dtype would be
# passed through unscaled — confirm this is intended.
num_col = make_column_selector(dtype_include=['float64'])
cat_col = make_column_selector(dtype_include=['object','bool'])

# Each transformer is paired with the selector choosing its columns
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
num_col = make_column_selector(dtype_include=['float64'])

cat_transformer = OneHotEncoder()
cat_col = make_column_selector(dtype_include=['object','bool'])

preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough')

preproc_full = make_union(preproc_basic, bmi_age_ratio)
preproc_full

from sklearn.linear_model import Ridge
# cross_val_score is used for the score report below and must be imported.
from sklearn.model_selection import cross_val_score

# Preprocessing pipeline
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

preproc = make_column_transformer(
    (num_transformer, make_column_selector(dtype_include=['float64'])),
    (cat_transformer, make_column_selector(dtype_include=['object','bool'])),
    remainder='passthrough')

# Append the model as the last step
pipe = make_pipeline(preproc, Ridge())
pipe

# Preprocessing and model training in a single call
pipe.fit(X_train,y_train)

# Predictions on the first two test rows
pipe.predict(X_test.iloc[0:2])

# Scores: mean 5-fold cross-validated R2 on the train set, then the score on the test set
print(f"Score cross-validé moyen sur le train set: {cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2').mean()}")
print(f"Score sur le test set:{pipe.score(X_test,y_test)}")

from sklearn.model_selection import GridSearchCV

# List every parameter of every component of the pipeline
pipe.get_params().keys()

pipe.get_params()['columntransformer']

grid_search = GridSearchCV(
    pipe, 
    param_grid={
        # grid of hyperparameters to try; nested components are addressed with "__"
        'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
        'ridge__alpha': [0.1, 0.5, 1, 5, 10]},
    cv=5,
    scoring="r2")

# trains the whole pipeline and retrains it with the best parameters found
grid_search.fit(X_train, y_train)
grid_search.best_params_

# keep the tuned pipeline for the following steps
pipe_tuned = grid_search.best_estimator_

from tempfile import mkdtemp
from shutil import rmtree

# Create a temp folder
cachedir = mkdtemp()

# Instantiate the pipeline with cache parameter
pipe = make_pipeline(preproc, Ridge(), memory=cachedir)

# Clear the cache directory after the cross-validation
# NOTE(review): no fit or cross-validation call is shown between the pipeline
# creation and this cleanup — the training step is presumably meant to run
# in between; confirm.
rmtree(cachedir)

# access each component of the tuned pipeline by name
pipe_tuned.named_steps.keys()

# check an intermediate step: the tuned pipeline is already fitted, so use
# transform() to inspect its output without refitting (and mutating) the step
pipe_tuned.named_steps["columntransformer"].transform(X_train).shape

# Python's pickle module can serialize most Python objects, including a fitted pipeline
import pickle

# Build the destination path of the exported file
from pathlib import Path
import os
export_path = Path("/home/nico/code/demos/")
pipeline_file = export_path / "pipeline.pkl"

# export the pipeline
with open(pipeline_file, "wb") as file:
    pickle.dump(pipe_tuned, file)

# reload the pipeline; the context manager guarantees the file handle is closed
# SECURITY: unpickling executes arbitrary code — only load files you trust.
with open(pipeline_file, "rb") as file:
    my_pipeline = pickle.load(file)

# score the reloaded, already-trained pipeline on the test set
my_pipeline.score(X_test, y_test)

Leçon: Chaîne de traitement (workflow) en machine learning¶

Les différentes étapes¶

Bonnes pratiques (minimales) pour la construction de votre chaine de traitement¶

Etapes de traitement quasi-indispensables (pseudo-code)¶

Utiliser la méthode hold-out¶

Pour apprendre et appliquer une transformation (souvent pendant le pré-traitement)¶

Exemple : appliquer un `RobustScaler` aux données¶

Pour entrainer un modèle¶

Attention à toujours faire une optimisation ! ⚠️⚠️¶

Si vous n'avez pas de méthode d'optimisation spécifique: utilisez les méthodes de type GridSearchCV ¶

Pour le `GridSearchCV`¶

Pour le `HalvingGridSearchCV`¶

Pour le `RandomizedSearchCV` ou le HalvingRandomSearchCV ¶

Evaluer un modèle entraîné¶

Représenter une courbe d'apprentissage ¶

Exemple avec un SVC entraîné sur le dataset digits:¶

Utiliser la validation croisée pour évaluer un modèle¶

Améliorer les données d'entrée¶

La réduction de dimensionnalité¶

La sélection de features¶

L'ingénierie de features¶

Pipelines dans scikit-learn¶

Exemples d'utilisation¶

Imputation et scaling¶

Column transformer ¶

FeatureUnion ¶

Quelques raccourcis¶

Rajoutons l'entrainement du modèle à notre pipeline¶

Rajout d'un modèle Ridge¶

Entrainement et résultats¶

Grid Search dans une pipeline¶

Mettre des transformations en cache pour économiser du temps de calcul¶

Déboguer sa pipeline¶

Etapes finales¶

Exporter sa pipeline entrainée¶

ToDo: ajouter exemple avec mlflow¶

ToDo: AutoML demos¶

Leçon: Chaîne de traitement (workflow) en machine learning¶

Les différentes étapes¶

Bonnes pratiques (minimales) pour la construction de votre chaine de traitement¶

Etapes de traitement quasi-indispensables (pseudo-code)¶

Utiliser la méthode hold-out¶

Pour apprendre et appliquer une transformation (souvent pendant le pré-traitement)¶

Exemple : appliquer un RobustScaler aux données¶

Pour entrainer un modèle¶

Attention à toujours faire une optimisation ! ⚠️⚠️¶

Si vous n'avez pas de méthode d'optimisation spécifique: utilisez les méthodes de type GridSearchCV¶

Pour le GridSearchCV¶

Pour le HalvingGridSearchCV¶

Pour le RandomizedSearchCV ou le HalvingRandomSearchCV¶

Evaluer un modèle entraîné¶

Représenter une courbe d'apprentissage¶

Exemple avec un SVC entraîné sur le dataset digits:¶

Utiliser la validation croisée pour évaluer un modèle¶

Améliorer les données d'entrée¶

La réduction de dimensionnalité¶

La sélection de features¶

L'ingénierie de features¶

Pipelines dans scikit-learn¶

Exemples d'utilisation¶

Imputation et scaling¶

Column transformer¶

FeatureUnion¶

Quelques raccourcis¶

Rajoutons l'entrainement du modèle à notre pipeline¶

Rajout d'un modèle Ridge¶

Entrainement et résultats¶

Grid Search dans une pipeline¶

Mettre des transformations en cache pour économiser du temps de calcul¶

Déboguer sa pipeline¶

Etapes finales¶

Exporter sa pipeline entrainée¶

ToDo: ajouter exemple avec mlflow¶

ToDo: AutoML demos¶

Exemple : appliquer un `RobustScaler` aux données¶

Si vous n'avez pas de méthode d'optimisation spécifique: utilisez les méthodes de type GridSearchCV ¶

Pour le `GridSearchCV`¶

Pour le `HalvingGridSearchCV`¶

Pour le `RandomizedSearchCV` ou le HalvingRandomSearchCV ¶

Représenter une courbe d'apprentissage ¶

Column transformer ¶

FeatureUnion ¶