import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Download the housing-price training set from its public URL.
filepath = "https://filedn.eu/lefeldrXcsSFgCcgc48eaLY/datasets/regression/housing-price_train.csv"
# Parse the CSV and discard the "Id" column in a single expression.
raw_df = pd.read_csv(filepath).drop(columns="Id")
raw_df.head()

# Work on a copy of the data frame so it can be compared with the raw one.
df = raw_df.copy(deep=True)

# Percentage of missing values per column, sorted from most to least affected.
percent_missing = df.isnull().sum() * 100 / len(df)
percent_missing.sort_values(ascending=False,inplace=True)

# Only columns whose missing percentage exceeds this value are plotted.
threshold_view = 2

filtered = percent_missing[percent_missing.values > threshold_view]
ax = sns.barplot(x = filtered, y = filtered.index, orient='h');
# Both fragments must carry the f prefix: adjacent literals are concatenated
# as-is, so a plain second fragment would print "{threshold_view}" literally.
# The trailing space keeps "manquantes" and "supérieures" apart.
ax.set_title(f"Répartition du pourcentage de valeurs manquantes "
             f"supérieures au seuil de {threshold_view}%");

# Any column whose missing-value percentage exceeds this cut-off is removed.
threshold = 70

# Mask the percentages and keep only the labels of the offending columns.
columns_to_drop = percent_missing.loc[percent_missing.gt(threshold)].index

columns_to_drop

Index(['PoolQC', 'MiscFeature', 'Alley', 'Fence'], dtype='object')

# Remove the selected columns from the working frame, in place.
df.drop(columns=columns_to_drop, inplace=True)

df.shape

(1460, 76)

# Preview the frame without rows that are entirely empty; the result is not
# assigned, so `df` itself is left untouched.
df.dropna(axis="index", how="all").head(n=5)

df.shape

(1460, 76)

# Count the missing entries in the "Electrical" column.
df["Electrical"].isna().sum()

1

from sklearn.impute import SimpleImputer
# Imputer configured to treat NaN as missing, with the 'most_frequent' strategy.
imputation = SimpleImputer(strategy="most_frequent", missing_values=np.nan)
# The column is reshaped into a single-column 2-D array before fitting.
imputation.fit(df["Electrical"].to_numpy().reshape(-1, 1))

SimpleImputer(strategy='most_frequent')

SimpleImputer(strategy='most_frequent')

# Preview the imputed column; the result is not stored at this point.
imputation.transform(df["Electrical"].to_numpy().reshape(-1, 1))

array([['SBrkr'],
       ['SBrkr'],
       ['SBrkr'],
       ...,
       ['SBrkr'],
       ['FuseA'],
       ['SBrkr']], dtype=object)

# Write the imputed values back into the working frame.
df.loc[:, "Electrical"] = imputation.transform(
    df["Electrical"].to_numpy().reshape(-1, 1)
)

# Confirm that the column has no missing value left.
df["Electrical"].isna().sum()

0

# Isolate the float and integer columns for the numeric imputation step.
numeric_features = df.select_dtypes(include=("float", "int"))
numeric_features.shape

(1460, 37)

from sklearn.impute import KNNImputer
# KNN-based imputer; every setting other than the missing marker is left
# at the library default.
imputation = KNNImputer(missing_values=np.nan)
# Fit and transform as two explicit steps on the same numeric frame.
imputed = imputation.fit(numeric_features).transform(numeric_features)
imputed.shape

(1460, 37)

# Overwrite the numeric columns of the working frame with the imputed array.
df.loc[:,numeric_features.columns] = imputed

def compare_dist(feature):
    """Plot the distribution of ``feature`` before and after imputation.

    Draws two histograms with a KDE overlay on a single 1x2 figure: the left
    panel reads the column from the module-level ``raw_df``, the right panel
    reads it from the module-level ``df``.

    Args:
        feature: Label of the column to plot; it must exist in both frames.
    """
    _, panels = plt.subplots(1, 2, figsize=(12, 3))
    sources = ((raw_df, "Raw"), (df, "Imputed"))
    for panel, (frame, label) in zip(panels, sources):
        sns.histplot(frame.loc[:, feature], kde=True, ax=panel)
        panel.set_title(f"{label} {feature}")

# Compare the raw and imputed distributions of the "LotFrontage" column.
compare_dist("LotFrontage")

import seaborn as sns
# Box plots of both floor-surface columns, drawn on one shared axes.
ax = sns.boxplot(data=df[["1stFlrSF", "2ndFlrSF"]])
ax.set(ylabel="Surface", title="Boxplots utilisant le critère de Tukey");

# Scatter plot of second-floor surface against first-floor surface.
ax = df.plot.scatter(x="1stFlrSF", y="2ndFlrSF")
ax.set_title("Surface des étages");

from sklearn import preprocessing

# Label-encoding demo on a small list of city names with one repeated value.
le = preprocessing.LabelEncoder()
le.fit_transform(["paris"] * 2 + ["tokyo", "amsterdam"])

array([1, 1, 2, 0])

# Ordinal-encoding demo: every column of X is encoded on its own.
oe = preprocessing.OrdinalEncoder()
X = [["Small", 1], ["Tall", 3], ["Tall", 2]]
oe.fit(X).transform(X)

array([[0., 0.],
       [1., 2.],
       [1., 1.]])

# Summarise the object-typed columns of the working frame.
df.select_dtypes(include="object").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 39 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSZoning       1460 non-null   object
 1   Street         1460 non-null   object
 2   LotShape       1460 non-null   object
 3   LandContour    1460 non-null   object
 4   Utilities      1460 non-null   object
 5   LotConfig      1460 non-null   object
 6   LandSlope      1460 non-null   object
 7   Neighborhood   1460 non-null   object
 8   Condition1     1460 non-null   object
 9   Condition2     1460 non-null   object
 10  BldgType       1460 non-null   object
 11  HouseStyle     1460 non-null   object
 12  RoofStyle      1460 non-null   object
 13  RoofMatl       1460 non-null   object
 14  Exterior1st    1460 non-null   object
 15  Exterior2nd    1460 non-null   object
 16  MasVnrType     588 non-null    object
 17  ExterQual      1460 non-null   object
 18  ExterCond      1460 non-null   object
 19  Foundation     1460 non-null   object
 20  BsmtQual       1423 non-null   object
 21  BsmtCond       1423 non-null   object
 22  BsmtExposure   1422 non-null   object
 23  BsmtFinType1   1423 non-null   object
 24  BsmtFinType2   1422 non-null   object
 25  Heating        1460 non-null   object
 26  HeatingQC      1460 non-null   object
 27  CentralAir     1460 non-null   object
 28  Electrical     1460 non-null   object
 29  KitchenQual    1460 non-null   object
 30  Functional     1460 non-null   object
 31  FireplaceQu    770 non-null    object
 32  GarageType     1379 non-null   object
 33  GarageFinish   1379 non-null   object
 34  GarageQual     1379 non-null   object
 35  GarageCond     1379 non-null   object
 36  PavedDrive     1460 non-null   object
 37  SaleType       1460 non-null   object
 38  SaleCondition  1460 non-null   object
dtypes: object(39)
memory usage: 445.0+ KB

# Peek at the two categorical columns used in the one-hot encoding example.
df[["LotShape", "HouseStyle"]].head()

from sklearn.preprocessing import OneHotEncoder
# handle_unknown="ignore" is passed so unseen categories do not raise at
# transform time.
enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(df[["LotShape", "HouseStyle"]])

OneHotEncoder(handle_unknown='ignore')

OneHotEncoder(handle_unknown='ignore')

# Show the categories attribute of the fitted encoder.
enc.categories_

[array(['IR1', 'IR2', 'IR3', 'Reg'], dtype=object),
 array(['1.5Fin', '1.5Unf', '1Story', '2.5Fin', '2.5Unf', '2Story',
        'SFoyer', 'SLvl'], dtype=object)]

# Encode both columns, then densify the result with .toarray().
encoded_features = enc.transform(df[["LotShape", "HouseStyle"]]).toarray()
encoded_features

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Names of the generated one-hot columns, as reported by the fitted encoder.
encoded_features_names = enc.get_feature_names_out(input_features=None)
encoded_features_names

array(['LotShape_IR1', 'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg',
       'HouseStyle_1.5Fin', 'HouseStyle_1.5Unf', 'HouseStyle_1Story',
       'HouseStyle_2.5Fin', 'HouseStyle_2.5Unf', 'HouseStyle_2Story',
       'HouseStyle_SFoyer', 'HouseStyle_SLvl'], dtype=object)

# Wrap the encoded array in a labelled frame and show its first rows.
pd.DataFrame(data=encoded_features, columns=encoded_features_names).head()

import numpy as np
import seaborn as sns
from mlxtend.preprocessing import minmax_scaling
import matplotlib.pyplot as plt

# draw 1000 samples from an exponential distribution
original_data = np.random.exponential(size=1000)

# min-max scale the samples between 0 and 1
scaled_data = minmax_scaling(original_data, columns=[0])

# show the raw and the scaled histograms side by side
fig, ax = plt.subplots(nrows=1, ncols=2)
sns.histplot(original_data, ax=ax[0])
sns.histplot(scaled_data, ax=ax[1])
ax[0].set_title("Original Data")
ax[1].set_title("Scaled data");

from scipy import stats

# apply a Box-Cox transform to the exponential samples; the result is
# indexed with [0] below to obtain the values that get plotted
normalized_data = stats.boxcox(original_data)

# show the raw and the transformed histograms side by side
fig, ax = plt.subplots(nrows=1, ncols=2)
sns.histplot(original_data, ax=ax[0])
sns.histplot(normalized_data[0], ax=ax[1])
ax[0].set_title("Original Data")
ax[1].set_title("Normalized data");

import pandas as pd
import numpy as np
# Discretisation demo: split six values into three labelled bins.
pd.cut(
    np.array([1, 7, 5, 4, 6, 3]),
    bins=3,
    labels=["bad", "medium", "good"],
)

['bad', 'good', 'medium', 'medium', 'good', 'bad']
Categories (3, object): ['bad' < 'medium' < 'good']

# Split SalePrice at its mean into two labelled bins. The outer edges are
# widened by 1 on each side of the observed minimum and maximum.
df["SalePriceBinary"] = pd.cut(
    df["SalePrice"],
    bins=[
        df["SalePrice"].min() - 1,
        df["SalePrice"].mean(),
        df["SalePrice"].max() + 1,
    ],
    labels=["cheap", "expensive"],
)

df.loc[:3, ["SalePrice", "SalePriceBinary"]]

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Load the iris data as a (features, target) pair.
X, y = load_iris(return_X_y=True, as_frame=False)
X.shape

(150, 4)

# Select 2 features with SelectKBest, scored by chi2.
X_new = SelectKBest(score_func=chi2, k=2).fit(X, y).transform(X)
X_new.shape

(150, 2)

	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	LotConfig	...	PoolQC	Fence	MiscFeature	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	Inside	...	NaN	NaN	NaN	2	2008	WD	Normal	208500
1	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	FR2	...	NaN	NaN	NaN	5	2007	WD	Normal	181500
2	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	Inside	...	NaN	NaN	NaN	9	2008	WD	Normal	223500
3	70	RL	60.0	9550	Pave	NaN	IR1	Lvl	AllPub	Corner	...	NaN	NaN	NaN	2	2006	WD	Abnorml	140000
4	60	RL	84.0	14260	Pave	NaN	IR1	Lvl	AllPub	FR2	...	NaN	NaN	NaN	12	2008	WD	Normal	250000

	MSSubClass	MSZoning	LotFrontage	LotArea	Street	LotShape	LandContour	Utilities	LotConfig	LandSlope	...	EnclosedPorch	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	60	RL	65.0	8450	Pave	Reg	Lvl	AllPub	Inside	Gtl	...	0	2	2008	WD	Normal	208500
1	20	RL	80.0	9600	Pave	Reg	Lvl	AllPub	FR2	Gtl	...	0	5	2007	WD	Normal	181500
2	60	RL	68.0	11250	Pave	IR1	Lvl	AllPub	Inside	Gtl	...	0	9	2008	WD	Normal	223500
3	70	RL	60.0	9550	Pave	IR1	Lvl	AllPub	Corner	Gtl	...	272	2	2006	WD	Abnorml	140000
4	60	RL	84.0	14260	Pave	IR1	Lvl	AllPub	FR2	Gtl	...	0	12	2008	WD	Normal	250000

	LotShape_IR1	LotShape_Reg	HouseStyle_1Story	HouseStyle_2Story
0	0.0	1.0	0.0	1.0
1	0.0	1.0	1.0	0.0
2	1.0	0.0	0.0	1.0
3	1.0	0.0	0.0	1.0
4	1.0	0.0	0.0	1.0

	LotShape_IR1	LotShape_Reg	HouseStyle_1Story	HouseStyle_2Story
0	0.0	1.0	0.0	1.0
1	0.0	1.0	1.0	0.0
2	1.0	0.0	0.0	1.0
3	1.0	0.0	0.0	1.0
4	1.0	0.0	0.0	1.0

	SalePrice	SalePriceBinary
0	208500	expensive
1	181500	expensive
2	223500	expensive
3	140000	cheap

	LotShape_IR1	LotShape_Reg	HouseStyle_1Story	HouseStyle_2Story
0	0.0	1.0	0.0	1.0
1	0.0	1.0	1.0	0.0
2	1.0	0.0	0.0	1.0
3	1.0	0.0	0.0	1.0
4	1.0	0.0	0.0	1.0

Leçon : Préparation de données pour des traitements statistiques¶

Rappel des types d'erreurs courantes¶

Différentes étapes de préparation des données¶

Exemple avec un dataset connu¶

Nettoyage minimal des données

Gestion des doublons¶

Gestion des valeurs manquantes et les outliers¶

Ne rien faire (et travailler avec un gruyère):¶

Supprimer certaines variables:¶

Supprimer des observations:¶

Imputer les valeurs manquantes (à utiliser avec précaution !) :¶

Pour des variables numériques :¶

Imputation univariée¶

Imputation multivariée¶

Imputation univariée des variables catégorielles :¶

Exemple : Imputation de la variable Electrical par la valeur la plus fréquente¶

Imputation des variables numériques restantes¶

Effet de l'imputation¶

Dangers de l'imputation¶

Les outliers¶

Identification et traitement des outliers¶

Les identifier et les supprimer manuellement (par exploration des données)¶

Avec le critère de Tukey¶

Avec des méthodes automatiques¶

Conserver ces valeurs ?¶

Etapes de traitements supplémentaires

Encodage des variables catégorielles¶

Encodage des labels (target)¶

Encodage des features¶

OHE et la dimensionnalité¶

Alternatives à OHE¶

Scaling des données numériques¶

La standardisation¶

La normalisation¶

Le scaling robuste aux outliers¶

Autres méthodes de scaling¶

Equilibrage des données (balancing)¶

Méthodes de ré-échantillonnage aléatoire¶

Méthodes de ré-échantillonnage avancées¶

Illustration de l'application de ces méthodes de ré-échantillonnage avec SMOTE¶

Discrétisation des données¶

Améliorer l'information contenue dans le data set

Création de features¶

Sélection de features¶

Avantages¶

Sélection de features univariées¶

Corrélation de Pearson¶

Par seuil: suppression des features moins bien classés¶

Sélection de features multi-variées¶

Méthodes Intrinsèques¶

Les méthodes de type wrapper¶

Méthodes Séquentielles¶

La réduction de dimensionnalité¶

Attention au data leakage !¶

Exemple : Imputation de la variable `Electrical` par la valeur la plus fréquente¶

Illustration de l'application de ces méthodes de ré-échantillonnage avec SMOTE ¶

Sélection de features univariées ¶

Méthodes Intrinsèques ¶

Méthodes Séquentielles ¶

	LotShape_IR1	LotShape_Reg	HouseStyle_1Story	HouseStyle_2Story
0	0.0	1.0	0.0	1.0
1	0.0	1.0	1.0	0.0
2	1.0	0.0	0.0	1.0
3	1.0	0.0	0.0	1.0
4	1.0	0.0	0.0	1.0