---
title: Introduction to AI
tags: ESIEE, I4
author: Rémi Maubanc
---
# Exam content
- Kaggle exercises
- Machine Learning
- Deep Learning
- Explain the types of algorithms
- No programming exercises
- What is a CNN?
  - Definition, definition of a convolution, use cases
  - Convolutional Neural Network
  - The inputs are filtered with a convolution (see the first sketch after this list)
- What is an RNN?
  - Recurrent Neural Network
  - Loops back on itself, which gives the network a "memory" (see the second sketch after this list)
- What is a convolution?
  - A small filter (kernel) slid over the input; each output value is the weighted sum of the input values under the filter
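A minimal sketch, in NumPy, of the weighted-sum operation a convolution performs (the 4×4 image and 2×2 kernel are hypothetical, purely for illustration; like most CNN libraries, this computes the unflipped, cross-correlation form):
```python=
import numpy as np

def convolve2d(image, kernel):
    # Slide the kernel over the image; each output value is the
    # weighted sum of the input values under the kernel.
    kh, kw = kernel.shape
    out_h = image.shape[0] - kh + 1
    out_w = image.shape[1] - kw + 1
    out = np.zeros((out_h, out_w))
    for i in range(out_h):
        for j in range(out_w):
            out[i, j] = np.sum(image[i:i+kh, j:j+kw] * kernel)
    return out

image = np.arange(16, dtype=float).reshape(4, 4)  # hypothetical input
kernel = np.array([[1., 0.], [0., -1.]])          # hypothetical filter
print(convolve2d(image, kernel))                  # 3x3 feature map
```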
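And a minimal sketch of the loop that gives an RNN its "memory" (sizes and random weights are assumptions, not trained values):
```python=
import numpy as np

rng = np.random.default_rng(0)
W_x = rng.normal(size=(3, 4))  # input -> hidden weights (hypothetical sizes)
W_h = rng.normal(size=(4, 4))  # hidden -> hidden weights: the recurrent loop
h = np.zeros(4)                # hidden state, i.e. the "memory"

sequence = rng.normal(size=(5, 3))  # 5 time steps of 3 features each
for x_t in sequence:
    # Each step combines the new input with the previous hidden state
    h = np.tanh(x_t @ W_x + h @ W_h)
print(h)  # the final state summarizes the whole sequence
```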
# Kaggle exercises
## Basic Data exploration
Load data from a CSV file and display useful information with pandas.
```python=
import pandas as pd
# Load the data
data = pd.read_csv(path_to_file)
# Display a statistical summary
data.describe()
# Get the rounded mean of the "test" column
round(data["test"].mean())
# Get the maximum
data["test"].max()
# List the columns
data.columns
```
## First Machine Learning Model
```python=
# Select a list of columns
list_colonnes = ["Votai", "Test"]
X = data[list_colonnes]
# Display statistics for X
X.describe()
# Display the first rows
X.head()
# Target to predict (this Iowa housing exercise predicts SalePrice)
y = data["SalePrice"]
# Create a decision tree regressor
from sklearn.tree import DecisionTreeRegressor
iowa_model = DecisionTreeRegressor(random_state=1)
iowa_model.fit(X, y)
# Make predictions (with presentation)
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(iowa_model.predict(X.head()))
# Make predictions (without presentation)
predictions = iowa_model.predict(X)
print(predictions)
```
## Model validation
```python=
# Split the data into training and validation sets
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
# Fit the tree on the training split
iowa_model = DecisionTreeRegressor(random_state=1)
iowa_model.fit(train_X, train_y)
# Predict with all validation observations
val_predictions = iowa_model.predict(val_X)
# Compute the mean absolute error on the validation data
from sklearn.metrics import mean_absolute_error
val_mae = mean_absolute_error(val_y, val_predictions)
print(val_mae)
```
## Underfitting and Overfitting
```python=
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return mae
# Pick the best tree size
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
best_mae = float('inf')
best_tree_size = 0
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" % (max_leaf_nodes, my_mae))
    if my_mae < best_mae:
        best_mae = my_mae
        best_tree_size = max_leaf_nodes
```
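With the best size in hand, the follow-up step in the Kaggle exercise is to refit a final tree on all of the data (train + validation); a short sketch:
```python=
# Refit on all the data using the best max_leaf_nodes found above
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)
final_model.fit(X, y)
```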
## Random Forest
```python=
from sklearn.ensemble import RandomForestRegressor
# Define the model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=1)
# fit your model
rf_model.fit(train_X, train_y)
# Calculate the mean absolute error of your Random Forest model on the validation data
melb_preds = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(val_y, melb_preds)
print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))
```
## Missing values
```python=
# Get names of columns with missing values
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]
# Drop columns in training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)
print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))
from sklearn.impute import SimpleImputer
# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))
# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))
# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()
# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()
# Imputation
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))
# Imputation removed column names; put them back
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns
print("MAE from Approach 3 (An Extension to Imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))
# Shape of training data (num_rows, num_columns)
print(X_train.shape)
# Number of missing values in each column of training data
missing_val_count_by_column = (X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])
```
## Categorical Variables
```python=
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the data
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test = pd.read_csv('../input/test.csv', index_col='Id')
# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)
# To keep things simple, we'll drop columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)
X_train.head()
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)
# Fill in the lines below: drop columns in training and validation data
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])
# Check your answers
step_1.check()
print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))
print("Unique values in 'Condition2' column in training data:", X_train['Condition2'].unique())
print("\nUnique values in 'Condition2' column in validation data:", X_valid['Condition2'].unique())
# Check your answer (Run this code cell to receive credit!)
step_2.a.check()
# Categorical columns in the training data
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
# Columns that can be safely ordinal encoded
good_label_cols = [col for col in object_cols if
                   set(X_valid[col]).issubset(set(X_train[col]))]
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))
print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)
from sklearn.preprocessing import OrdinalEncoder
# Drop categorical columns that will not be encoded
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)
# Apply ordinal encoder
ordinal_encoder = OrdinalEncoder()
label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])
# Check your answer
step_2.b.check()
print("MAE from Approach 2 (Ordinal Encoding):")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))
# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))
# Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])
# Fill in the line below: How many categorical variables in the training data
# have cardinality greater than 10?
high_cardinality_numcols = 3
# Fill in the line below: How many columns are needed to one-hot encode the
# 'Neighborhood' variable in the training data?
num_cols_neighborhood = 25
# Check your answers
step_3.a.check()
# Fill in the line below: How many entries are added to the dataset by
# replacing the column with a one-hot encoding?
OH_entries_added = 1e4*100 - 1e4
# Fill in the line below: How many entries are added to the dataset by
# replacing the column with an ordinal encoding?
label_entries_added = 0
# Check your answers
step_3.b.check()
# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
# Columns that will be dropped from the dataset
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))
print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)
from sklearn.preprocessing import OneHotEncoder
# Apply one-hot encoder to each column with categorical data
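# NB: scikit-learn >= 1.2 renames this argument to sparse_output=False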
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))
# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
# Check your answer
step_4.check()
```
## Pipelines
```python=
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# numerical_cols / categorical_cols come from the exercise setup
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')
# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
# Define model
model = RandomForestRegressor(n_estimators=100, random_state=0)
# Check your answer
step_1.a.check()
# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                              ])
# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)
# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)
# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)
# Check your answer
step_1.b.check()
# Preprocessing of test data, get predictions
preds_test = my_pipeline.predict(X_test)
# Check your answer
step_2.check()
```
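To turn preds_test into a competition submission, the usual Kaggle pattern is the following (the Id/SalePrice column names are assumed from the housing competition used above):
```python=
# Save test predictions to file in the expected submission format
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)
```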
## Cross-Validation
```python=
from sklearn.model_selection import cross_val_score

def get_score(n_estimators):
    """Return the average MAE over 3 cross-validation folds."""
    my_pipeline = Pipeline(steps=[
        ('preprocessor', SimpleImputer()),
        ('model', RandomForestRegressor(n_estimators, random_state=0))
    ])
    scores = -1 * cross_val_score(my_pipeline, X, y,
                                  cv=3,
                                  scoring='neg_mean_absolute_error')
    return scores.mean()
# Check your answer
step_1.check()
results = {}
for i in range(1, 9):
    results[50*i] = get_score(50*i)
# Check your answer
step_2.check()
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(list(results.keys()), list(results.values()))
plt.show()
n_estimators_best = min(results, key=results.get)
# Check your answer
step_3.check()
```
## XGBoost
```python=
from xgboost import XGBRegressor
# Define the model
my_model_1 = XGBRegressor(random_state=0)
# Fit the model
my_model_1.fit(X_train, y_train)
# Check your answer
step_1.a.check()
from sklearn.metrics import mean_absolute_error
# Get predictions
predictions_1 = my_model_1.predict(X_valid)
# Check your answer
step_1.b.check()
# Calculate MAE
mae_1 = mean_absolute_error(predictions_1, y_valid)
print("Mean Absolute Error:" , mae_1)
# Check your answer
step_1.c.check()
# Define the model
my_model_2 = XGBRegressor(n_estimators=1000, learning_rate=0.05)
# Fit the model
my_model_2.fit(X_train, y_train)
# Get predictions
predictions_2 = my_model_2.predict(X_valid)
# Calculate MAE
mae_2 = mean_absolute_error(predictions_2, y_valid)
print("Mean Absolute Error:" , mae_2)
step_2.check()
# Define the model
my_model_3 = XGBRegressor(n_estimators=1)
# Fit the model
my_model_3.fit(X_train, y_train)
# Get predictions
predictions_3 = my_model_3.predict(X_valid)
# Calculate MAE
mae_3 = mean_absolute_error(predictions_3, y_valid)
print("Mean Absolute Error:" , mae_3)
# Check your answer
step_3.check()
```
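A natural companion to tuning n_estimators by hand is early stopping; a sketch (an assumption about your setup: older XGBoost versions take early_stopping_rounds in fit(), recent ones take it in the constructor):
```python=
# Stop adding trees once the validation score stops improving
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_model.fit(X_train, y_train,
             early_stopping_rounds=5,
             eval_set=[(X_valid, y_valid)],
             verbose=False)
```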
## A Single Neuron
```python=
input_shape = [11]
# Check your answer
q_1.check()
from tensorflow import keras
from tensorflow.keras import layers
# YOUR CODE HERE
model = keras.Sequential([
    layers.Dense(units=1, input_shape=[11])
])
# Check your answer
q_2.check()
# YOUR CODE HERE
w, b = model.weights
# Check your answer
q_3.check()
```
## Deep Neural Networks
```python=
# YOUR CODE HERE
input_shape = [8]
# Check your answer
q_1.check()
from tensorflow import keras
from tensorflow.keras import layers
# YOUR CODE HERE
model = keras.Sequential([
    layers.Dense(512, activation='relu', input_shape=input_shape),
    layers.Dense(512, activation='relu'),
    layers.Dense(512, activation='relu'),
    layers.Dense(1),
])
# Check your answer
q_2.check()
model = keras.Sequential([
    layers.Dense(32, input_shape=[8]),
    layers.Activation('relu'),
    layers.Dense(32),
    layers.Activation('relu'),
    layers.Dense(1),
])
# Check your answer
q_3.check()
```
## Stochastic Gradient Descent
```python=
model.compile(
    optimizer='adam',
    loss='mae'
)
# Check your answer
q_1.check()
history = model.fit(
    X, y,
    batch_size=128,
    epochs=200
)
# Check your answer
q_2.check()
import pandas as pd
history_df = pd.DataFrame(history.history)
# Start the plot at epoch 5. You can change this to get a different view.
history_df.loc[5:, ['loss']].plot();
```
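To also watch for overfitting, a validation split can be passed to fit and plotted alongside the training loss; a minimal sketch assuming X_valid/y_valid hold held-out data (they are not defined in the exercise as written):
```python=
# X_valid / y_valid are hypothetical here: a held-out validation split
history = model.fit(
    X, y,
    validation_data=(X_valid, y_valid),
    batch_size=128,
    epochs=200
)
history_df = pd.DataFrame(history.history)
# Plot training and validation loss together
history_df.loc[5:, ['loss', 'val_loss']].plot();
```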