---
title: Introduction to AI
tags: ESIEE, I4
author: Rémi Maubanc
---

# Exam content

- Kaggle exercises
- Machine Learning
- Deep Learning
- Be able to explain the types of algorithms
- No programming exercises
- What is a CNN?
    - Definition, definition of a convolution, use cases
    - Convolutional Neural Network
    - The inputs are filtered with a convolution
- What is an RNN?
    - Recurrent Neural Network
    - Neurons loop back on themselves, which gives the network a "memory"
- What is a convolution?
    - A small filter (kernel) slides over the input and computes a weighted sum at each position

# Kaggle exercises

## Basic Data exploration

Load data from a CSV file and display useful information with pandas.

```python=
import pandas as pd

# Load the data
data = pd.read_csv(path_to_file)

# Display a summary
data.describe()

# Rounded mean of the "test" column
round(data["test"].mean())

# Maximum of the column
data["test"].max()

# List of the columns
data.columns
```

## First Machine Learning Model

```python=
# Select a list of columns
list_colonnes = ["Votai", "Test"]
X = data[list_colonnes]

# Target to predict (not defined in the original snippet; SalePrice in the Iowa exercise)
y = data["SalePrice"]

# Display statistics for X
X.describe()

# Display the first rows
X.head()

# Create a decision tree regressor
from sklearn.tree import DecisionTreeRegressor
iowa_model = DecisionTreeRegressor(random_state=1)
iowa_model.fit(X, y)

# Make predictions (with presentation)
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(iowa_model.predict(X.head()))

# Make predictions (without presentation)
predictions = iowa_model.predict(X)
print(predictions)
```

## Model validation

```python=
# Split the data into training and validation sets
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Build the tree on the training split
iowa_model = DecisionTreeRegressor(random_state=1)
iowa_model.fit(train_X, train_y)

# Predict with all validation observations
val_predictions = iowa_model.predict(val_X)

# Compute the mean absolute error on the validation data
from sklearn.metrics import mean_absolute_error
val_mae = mean_absolute_error(val_y, val_predictions)
print(val_mae)
```

## Underfitting and Overfitting

```python=
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return mae

# Pick the best tree size
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
best_mae = float('inf')
best_tree_size = 0
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" % (max_leaf_nodes, my_mae))
    if my_mae < best_mae:
        best_mae = my_mae
        best_tree_size = max_leaf_nodes
```
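In the Kaggle exercise, the selected size is then used to refit a final model on all of the data, since once the hyperparameter is chosen there is no more need for a held-out set. A minimal sketch reusing the names above (`best_tree_size`, `X`, `y`):

```python=
# Refit on the full dataset with the best tree size found above (sketch)
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)
final_model.fit(X, y)
```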
## Random Forest

```python=
from sklearn.ensemble import RandomForestRegressor

# Define the model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=1)

# Fit your model
rf_model.fit(train_X, train_y)

# Calculate the mean absolute error of your Random Forest model on the validation data
melb_preds = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(val_y, melb_preds)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))
```

## Missing values

```python=
# Get names of columns with missing values
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]

# Drop columns in training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)

print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# Imputation
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

# Imputation removed column names; put them back
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns

print("MAE from Approach 3 (An Extension to Imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))

# Shape of training data (num_rows, num_columns)
print(X_train.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = X_train.isnull().sum()
print(missing_val_count_by_column[missing_val_count_by_column > 0])
```
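These snippets assume that `X_train`, `X_valid`, `y_train`, `y_valid` and the `score_dataset` helper (written out in the Categorical Variables section below) already exist. A possible setup sketch; the Melbourne file path and `Price` target are assumptions suggested by the `melb_preds` name, not given in the notes:

```python=
import pandas as pd
from sklearn.model_selection import train_test_split

# Assumed setup: load the data and keep only numeric predictors
data = pd.read_csv('melb_data.csv')  # hypothetical path
y = data.Price                       # hypothetical target column
X = data.drop(['Price'], axis=1).select_dtypes(exclude=['object'])

# Split into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)
```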
## Categorical Variables

```python=
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test = pd.read_csv('../input/test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)

# To keep things simple, we'll drop columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8,
                                                      test_size=0.2, random_state=0)

X_train.head()

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

# Fill in the lines below: drop columns in training and validation data
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

# Check your answers
step_1.check()

print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

print("Unique values in 'Condition2' column in training data:", X_train['Condition2'].unique())
print("\nUnique values in 'Condition2' column in validation data:", X_valid['Condition2'].unique())

# Check your answer (Run this code cell to receive credit!)
step_2.a.check()

# Categorical columns in the training data
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely ordinal encoded
good_label_cols = [col for col in object_cols if
                   set(X_valid[col]).issubset(set(X_train[col]))]

# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols) - set(good_label_cols))

print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

from sklearn.preprocessing import OrdinalEncoder

# Drop categorical columns that will not be encoded
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

# Apply ordinal encoder
ordinal_encoder = OrdinalEncoder()
label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])

# Check your answer
step_2.b.check()

print("MAE from Approach 2 (Ordinal Encoding):")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))

# Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])

# Fill in the line below: How many categorical variables in the training data
# have cardinality greater than 10?
high_cardinality_numcols = 3

# Fill in the line below: How many columns are needed to one-hot encode the
# 'Neighborhood' variable in the training data?
num_cols_neighborhood = 25

# Check your answers
step_3.a.check()

# Fill in the line below: How many entries are added to the dataset by
# replacing the column with a one-hot encoding?
OH_entries_added = 1e4*100 - 1e4

# Fill in the line below: How many entries are added to the dataset by
# replacing the column with an ordinal encoding?
label_entries_added = 0

# Check your answers
step_3.b.check()

# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

# Columns that will be dropped from the dataset
high_cardinality_cols = list(set(object_cols) - set(low_cardinality_cols))

print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)

from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# Check your answer
step_4.check()
```
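The `OH_entries_added` arithmetic above comes from a column with 10,000 rows and 100 categories: one-hot encoding replaces the 10,000 original entries with 10,000 × 100 = 1,000,000 indicator entries, adding 990,000, while ordinal encoding keeps a single column and adds none. A toy demonstration (the column values are made up):

```python=
import pandas as pd

# Hypothetical column: 10,000 rows, exactly 100 distinct categories
s = pd.Series([str(i % 100) for i in range(10_000)])

# One-hot: 100 indicator columns -> 10,000 * 100 = 1,000,000 entries
print(pd.get_dummies(s).shape)               # (10000, 100)

# Ordinal: still a single column -> 10,000 entries, nothing added
print(s.astype('category').cat.codes.shape)  # (10000,)
```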
## Pipelines

```python=
# Imports added here; the Kaggle notebook defines these in its setup,
# along with numerical_cols and categorical_cols
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define model
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Check your answer
step_1.a.check()

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

# Check your answer
step_1.b.check()

# Preprocessing of test data, get predictions
preds_test = my_pipeline.predict(X_test)

# Check your answer
step_2.check()
```

## Cross-Validation

```python=
# Imports added here; X and y come from the notebook setup
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score

def get_score(n_estimators):
    my_pipeline = Pipeline(steps=[
        ('preprocessor', SimpleImputer()),
        ('model', RandomForestRegressor(n_estimators, random_state=0))
    ])
    scores = -1 * cross_val_score(my_pipeline, X, y,
                                  cv=3,
                                  scoring='neg_mean_absolute_error')
    return scores.mean()

# Check your answer
step_1.check()

results = {}
for i in range(1, 9):
    results[50*i] = get_score(50*i)

# Check your answer
step_2.check()

import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(list(results.keys()), list(results.values()))
plt.show()

n_estimators_best = min(results, key=results.get)

# Check your answer
step_3.check()
```

## XGBoost

```python=
from xgboost import XGBRegressor

# Define the model
my_model_1 = XGBRegressor(random_state=0)

# Fit the model
my_model_1.fit(X_train, y_train)

# Check your answer
step_1.a.check()

from sklearn.metrics import mean_absolute_error

# Get predictions
predictions_1 = my_model_1.predict(X_valid)

# Check your answer
step_1.b.check()

# Calculate MAE
mae_1 = mean_absolute_error(predictions_1, y_valid)
print("Mean Absolute Error:", mae_1)

# Check your answer
step_1.c.check()

# Define the model
my_model_2 = XGBRegressor(n_estimators=1000, learning_rate=0.05)

# Fit the model
my_model_2.fit(X_train, y_train)

# Get predictions
predictions_2 = my_model_2.predict(X_valid)

# Calculate MAE
mae_2 = mean_absolute_error(predictions_2, y_valid)
print("Mean Absolute Error:", mae_2)

step_2.check()

# Define the model
my_model_3 = XGBRegressor(n_estimators=1)

# Fit the model
my_model_3.fit(X_train, y_train)

# Get predictions
predictions_3 = my_model_3.predict(X_valid)

# Calculate MAE
mae_3 = mean_absolute_error(predictions_3, y_valid)
print("Mean Absolute Error:", mae_3)

# Check your answer
step_3.check()
```
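The corresponding Kaggle lesson also pairs a large `n_estimators` with early stopping, so boosting halts once the validation score stops improving. A sketch assuming the same `X_train`/`X_valid` split; note that recent xgboost versions expect `early_stopping_rounds` in the constructor rather than in `fit()`:

```python=
from xgboost import XGBRegressor

# Stop adding trees once validation MAE has not improved for 5 rounds
# (keyword placement as in the xgboost versions used by the Kaggle course)
my_model = XGBRegressor(n_estimators=500, learning_rate=0.05)
my_model.fit(X_train, y_train,
             early_stopping_rounds=5,
             eval_set=[(X_valid, y_valid)],
             verbose=False)
```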
print("Mean Absolute Error:" , mae_1) # Check your answer step_1.c.check() # Define the model my_model_2 = XGBRegressor(n_estimators=1000, learning_rate=0.05) # Fit the model my_model_2.fit(X_train, y_train) # Get predictions predictions_2 = my_model_2.predict(X_valid) # Calculate MAE mae_2 = mean_absolute_error(predictions_2, y_valid) print("Mean Absolute Error:" , mae_2) step_2.check() # Define the model my_model_3 = XGBRegressor(n_estimators=1) # Fit the model my_model_3.fit(X_train, y_train) # Get predictions predictions_3 = my_model_3.predict(X_valid) # Calculate MAE mae_3 = mean_absolute_error(predictions_3, y_valid) print("Mean Absolute Error:" , mae_3) # Check your answer step_3.check() ``` ## A Single Neuron ```python= input_shape = [11] # Check your answer q_1.check() from tensorflow import keras from tensorflow.keras import layers # YOUR CODE HERE model = keras.Sequential([ layers.Dense(units=1, input_shape=[11]) ]) # Check your answer q_2.check() # YOUR CODE HERE w, b = model.weights # Check your answer q_3.check() ``` ## Deep Neural Networks ```python= # YOUR CODE HERE input_shape = [8] # Check your answer q_1.check() from tensorflow import keras from tensorflow.keras import layers # YOUR CODE HERE model = keras.Sequential([ layers.Dense(512, activation='relu', input_shape=input_shape), layers.Dense(512, activation='relu'), layers.Dense(512, activation='relu'), layers.Dense(1), ]) # Check your answer q_2.check() model = keras.Sequential([ layers.Dense(32, input_shape=[8]), layers.Activation('relu'), layers.Dense(32), layers.Activation('relu'), layers.Dense(1), ]) # Check your answer q_3.check() ``` ## Stochastic Gradient Descent ```python= model.compile( optimizer='adam', loss='mae' ) # Check your answer q_1.check() history = history = model.fit( X, y, batch_size=128, epochs=200 ) # Check your answer q_2.check() import pandas as pd history_df = pd.DataFrame(history.history) # Start the plot at epoch 5. You can change this to get a different view. history_df.loc[5:, ['loss']].plot(); ```