# Car Sales Price Prediction
## Libraries
```
!pip install dmba
```
```
Requirement already satisfied: dmba in /usr/local/lib/python3.7/dist-packages (0.1.0)
```
```
import math
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
import matplotlib.pylab as plt
from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score
import statsmodels.api as sm
import keras
from keras import metrics
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Conv2D, MaxPooling2D
#from keras.optimizers import RMSprop
from tensorflow.keras.optimizers import Adam, RMSprop
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
#from keras.utils import plot_model
from keras.models import load_model
from tensorflow.keras.utils import to_categorical
import matplotlib.gridspec as gridspec
import seaborn as sns
from mpl_toolkits import mplot3d
from scipy import stats
%matplotlib inline
```
## Preprocessing
```
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
import pandas_profiling as pp
```
## Models
```
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor, RidgeCV
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
import sklearn.model_selection
from sklearn.model_selection import cross_val_predict as cvp
from sklearn import metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.stats import pearsonr
# import xgboost as xgb
# import lightgbm as lgb
```
## Model Tuning
```
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
import warnings
warnings.filterwarnings("ignore")
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBRegressor
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
#from imblearn.datasets import fetch_datasets
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import RUSBoostClassifier
from xgboost import XGBClassifier
from imblearn.metrics import geometric_mean_score
from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
# Machine learning libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Data fetching
from pandas_datareader import data as pdr
#import yfinance as yf
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
```
## Load Google Drive
```
from google.colab import drive
drive.mount('/content/drive')
```
```
Mounted at /content/drive
```
## Load data from Google Drive
```
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Projects/Toyota/DataToyotaCorolla.csv', encoding='cp1252')
df_valid = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Projects/Toyota/TestingDataToyotaCorolla.csv', encoding='cp1252')
```
### 1/ Create dummy variables
Create dummy variables for the categorical predictors; the resulting training file is saved as TrainingDataToyotaCorolla.csv.
```
predictors_all = ['Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Automatic','Doors', 'Quarterly_Tax', 'Mfr_Guarantee',
'Guarantee_Period', 'Airco', 'Automatic_airco', 'CD_Player', 'Powered_Windows', 'Sport_Model', 'Tow_Bar']
outcome = 'Price'
# Partition data into predictors (x) and output (y)
X = pd.get_dummies(df[predictors_all], drop_first=True)
y = df[outcome]
y_df = pd.DataFrame(y)
y_df.head(9)
```
```
Price
0 13500
1 13750
2 13950
3 14950
4 13750
5 12950
6 16900
7 18600
8 21500
```
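Note that `drop_first=True` drops the first level of each categorical predictor (here `Fuel_Type_CNG`), leaving two fuel-type dummies and 16 columns in `X` overall; a quick way to see the effect:
```
# drop_first=True removes one redundant dummy level per categorical
print(pd.get_dummies(df[['Fuel_Type']]).columns.tolist())
print(pd.get_dummies(df[['Fuel_Type']], drop_first=True).columns.tolist())
print(X.shape)
```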
### 2/ Multiple Linear Regression
```
df_validation = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Projects/Toyota/TestingDataToyotaCorolla.csv', encoding='cp1252')
#df_validation = pd.read_csv('TestingDataToyotaCorolla.csv')
predictors = ['Age_08_04', 'Automatic_airco', 'HP','KM']
outcome = 'Price'
valid_X = df_validation[predictors]
valid_y = df_validation[outcome]
valid_X.head()
```
```
Age_08_04 Automatic_airco HP KM
0 23 0 90 46986
1 23 0 90 72937
```
#### 2.1/ Predict prices based on all features
Here I repeat the partitioning steps to verify that the train and test splits built from the CSV files imported from Google Drive are correct inputs both for the full-feature model and for the Linear Regression model restricted to the top 4 features identified in the previous part.
```
predictors = ['Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Automatic','Doors', 'Quarterly_Tax', 'Mfr_Guarantee',
'Guarantee_Period', 'Airco', 'Automatic_airco', 'CD_Player', 'Powered_Windows', 'Sport_Model', 'Tow_Bar']
outcome = 'Price'
```
#### Partition data into predictors (x) and output (y)
```
X2 = pd.get_dummies(df[predictors])
y2 = df[outcome]
y_df2 = pd.DataFrame(y2)
X2.head()
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_CNG Fuel_Type_Diesel Fuel_Type_Petrol
0 23 46986 90 0 3 210 0 3 0 0 0 1 0 0 0 1 0
1 23 72937 90 0 3 210 0 3 1 0 1 0 0 0 0 1 0
2 24 41711 90 0 3 210 1 3 0 0 0 0 0 0 0 1 0
3 26 48000 90 0 3 210 1 3 0 0 0 0 0 0 0 1 0
4 30 38500 90 0 3 210 1 3 1 0 0 1 0 0 0 1 0
```
```
predictors4 = ['Age_08_04', 'KM', 'HP', 'Automatic_airco']  # top 4 features, selected earlier
from sklearn.model_selection import train_test_split
# Use the same seed for both splits so the two models see identical rows
X_train, X_test, y_train, y_test = train_test_split(X[predictors4], y, test_size=0.4, shuffle=True, random_state=1612)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.4, shuffle=True, random_state=1612)
X_train2.head()
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_CNG Fuel_Type_Diesel Fuel_Type_Petrol
468 52 66527 110 0 5 85 1 3 1 0 1 1 0 1 0 0 1
786 67 75429 110 0 3 85 1 3 1 0 0 1 1 0 0 0 1
383 55 150000 110 0 3 72 0 6 1 0 0 1 0 0 1 0 0
845 61 66000 110 0 3 69 0 3 1 0 0 0 1 0 0 0 1
415 55 97234 110 0 5 85 0 3 1 0 0 1 0 0 0 0 1
```
```
# Quick look at the two validation records
df_valid.value_counts()
```
```
Id Model Price Age_08_04 Mfg_Month Mfg_Year KM Fuel_Type HP Met_Color Color Automatic CC Doors Cylinders Gears Quarterly_Tax Weight Mfr_Guarantee BOVAG_Guarantee Guarantee_Period ABS Airbag_1 Airbag_2 Airco Automatic_airco Boardcomputer CD_Player Central_Lock Powered_Windows Power_Steering Radio Mistlamps Sport_Model Backseat_Divider Metallic_Rim Radio_cassette Parking_Assistant Tow_Bar
1 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13500 23 10 2002 46986 Diesel 90 1 Blue 0 2000 3 4 5 210 1165 0 1 3 1 1 1 0 0 1 0 1 1 1 0 0 0 1 0 0 0 0 1
2 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13750 23 10 2002 72937 Diesel 90 1 Black 0 2000 3 4 5 210 1165 0 1 3 1 1 1 1 0 1 1 1 0 1 0 0 0 1 0 0 0 0 1
dtype: int64
```
```
X_train2.head()
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_CNG Fuel_Type_Diesel Fuel_Type_Petrol
468 52 66527 110 0 5 85 1 3 1 0 1 1 0 1 0 0 1
786 67 75429 110 0 3 85 1 3 1 0 0 1 1 0 0 0 1
383 55 150000 110 0 3 72 0 6 1 0 0 1 0 0 1 0 0
845 61 66000 110 0 3 69 0 3 1 0 0 0 1 0 0 0 1
415 55 97234 110 0 5 85 0 3 1 0 0 1 0 0 0 0 1
```
```
X_valid = pd.get_dummies(df_valid[predictors4])
X_valid
```
```
Age_08_04 KM HP Automatic_airco
0 23 46986 90 0
1 23 72937 90 0
```
```
X_valid2 = pd.get_dummies(df_valid[predictors])
X_valid2
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_Diesel
0 23 46986 90 0 3 210 0 3 0 0 0 1 0 0 1
1 23 72937 90 0 3 210 0 3 1 0 1 0 0 0 1
```
```
# The 2-row validation set contains only Diesel cars, so get_dummies produced
# a single Fuel_Type_Diesel column; drop it and rebuild all three dummies below
X_valid2.drop('Fuel_Type_Diesel', inplace=True, axis=1)
X_valid2
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar
0 23 46986 90 0 3 210 0 3 0 0 0 1 0 0
1 23 72937 90 0 3 210 0 3 1 0 1 0 0 0
```
```
# Re-add the full set of fuel-type dummies (both validation cars are Diesel)
X_valid2 = X_valid2.assign(Fuel_Type_CNG = [0, 0])
X_valid2 = X_valid2.assign(Fuel_Type_Diesel = [1, 1])
X_valid2 = X_valid2.assign(Fuel_Type_Petrol = [0, 0])
# The top-4 validation frame is unaffected
X_valid
```
```
Age_08_04 KM HP Automatic_airco
0 23 46986 90 0
1 23 72937 90 0
```
```
X_valid2
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_CNG Fuel_Type_Diesel Fuel_Type_Petrol
0 23 46986 90 0 3 210 0 3 0 0 0 1 0 0 0 1 0
1 23 72937 90 0 3 210 0 3 1 0 1 0 0 0 0 1 0
```
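A more robust alternative to the manual drop/assign steps above is to align the validation dummies to the training columns in one `reindex` call; a sketch, assuming `X2` holds the dummy-encoded training design matrix:
```
# Align validation dummies to the training columns; missing dummies become 0
X_valid2_aligned = pd.get_dummies(df_valid[predictors]).reindex(columns=X2.columns, fill_value=0)
X_valid2_aligned
```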
```
y_valid = df_valid[outcome]
y_valid2 = y_valid  # identical actual prices for both feature sets
y_valid2
```
```
0 13500
1 13750
Name: Price, dtype: int64
```
```
X_train
```
```
Age_08_04 KM HP Automatic_airco
468 52 66527 110 0
786 67 75429 110 0
383 55 150000 110 0
845 61 66000 110 0
415 55 97234 110 0
... ... ... ... ...
1212 72 86860 110 0
112 8 13253 116 1
1257 73 76151 86 0
1348 79 61165 107 0
627 65 132807 72 0
861 rows × 4 columns
```
#### 2.2/ Predict prices based on top 4 features
The top 4 features were determined by a feature-selection pass over the Linear Regression model prior to this step; that code will be added later. A sketch of what such a selection could look like follows.
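As an illustrative placeholder (not the original selection run), forward selection with the dmba helpers imported earlier could be set up like this, assuming `X_train2`/`y_train2` as the search data:
```
# Sketch: forward selection scored by AIC, using the dmba helpers
def train_model(variables):
    if len(variables) == 0:
        return None
    model = LinearRegression()
    model.fit(X_train2[variables], y_train2)
    return model

def score_model(model, variables):
    if len(variables) == 0:
        # AIC of the intercept-only (mean) model
        return AIC_score(y_train2, [y_train2.mean()] * len(y_train2), model, df=1)
    return AIC_score(y_train2, model.predict(X_train2[variables]), model)

best_model, best_variables = forward_selection(X_train2.columns, train_model, score_model, verbose=True)
print(best_variables)
```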
```
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
X_train2
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_CNG Fuel_Type_Diesel Fuel_Type_Petrol
468 52 66527 110 0 5 85 1 3 1 0 1 1 0 1 0 0 1
786 67 75429 110 0 3 85 1 3 1 0 0 1 1 0 0 0 1
383 55 150000 110 0 3 72 0 6 1 0 0 1 0 0 1 0 0
845 61 66000 110 0 3 69 0 3 1 0 0 0 1 0 0 0 1
415 55 97234 110 0 5 85 0 3 1 0 0 1 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1212 72 86860 110 0 5 85 1 3 1 0 0 1 0 0 0 0 1
112 8 13253 116 0 5 234 0 3 1 1 1 1 0 0 0 1 0
1257 73 76151 86 0 3 69 0 3 0 0 0 0 1 0 0 0 1
1348 79 61165 107 1 3 85 1 3 1 0 0 0 1 0 0 0 1
627 65 132807 72 0 5 185 1 3 1 0 0 0 1 0 0 1 0
861 rows × 17 columns
```
```
linear_reg2 = LinearRegression()
linear_reg2.fit(X_train2, y_train2)
# Coefficients of the top-4 model
Coef_Matrix = pd.DataFrame({'Predictor': X_valid.columns, 'coefficient': linear_reg.coef_})
print(Coef_Matrix)
print("\nIntercept = ", linear_reg.intercept_)
```
```
Predictor coefficient
0 Age_08_04 -139.708718
1 KM -0.011057
2 HP 26.463421
3 Automatic_airco 3319.180854
Intercept = 16472.1068808052
```
```
Coef_Matrix2 = pd.DataFrame({'Predictor': X_valid2.columns, 'coefficient': linear_reg2.coef_})
print(Coef_Matrix2)
print("\nIntercept = ", linear_reg2.intercept_)
```
```
Predictor coefficient
0 Age_08_04 -112.414337
1 KM -0.016853
2 HP 28.239217
3 Automatic 445.757345
4 Doors 96.709006
5 Quarterly_Tax 17.557193
6 Mfr_Guarantee 142.638902
7 Guarantee_Period 96.338023
8 Airco 248.941678
9 Automatic_airco 3042.190782
10 CD_Player 271.683170
11 Powered_Windows 509.603538
12 Sport_Model 306.421134
13 Tow_Bar -175.214506
14 Fuel_Type_CNG -1087.601376
15 Fuel_Type_Diesel 304.458421
16 Fuel_Type_Petrol 783.142954
Intercept = 11563.374143569199
```
```
X_valid2
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_CNG Fuel_Type_Diesel Fuel_Type_Petrol
0 23 46986 90 0 3 210 0 3 0 0 0 1 0 0 0 1 0
1 23 72937 90 0 3 210 0 3 1 0 1 0 0 0 0 1 0
```
```
predicted_y = linear_reg.predict(X_valid)
result = pd.DataFrame({'Predicted': predicted_y, 'Actual': y_valid, 'Residual': y_valid - predicted_y})
print(result)
```
```
Predicted Actual Residual
0 15120.974141 13500 -1620.974141
1 14834.025128 13750 -1084.025128
```
```
predicted_y2 = linear_reg2.predict(X_valid2)
result2 = pd.DataFrame({'Predicted': predicted_y2, 'Actual': y_valid2, 'Residual': y_valid2 - predicted_y2})
print(result2)
```
```
Predicted Actual Residual
0 15807.754337 13500 -2307.754337
1 15381.435493 13750 -1631.435493
```
So, the predicted prices for the 2 Toyota Corolla cars in the testing file using the top 4 features are 15120.97 and 14834.03, with residuals of -1620.97 and -1084.03 respectively.
Meanwhile, the predicted prices using all features are 15807.75 and 15381.44, with residuals of -2307.75 and -1631.44 respectively.
To conclude, the predictions from the Linear Regression model using the top 4 features are closer to the actual prices: narrowing down to the key features identified earlier drives a more accurate price prediction.
## Manual testing for Predicted results
```
# Reconstruct each prediction by hand: intercept + sum(coefficient_i * x_i)
row0 = X_valid.iloc[0]
y_pred_1 = (linear_reg.intercept_
            + linear_reg.coef_[0] * row0['Age_08_04']
            + linear_reg.coef_[1] * row0['KM']
            + linear_reg.coef_[2] * row0['HP']
            + linear_reg.coef_[3] * row0['Automatic_airco'])
row1 = X_valid.iloc[1]
y_pred_2 = (linear_reg.intercept_
            + linear_reg.coef_[0] * row1['Age_08_04']
            + linear_reg.coef_[1] * row1['KM']
            + linear_reg.coef_[2] * row1['HP']
            + linear_reg.coef_[3] * row1['Automatic_airco'])
print(y_pred_1, y_pred_2)
```
The manually computed prices match the model's predictions, confirming the intercept and coefficients.
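As a final sanity check, the hand-computed values can be compared against `model.predict` directly:
```
# The manual predictions should agree with the model to floating-point precision
assert np.allclose([y_pred_1, y_pred_2], linear_reg.predict(X_valid))
```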
#### 2.3/ Regression Summary
```
# Validation errors for the full-feature model (uncomment the next line for the top-4 model)
# regressionSummary(y_valid, linear_reg.predict(X_valid[predictors4]))
regressionSummary(y_valid2, linear_reg2.predict(X_valid2))
```
```
Regression statistics
Mean Error (ME) : -1969.5949
Root Mean Squared Error (RMSE) : 1998.4134
Mean Absolute Error (MAE) : 1969.5949
Mean Percentage Error (MPE) : -14.4797
Mean Absolute Percentage Error (MAPE) : 14.4797
```
The statistics above are for the full-feature model; the Multiple Linear Regression using the 4 key features achieves lower values on all of these error metrics.
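These dmba numbers can be cross-checked against sklearn's own metrics; a quick sketch for the full-feature model:
```
# Cross-check regressionSummary with sklearn equivalents
from sklearn.metrics import mean_absolute_error, mean_squared_error
pred = linear_reg2.predict(X_valid2)
print('MAE :', mean_absolute_error(y_valid2, pred))
print('RMSE:', mean_squared_error(y_valid2, pred) ** 0.5)
```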
### 3/ Gradient Boosting Classifier
"Ensemble Learning: To obtain improved predictive efficiency than could be extracted from any of the constituent learning algorithms alone, ensemble approaches use multiple learning algorithms."
Resource: https://www.askpython.com/python/examples/gradient-boosting
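Since Price is a continuous target, a gradient-boosted regressor is arguably a more natural fit than the classifier used below; a minimal sketch, with illustrative (untuned) hyperparameters:
```
# Sketch: gradient boosting as a regressor on the same splits
from sklearn.ensemble import GradientBoostingRegressor
gb_reg = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=2, random_state=1612)
gb_reg.fit(X_train, y_train)
print(gb_reg.predict(X_valid))
```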
#### 3.1/ Create Gradient Boosting models
```
# Identical hyperparameters for the top-4 and full-feature classifiers
gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=1, max_features=2, max_depth=2)
gb_clf_all = GradientBoostingClassifier(n_estimators=20, learning_rate=1, max_features=2, max_depth=2)
```
#### 3.2/ Fit the models
```
gb_clf.fit(X_train[predictors4], y_train)
gb_clf_all.fit(X_train2, y_train2)
```
#### 3.3/ Accuracy Scores
Note that GradientBoostingClassifier treats every distinct price as a separate class, so classification accuracy is close to zero here and is not a meaningful metric for a continuous target.
```
print(gb_clf.score(X_train[predictors4], y_train))
```
```
0.027874564459930314
```
```
print(gb_clf_all.score(X_train2, y_train2))
```
```
0.019744483159117306
```
```
print(gb_clf_all.score(X_valid2, y_valid2))
```
```
0.0
```
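The near-zero scores follow from the sheer number of distinct price "classes" the classifier has to choose from; a quick check, assuming `y_train` from the split above:
```
# Chance-level accuracy is roughly 1 / number of distinct price classes
print(y_train.nunique(), "distinct prices ->", 1 / y_train.nunique())
```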
```
df_valid
```
```
Id Model Price Age_08_04 Mfg_Month Mfg_Year KM Fuel_Type HP Met_Color ... Powered_Windows Power_Steering Radio Mistlamps Sport_Model Backseat_Divider Metallic_Rim Radio_cassette Parking_Assistant Tow_Bar
0 1 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13500 23 10 2002 46986 Diesel 90 1 ... 1 1 0 0 0 1 0 0 0 0
1 2 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13750 23 10 2002 72937 Diesel 90 1 ... 0 1 0 0 0 1 0 0 0 0
2 rows × 39 columns
```
#### For the case of the top 4 features
```
df_valid[['Age_08_04', 'KM', 'HP', 'Automatic_airco']]
```
```
Age_08_04 KM HP Automatic_airco
0 23 46986 90 0
1 23 72937 90 0
```
#### 3.4/ Predicted & Residual Values
```
predicted_y_gb = gb_clf.predict(X_valid[predictors4])
X_valid2
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_CNG Fuel_Type_Diesel Fuel_Type_Petrol
0 23 46986 90 0 3 210 0 3 0 0 0 1 0 0 0 1 0
1 23 72937 90 0 3 210 0 3 1 0 1 0 0 0 0 1 0
```
```
predicted_y_gb2 = gb_clf_all.predict(X_valid2)
result = pd.DataFrame({'Predicted': predicted_y_gb, 'Actual': y_valid, 'Residual': y_valid - predicted_y_gb})
result
```
```
Predicted Actual Residual
0 12750 13500 750
1 12750 13750 1000
```
```
result2 = pd.DataFrame({'Predicted': predicted_y_gb2, 'Actual': y_valid2, 'Residual': y_valid2 - predicted_y_gb2})
result2
```
```
Predicted Actual Residual
0 14900 13500 -1400
1 9940 13750 3810
```
The predicted prices for the 2 cars using the top 4 features in the Gradient Boosting model are 12750 and 12750, with residual values of 750 and 1000 respectively.
However, the predicted prices using all features are 14900 and 9940, with residual values of -1400 and 3810 respectively.
To conclude, the residuals of the Gradient Boosting model using the top 4 features are smaller in magnitude than those of the model using all features.
#### 3.5/ Compare the Performance Metrics
```
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from math import sqrt
# sklearn metrics expect (y_true, y_pred)
print("Mean Absolute Error MAE: " + str(mean_absolute_error(y_valid, predicted_y_gb)))
print("Mean Squared Error MSE: " + str(mean_squared_error(y_valid, predicted_y_gb)))
print("Root Mean Squared Error RMSE: " + str(sqrt(mean_squared_error(y_valid, predicted_y_gb))))
print("Mean Absolute Percentage Error MAPE: " + str(mean_absolute_percentage_error(y_valid, predicted_y_gb)))
```
```
Mean Absolute Error MAE: 875.0
Mean Squared Error MSE: 781250.0
Root Mean Squared Error RMSE: 883.8834764831844
Mean Absolute Percentage Error MAPE: 0.06414141414141414
```
```
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from math import sqrt
print("Mean Absolute Error MAE: " + str(mean_absolute_error(y_valid2, predicted_y_gb2)))
print("Mean Squared Error MSE: " + str(mean_squared_error(y_valid2, predicted_y_gb2)))
print("Root Mean Squared Error RMSE: " + str(sqrt(mean_squared_error(y_valid2, predicted_y_gb2))))
print("Mean Absolute Percentage Error MAPE: " + str(mean_absolute_percentage_error(y_valid2, predicted_y_gb2)))
```
```
Mean Absolute Error MAE: 2605.0
Mean Squared Error MSE: 8238050.0
Root Mean Squared Error RMSE: 2870.200341439601
Mean Absolute Percentage Error MAPE: 0.1903973063973064
```
The MAE and RMSE of the Gradient Boosting model using all features are higher than those of both Multiple Linear Regression models.
Within Gradient Boosting, every metric for the all-features model is higher than for the model using the top 4 features.
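To see all four models side by side, the validation metrics can be collected into one table; a sketch reusing the fitted models and predictions from above:
```
# Side-by-side validation metrics for the four fitted models
from sklearn.metrics import mean_absolute_error, mean_squared_error
models = {
    'MLR (top 4)': linear_reg.predict(X_valid),
    'MLR (all features)': linear_reg2.predict(X_valid2),
    'GB (top 4)': gb_clf.predict(X_valid[predictors4]),
    'GB (all features)': gb_clf_all.predict(X_valid2),
}
summary = pd.DataFrame({
    name: {'MAE': mean_absolute_error(y_valid, pred),
           'RMSE': mean_squared_error(y_valid, pred) ** 0.5}
    for name, pred in models.items()
}).T
print(summary)
```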
# Conclusion
Restricting the models to the top 4 key features improves predictions for both algorithms. The Multiple Linear Regression model using those features produces accurate predicted prices with small residuals, and the four-feature Gradient Boosting model lands in a similar range; with only two validation cars, however, the margin between the two models is not statistically meaningful.