# Car Sales Price Prediction
## Libraries
```
!pip install dmba
```
```
Requirement already satisfied: dmba in /usr/local/lib/python3.7/dist-packages (0.1.0)
```
```
import math
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
import matplotlib.pylab as plt
from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score
import statsmodels.api as sm
import keras
from keras import metrics
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Conv2D, MaxPooling2D
#from keras.optimizers import RMSprop
from tensorflow.keras.optimizers import Adam, RMSprop
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
#from keras.utils import plot_model
from keras.models import load_model
from tensorflow.keras.utils import to_categorical
import matplotlib.gridspec as gridspec
import seaborn as sns
from mpl_toolkits import mplot3d
from scipy import stats
%matplotlib inline
```
## Preprocessing
```
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
import pandas_profiling as pp
```
## Models
```
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor, RidgeCV
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
import sklearn.model_selection
from sklearn.model_selection import cross_val_predict as cvp
from sklearn import metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.stats import pearsonr
# import xgboost as xgb
# import lightgbm as lgb
```
## Model Tuning
```
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
import warnings
warnings.filterwarnings("ignore")
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBRegressor
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
#from imblearn.datasets import fetch_datasets
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import RUSBoostClassifier
from xgboost import XGBClassifier
from imblearn.metrics import geometric_mean_score
from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
# Machine learning libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Data fetching
from pandas_datareader import data as pdr
#import yfinance as yf
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
```
## Load Google Drive
```
from google.colab import drive
drive.mount('/content/drive')
```
```
Mounted at /content/drive
```
## Load data from Google Drive
```
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Projects/Toyota/DataToyotaCorolla.csv', encoding='cp1252')
df_valid = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Projects/Toyota/TestingDataToyotaCorolla.csv', encoding='cp1252')
```
### 1/ Create dummy variables
Create dummy variables for the categorical predictors; the resulting training file is saved as TrainingDataToyotaCorolla.csv.
```
predictors_all = ['Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Automatic','Doors', 'Quarterly_Tax', 'Mfr_Guarantee',
'Guarantee_Period', 'Airco', 'Automatic_airco', 'CD_Player', 'Powered_Windows', 'Sport_Model', 'Tow_Bar']
outcome = 'Price'
# Partition data into predictors (x) and output (y)
X = pd.get_dummies(df[predictors_all], drop_first=True)
y = df[outcome]
y_df = pd.DataFrame(y)
y_df.head(9)
```
```
Price
0 13500
1 13750
2 13950
3 14950
4 13750
5 12950
6 16900
7 18600
8 21500
```
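Note that `drop_first=True` drops the first level of each categorical predictor (here `Fuel_Type_CNG`), leaving two fuel-type dummies and 16 columns in `X` overall; a quick way to see the effect:
```
# drop_first=True removes one redundant dummy level per categorical
print(pd.get_dummies(df[['Fuel_Type']]).columns.tolist())
print(pd.get_dummies(df[['Fuel_Type']], drop_first=True).columns.tolist())
print(X.shape)
```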
### 2/ Multiple Linear Regression
```
df_validation = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Projects/Toyota/TestingDataToyotaCorolla.csv', encoding='cp1252')
#df_validation = pd.read_csv('TestingDataToyotaCorolla.csv')
predictors = ['Age_08_04', 'Automatic_airco', 'HP','KM']
outcome = 'Price'
valid_X = df_validation[predictors]
valid_y = df_validation[outcome]
valid_X.head()
```
```
Age_08_04 Automatic_airco HP KM
0 23 0 90 46986
1 23 0 90 72937
```
#### 2.1/ Predict prices based on all features
Here I repeat the partitioning steps to verify that the train and test splits built from the CSV files imported from Google Drive are correct inputs both for the full-feature model and for the Linear Regression model restricted to the top 4 features identified in the previous part.
```
predictors = ['Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Automatic','Doors', 'Quarterly_Tax', 'Mfr_Guarantee',
'Guarantee_Period', 'Airco', 'Automatic_airco', 'CD_Player', 'Powered_Windows', 'Sport_Model', 'Tow_Bar']
outcome = 'Price'
```
#### Partition data into predictors (x) and output (y)
```
X2 = pd.get_dummies(df[predictors])
y2 = df[outcome]
y_df2 = pd.DataFrame(y2)
X2.head()
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_CNG Fuel_Type_Diesel Fuel_Type_Petrol
0 23 46986 90 0 3 210 0 3 0 0 0 1 0 0 0 1 0
1 23 72937 90 0 3 210 0 3 1 0 1 0 0 0 0 1 0
2 24 41711 90 0 3 210 1 3 0 0 0 0 0 0 0 1 0
3 26 48000 90 0 3 210 1 3 0 0 0 0 0 0 0 1 0
4 30 38500 90 0 3 210 1 3 1 0 0 1 0 0 0 1 0
```
```
predictors4 = ['Age_08_04', 'KM', 'HP', 'Automatic_airco']  # top 4 features, selected earlier
from sklearn.model_selection import train_test_split
# Use the same seed for both splits so the two models see identical rows
X_train, X_test, y_train, y_test = train_test_split(X[predictors4], y, test_size=0.4, shuffle=True, random_state=1612)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.4, shuffle=True, random_state=1612)
X_train2.head()
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_CNG Fuel_Type_Diesel Fuel_Type_Petrol
468 52 66527 110 0 5 85 1 3 1 0 1 1 0 1 0 0 1
786 67 75429 110 0 3 85 1 3 1 0 0 1 1 0 0 0 1
383 55 150000 110 0 3 72 0 6 1 0 0 1 0 0 1 0 0
845 61 66000 110 0 3 69 0 3 1 0 0 0 1 0 0 0 1
415 55 97234 110 0 5 85 0 3 1 0 0 1 0 0 0 0 1
```
```
# Quick look at the two validation records
df_valid.value_counts()
```
```
Id Model Price Age_08_04 Mfg_Month Mfg_Year KM Fuel_Type HP Met_Color Color Automatic CC Doors Cylinders Gears Quarterly_Tax Weight Mfr_Guarantee BOVAG_Guarantee Guarantee_Period ABS Airbag_1 Airbag_2 Airco Automatic_airco Boardcomputer CD_Player Central_Lock Powered_Windows Power_Steering Radio Mistlamps Sport_Model Backseat_Divider Metallic_Rim Radio_cassette Parking_Assistant Tow_Bar
1 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13500 23 10 2002 46986 Diesel 90 1 Blue 0 2000 3 4 5 210 1165 0 1 3 1 1 1 0 0 1 0 1 1 1 0 0 0 1 0 0 0 0 1
2 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13750 23 10 2002 72937 Diesel 90 1 Black 0 2000 3 4 5 210 1165 0 1 3 1 1 1 1 0 1 1 1 0 1 0 0 0 1 0 0 0 0 1
dtype: int64
```
```
X_train2.head()
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_CNG Fuel_Type_Diesel Fuel_Type_Petrol
468 52 66527 110 0 5 85 1 3 1 0 1 1 0 1 0 0 1
786 67 75429 110 0 3 85 1 3 1 0 0 1 1 0 0 0 1
383 55 150000 110 0 3 72 0 6 1 0 0 1 0 0 1 0 0
845 61 66000 110 0 3 69 0 3 1 0 0 0 1 0 0 0 1
415 55 97234 110 0 5 85 0 3 1 0 0 1 0 0 0 0 1
```
```
X_valid = pd.get_dummies(df_valid[predictors4])
X_valid
```
```
Age_08_04 KM HP Automatic_airco
0 23 46986 90 0
1 23 72937 90 0
```
```
X_valid2 = pd.get_dummies(df_valid[predictors])
X_valid2
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_Diesel
0 23 46986 90 0 3 210 0 3 0 0 0 1 0 0 1
1 23 72937 90 0 3 210 0 3 1 0 1 0 0 0 1
```
```
# The 2-row validation set contains only Diesel cars, so get_dummies produced
# a single Fuel_Type_Diesel column; drop it and rebuild all three dummies below
X_valid2.drop('Fuel_Type_Diesel', inplace=True, axis=1)
X_valid2
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar
0 23 46986 90 0 3 210 0 3 0 0 0 1 0 0
1 23 72937 90 0 3 210 0 3 1 0 1 0 0 0
```
```
# Re-add the full set of fuel-type dummies (both validation cars are Diesel)
X_valid2 = X_valid2.assign(Fuel_Type_CNG = [0, 0])
X_valid2 = X_valid2.assign(Fuel_Type_Diesel = [1, 1])
X_valid2 = X_valid2.assign(Fuel_Type_Petrol = [0, 0])
# The top-4 validation frame is unaffected
X_valid
```
```
Age_08_04 KM HP Automatic_airco
0 23 46986 90 0
1 23 72937 90 0
```
```
X_valid2
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_CNG Fuel_Type_Diesel Fuel_Type_Petrol
0 23 46986 90 0 3 210 0 3 0 0 0 1 0 0 0 1 0
1 23 72937 90 0 3 210 0 3 1 0 1 0 0 0 0 1 0
```
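A more robust alternative to the manual drop/assign steps above is to align the validation dummies to the training columns in one `reindex` call; a sketch, assuming `X2` holds the dummy-encoded training design matrix:
```
# Align validation dummies to the training columns; missing dummies become 0
X_valid2_aligned = pd.get_dummies(df_valid[predictors]).reindex(columns=X2.columns, fill_value=0)
X_valid2_aligned
```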
```
y_valid = df_valid[outcome]
y_valid2 = y_valid  # identical actual prices for both feature sets
y_valid2
```
```
0 13500
1 13750
Name: Price, dtype: int64
```
```
X_train
```
```
Age_08_04 KM HP Automatic_airco
468 52 66527 110 0
786 67 75429 110 0
383 55 150000 110 0
845 61 66000 110 0
415 55 97234 110 0
... ... ... ... ...
1212 72 86860 110 0
112 8 13253 116 1
1257 73 76151 86 0
1348 79 61165 107 0
627 65 132807 72 0
861 rows × 4 columns
```
#### 2.2/ Predict prices based on top 4 features
The top 4 features were determined by a feature-selection pass over the Linear Regression model prior to this step; that code will be added later. A sketch of what such a selection could look like follows.
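As an illustrative placeholder (not the original selection run), forward selection with the dmba helpers imported earlier could be set up like this, assuming `X_train2`/`y_train2` as the search data:
```
# Sketch: forward selection scored by AIC, using the dmba helpers
def train_model(variables):
    if len(variables) == 0:
        return None
    model = LinearRegression()
    model.fit(X_train2[variables], y_train2)
    return model

def score_model(model, variables):
    if len(variables) == 0:
        # AIC of the intercept-only (mean) model
        return AIC_score(y_train2, [y_train2.mean()] * len(y_train2), model, df=1)
    return AIC_score(y_train2, model.predict(X_train2[variables]), model)

best_model, best_variables = forward_selection(X_train2.columns, train_model, score_model, verbose=True)
print(best_variables)
```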
```
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
X_train2
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_CNG Fuel_Type_Diesel Fuel_Type_Petrol
468 52 66527 110 0 5 85 1 3 1 0 1 1 0 1 0 0 1
786 67 75429 110 0 3 85 1 3 1 0 0 1 1 0 0 0 1
383 55 150000 110 0 3 72 0 6 1 0 0 1 0 0 1 0 0
845 61 66000 110 0 3 69 0 3 1 0 0 0 1 0 0 0 1
415 55 97234 110 0 5 85 0 3 1 0 0 1 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1212 72 86860 110 0 5 85 1 3 1 0 0 1 0 0 0 0 1
112 8 13253 116 0 5 234 0 3 1 1 1 1 0 0 0 1 0
1257 73 76151 86 0 3 69 0 3 0 0 0 0 1 0 0 0 1
1348 79 61165 107 1 3 85 1 3 1 0 0 0 1 0 0 0 1
627 65 132807 72 0 5 185 1 3 1 0 0 0 1 0 0 1 0
861 rows × 17 columns
```
```
linear_reg2 = LinearRegression()
linear_reg2.fit(X_train2, y_train2)
# Coefficients of the top-4 model
Coef_Matrix = pd.DataFrame({'Predictor': X_valid.columns, 'coefficient': linear_reg.coef_})
print(Coef_Matrix)
print("\nIntercept = ", linear_reg.intercept_)
```
```
Predictor coefficient
0 Age_08_04 -139.708718
1 KM -0.011057
2 HP 26.463421
3 Automatic_airco 3319.180854
Intercept = 16472.1068808052
```
```
Coef_Matrix2 = pd.DataFrame({'Predictor': X_valid2.columns, 'coefficient': linear_reg2.coef_})
print(Coef_Matrix2)
print("\nIntercept = ", linear_reg2.intercept_)
```
```
Predictor coefficient
0 Age_08_04 -112.414337
1 KM -0.016853
2 HP 28.239217
3 Automatic 445.757345
4 Doors 96.709006
5 Quarterly_Tax 17.557193
6 Mfr_Guarantee 142.638902
7 Guarantee_Period 96.338023
8 Airco 248.941678
9 Automatic_airco 3042.190782
10 CD_Player 271.683170
11 Powered_Windows 509.603538
12 Sport_Model 306.421134
13 Tow_Bar -175.214506
14 Fuel_Type_CNG -1087.601376
15 Fuel_Type_Diesel 304.458421
16 Fuel_Type_Petrol 783.142954
Intercept = 11563.374143569199
```
```
X_valid2
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_CNG Fuel_Type_Diesel Fuel_Type_Petrol
0 23 46986 90 0 3 210 0 3 0 0 0 1 0 0 0 1 0
1 23 72937 90 0 3 210 0 3 1 0 1 0 0 0 0 1 0
```
```
predicted_y = linear_reg.predict(X_valid)
result = pd.DataFrame({'Predicted': predicted_y, 'Actual': y_valid, 'Residual': y_valid - predicted_y})
print(result)
```
```
Predicted Actual Residual
0 15120.974141 13500 -1620.974141
1 14834.025128 13750 -1084.025128
```
```
predicted_y2 = linear_reg2.predict(X_valid2)
result2 = pd.DataFrame({'Predicted': predicted_y2, 'Actual': y_valid2, 'Residual': y_valid2 - predicted_y2})
print(result2)
```
```
Predicted Actual Residual
0 15807.754337 13500 -2307.754337
1 15381.435493 13750 -1631.435493
```
So, the predicted prices for the 2 Toyota Corolla cars in the testing file using the top 4 features are 15120.97 and 14834.03, with residuals of -1620.97 and -1084.03 respectively.
Meanwhile, the predicted prices using all features are 15807.75 and 15381.44, with residuals of -2307.75 and -1631.44 respectively.
To conclude, the predictions from the Linear Regression model using the top 4 features are closer to the actual prices: narrowing down to the key features identified earlier drives a more accurate price prediction.
## Manual testing for Predicted results
```
# Reconstruct each prediction by hand: intercept + sum(coefficient_i * x_i)
row0 = X_valid.iloc[0]
y_pred_1 = (linear_reg.intercept_
            + linear_reg.coef_[0] * row0['Age_08_04']
            + linear_reg.coef_[1] * row0['KM']
            + linear_reg.coef_[2] * row0['HP']
            + linear_reg.coef_[3] * row0['Automatic_airco'])
row1 = X_valid.iloc[1]
y_pred_2 = (linear_reg.intercept_
            + linear_reg.coef_[0] * row1['Age_08_04']
            + linear_reg.coef_[1] * row1['KM']
            + linear_reg.coef_[2] * row1['HP']
            + linear_reg.coef_[3] * row1['Automatic_airco'])
print(y_pred_1, y_pred_2)
```
The manually computed prices match the model's predictions, confirming the intercept and coefficients.
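As a final sanity check, the hand-computed values can be compared against `model.predict` directly:
```
# The manual predictions should agree with the model to floating-point precision
assert np.allclose([y_pred_1, y_pred_2], linear_reg.predict(X_valid))
```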
#### 2.3/ Regression Summary
```
# Validation errors for the full-feature model (uncomment the next line for the top-4 model)
# regressionSummary(y_valid, linear_reg.predict(X_valid[predictors4]))
regressionSummary(y_valid2, linear_reg2.predict(X_valid2))
```
```
Regression statistics
Mean Error (ME) : -1969.5949
Root Mean Squared Error (RMSE) : 1998.4134
Mean Absolute Error (MAE) : 1969.5949
Mean Percentage Error (MPE) : -14.4797
Mean Absolute Percentage Error (MAPE) : 14.4797
```
The statistics above are for the full-feature model; the Multiple Linear Regression using the 4 key features achieves lower values on all of these error metrics.
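These dmba numbers can be cross-checked against sklearn's own metrics; a quick sketch for the full-feature model:
```
# Cross-check regressionSummary with sklearn equivalents
from sklearn.metrics import mean_absolute_error, mean_squared_error
pred = linear_reg2.predict(X_valid2)
print('MAE :', mean_absolute_error(y_valid2, pred))
print('RMSE:', mean_squared_error(y_valid2, pred) ** 0.5)
```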
### 3/ Gradient Boosting Classifier
"Ensemble Learning: To obtain improved predictive efficiency than could be extracted from any of the constituent learning algorithms alone, ensemble approaches use multiple learning algorithms."
Resource: https://www.askpython.com/python/examples/gradient-boosting
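Since Price is a continuous target, a gradient-boosted regressor is arguably a more natural fit than the classifier used below; a minimal sketch, with illustrative (untuned) hyperparameters:
```
# Sketch: gradient boosting as a regressor on the same splits
from sklearn.ensemble import GradientBoostingRegressor
gb_reg = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=2, random_state=1612)
gb_reg.fit(X_train, y_train)
print(gb_reg.predict(X_valid))
```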
#### 3.1/ Create Gradient Boosting models
```
# Identical hyperparameters for the top-4 and full-feature classifiers
gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=1, max_features=2, max_depth=2)
gb_clf_all = GradientBoostingClassifier(n_estimators=20, learning_rate=1, max_features=2, max_depth=2)
```
#### 3.2/ Fit the models
```
gb_clf.fit(X_train[predictors4], y_train)
gb_clf_all.fit(X_train2, y_train2)
```
#### 3.3/ Accuracy Scores
Note that GradientBoostingClassifier treats every distinct price as a separate class, so classification accuracy is close to zero here and is not a meaningful metric for a continuous target.
```
print(gb_clf.score(X_train[predictors4], y_train))
```
```
0.027874564459930314
```
```
print(gb_clf_all.score(X_train2, y_train2))
```
```
0.019744483159117306
```
```
print(gb_clf_all.score(X_valid2, y_valid2))
```
```
0.0
```
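The near-zero scores follow from the sheer number of distinct price "classes" the classifier has to choose from; a quick check, assuming `y_train` from the split above:
```
# Chance-level accuracy is roughly 1 / number of distinct price classes
print(y_train.nunique(), "distinct prices ->", 1 / y_train.nunique())
```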
```
df_valid
```
```
Id Model Price Age_08_04 Mfg_Month Mfg_Year KM Fuel_Type HP Met_Color ... Powered_Windows Power_Steering Radio Mistlamps Sport_Model Backseat_Divider Metallic_Rim Radio_cassette Parking_Assistant Tow_Bar
0 1 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13500 23 10 2002 46986 Diesel 90 1 ... 1 1 0 0 0 1 0 0 0 0
1 2 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13750 23 10 2002 72937 Diesel 90 1 ... 0 1 0 0 0 1 0 0 0 0
2 rows × 39 columns
```
#### For the case of the top 4 features
```
df_valid[['Age_08_04', 'KM', 'HP', 'Automatic_airco']]
```
```
Age_08_04 KM HP Automatic_airco
0 23 46986 90 0
1 23 72937 90 0
```
#### 3.4/ Predicted & Residual Values
```
predicted_y_gb = gb_clf.predict(X_valid[predictors4])
X_valid2
```
```
Age_08_04 KM HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar Fuel_Type_CNG Fuel_Type_Diesel Fuel_Type_Petrol
0 23 46986 90 0 3 210 0 3 0 0 0 1 0 0 0 1 0
1 23 72937 90 0 3 210 0 3 1 0 1 0 0 0 0 1 0
```
```
predicted_y_gb2 = gb_clf_all.predict(X_valid2)
result = pd.DataFrame({'Predicted': predicted_y_gb, 'Actual': y_valid, 'Residual': y_valid - predicted_y_gb})
result
```
```
Predicted Actual Residual
0 12750 13500 750
1 12750 13750 1000
```
```
result2 = pd.DataFrame({'Predicted': predicted_y_gb2, 'Actual': y_valid2, 'Residual': y_valid2 - predicted_y_gb2})
result2
```
```
Predicted Actual Residual
0 14900 13500 -1400
1 9940 13750 3810
```
The predicted prices for the 2 cars using the top 4 features in the Gradient Boosting model are 12750 and 12750, with residual values of 750 and 1000 respectively.
However, the predicted prices using all features are 14900 and 9940, with residual values of -1400 and 3810 respectively.
To conclude, the residuals of the Gradient Boosting model using the top 4 features are smaller in magnitude than those of the model using all features.
#### 3.5/ Compare the Performance Metrics
```
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from math import sqrt
# sklearn metrics expect (y_true, y_pred)
print("Mean Absolute Error MAE: " + str(mean_absolute_error(y_valid, predicted_y_gb)))
print("Mean Squared Error MSE: " + str(mean_squared_error(y_valid, predicted_y_gb)))
print("Root Mean Squared Error RMSE: " + str(sqrt(mean_squared_error(y_valid, predicted_y_gb))))
print("Mean Absolute Percentage Error MAPE: " + str(mean_absolute_percentage_error(y_valid, predicted_y_gb)))
```
```
Mean Absolute Error MAE: 875.0
Mean Squared Error MSE: 781250.0
Root Mean Squared Error RMSE: 883.8834764831844
Mean Absolute Percentage Error MAPE: 0.06414141414141414
```
```
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from math import sqrt
print("Mean Absolute Error MAE: " + str(mean_absolute_error(y_valid2, predicted_y_gb2)))
print("Mean Squared Error MSE: " + str(mean_squared_error(y_valid2, predicted_y_gb2)))
print("Root Mean Squared Error RMSE: " + str(sqrt(mean_squared_error(y_valid2, predicted_y_gb2))))
print("Mean Absolute Percentage Error MAPE: " + str(mean_absolute_percentage_error(y_valid2, predicted_y_gb2)))
```
```
Mean Absolute Error MAE: 2605.0
Mean Squared Error MSE: 8238050.0
Root Mean Squared Error RMSE: 2870.200341439601
Mean Absolute Percentage Error MAPE: 0.1903973063973064
```
The MAE and RMSE of the Gradient Boosting model using all features are higher than those of both Multiple Linear Regression models.
Within Gradient Boosting, every metric for the all-features model is higher than for the model using the top 4 features.
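To see all four models side by side, the validation metrics can be collected into one table; a sketch reusing the fitted models and predictions from above:
```
# Side-by-side validation metrics for the four fitted models
from sklearn.metrics import mean_absolute_error, mean_squared_error
models = {
    'MLR (top 4)': linear_reg.predict(X_valid),
    'MLR (all features)': linear_reg2.predict(X_valid2),
    'GB (top 4)': gb_clf.predict(X_valid[predictors4]),
    'GB (all features)': gb_clf_all.predict(X_valid2),
}
summary = pd.DataFrame({
    name: {'MAE': mean_absolute_error(y_valid, pred),
           'RMSE': mean_squared_error(y_valid, pred) ** 0.5}
    for name, pred in models.items()
}).T
print(summary)
```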
# Conclusion
Restricting the models to the top 4 key features improves predictions for both algorithms. The Multiple Linear Regression model using those features produces accurate predicted prices with small residuals, and the four-feature Gradient Boosting model lands in a similar range; with only two validation cars, however, the margin between the two models is not statistically meaningful.