2019 Introduction to Machine Learning Program Assignment #1 - Naïve Bayes
===

:::info
CS10 游騰德 0616026
:::

Objective
===

1. Data Input

   1. Mushroom

```python=
with open('Downloads/agaricus-lepiota.data', newline='') as csvfile:
    rows = list(csv.reader(csvfile))
```

   2. Iris

```python=
with open('bezdekIris.data', newline='') as csvfile:
    rows = list(csv.reader(csvfile))
```

2. Data Visualization

:::warning
Please refer to [here](https://drive.google.com/open?id=1lbkhYAlIu2dbEQnpmTXoh10k97fQlKIx) to view all of the data visualization figures.
:::

   1. Mushroom

```python=
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import csv
import math
import random
import statistics

savefig = 0

with open('agaricus-lepiota.data', newline='') as csvfile:
    rows = list(csv.reader(csvfile))

feature_name = ["label", "cap-shape", "cap-surface", "cap-color", "bruises?",
                "odor", "gill-attachment", "gill-spacing", "gill-size",
                "gill-color", "stalk-shape", "stalk-root",
                "stalk-surface-above-ring", "stalk-surface-below-ring",
                "stalk-color-above-ring", "stalk-color-below-ring",
                "veil-type", "veil-color", "ring-number", "ring-type",
                "spore-print-color", "population", "habitat"]
# Value counts per column: overall, edible-only, and poisonous-only
feature_list_all = [{} for _ in range(23)]
feature_list_e = [{} for _ in range(23)]
feature_list_p = [{} for _ in range(23)]

# Drop samples whose stalk-root (column 11) value is missing
for r in range(len(rows)-1, -1, -1):
    row = rows[r]
    if row[11] == '?':
        del rows[r]
data_count = len(rows)

for r in range(0, data_count):
    row = rows[r]
    label = row[0]
    if label in feature_list_all[0]:
        feature_list_all[0][label] += 1
    else:
        feature_list_all[0][label] = 1
    for i in range(1, 23):
        feature = row[i]
        if feature in feature_list_all[i]:
            feature_list_all[i][feature] += 1
        else:
            feature_list_all[i][feature] = 1
        if label == 'e':
            if feature in feature_list_e[i]:
                feature_list_e[i][feature] += 1
            else:
                feature_list_e[i][feature] = 1
        elif label == 'p':
            if feature in feature_list_p[i]:
                feature_list_p[i][feature] += 1
            else:
                feature_list_p[i][feature] = 1

for f in range(23):
    # Overall distribution of this column
    keys = feature_list_all[f].keys()
    vals = feature_list_all[f].values()
    plt.bar(keys, np.divide(list(vals), sum(vals)), label=feature_name[f]+" distribution")
    plt.ylim(0, 1)
    plt.ylabel('Percentage')
    plt.xlabel(feature_name[f])
    plt.xticks(list(keys))
    plt.legend(bbox_to_anchor=(1, 1), loc="upper right", borderaxespad=0.)
    if savefig:
        plt.savefig("Visualization/Mushroom/"+feature_name[f]+"_distribution.png")
    plt.show()
    if f == 0:
        continue  # the label column has no class-conditional plots
    # Distribution conditioned on the edible class
    keys = feature_list_e[f].keys()
    vals = feature_list_e[f].values()
    plt.bar(keys, np.divide(list(vals), sum(vals)), label=feature_name[f]+"|edible distribution")
    plt.ylim(0, 1)
    plt.ylabel('Percentage')
    plt.xlabel(feature_name[f])
    plt.xticks(list(keys))
    plt.legend(bbox_to_anchor=(1, 1), loc="upper right", borderaxespad=0.)
    if savefig:
        plt.savefig("Visualization/Mushroom/"+feature_name[f]+"|edible_distribution.png")
    plt.show()
    # Distribution conditioned on the poisonous class
    keys = feature_list_p[f].keys()
    vals = feature_list_p[f].values()
    plt.bar(keys, np.divide(list(vals), sum(vals)), label=feature_name[f]+"|poisonous distribution")
    plt.ylim(0, 1)
    plt.ylabel('Percentage')
    plt.xlabel(feature_name[f])
    plt.xticks(list(keys))
    plt.legend(bbox_to_anchor=(1, 1), loc="upper right", borderaxespad=0.)
    if savefig:
        plt.savefig("Visualization/Mushroom/"+feature_name[f]+"|poisonous_distribution.png")
    plt.show()
```

   2. Iris

```python=
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import csv
import math
import random
import statistics

savefig = 0

def GD(x, mu, sigma):
    # Gaussian (normal) probability density function
    N = 1/(sigma * np.sqrt(2 * np.pi)) * np.exp(-(x - mu)**2 / (2 * sigma**2))
    return N

with open('bezdekIris.data', newline='') as csvfile:
    rows = list(csv.reader(csvfile))

# feature[f][c]: values of feature f for class c (index 3 = all classes)
feature = [[[], [], [], []] for _ in range(4)]

# Drop empty lines
for r in range(len(rows)-1, -1, -1):
    row = rows[r]
    if len(row) == 0:
        del rows[r]
data_count = len(rows)

for r in range(0, data_count):
    row = rows[r]
    iris = 0
    if row[4] == 'Iris-setosa':
        iris = 0
    elif row[4] == 'Iris-versicolor':
        iris = 1
    elif row[4] == 'Iris-virginica':
        iris = 2
    for f in range(4):
        feature[f][iris].append(float(row[f]))
        feature[f][3].append(float(row[f]))

feature_name = ["sepal length", "sepal width", "petal length", "petal width"]
condition_name = ["|setosa", "|versicolor", "|virginica", ""]
for f in range(4):
    for c in range(4):
        print("Average of", feature_name[f]+condition_name[c], statistics.mean(feature[f][c]))
        print("Standard deviation of", feature_name[f]+condition_name[c], statistics.stdev(feature[f][c]))
        plt.hist(feature[f][c])
        plt.title(feature_name[f]+condition_name[c])
        plt.ylabel('Count')
        plt.xlabel("Value")
        if savefig:
            plt.savefig("Visualization/Iris/"+feature_name[f]+condition_name[c]+" distribution.png")
        plt.show()
```

Text output of this Python code:

```
Average of sepal length|setosa 5.006
Standard deviation of sepal length|setosa 0.3524896872134513
Average of sepal length|versicolor 5.936
Standard deviation of sepal length|versicolor 0.5161711470638634
Average of sepal length|virginica 6.588
Standard deviation of sepal length|virginica 0.6358795932744321
Average of sepal length 5.843333333333334
Standard deviation of sepal length 0.8280661279778629
Average of sepal width|setosa 3.428
Standard deviation of sepal width|setosa 0.37906436909628866
Average of sepal width|versicolor 2.77
Standard deviation of sepal width|versicolor 0.3137983233784114
Average of sepal width|virginica 2.974
Standard deviation of sepal width|virginica 0.32249663817263746
Average of sepal width 3.0573333333333332
Standard deviation of sepal width 0.4358662849366982
Average of petal length|setosa 1.462
Standard deviation of petal length|setosa 0.17366399648018407
Average of petal length|versicolor 4.26
Standard deviation of petal length|versicolor 0.46991097723995795
Average of petal length|virginica 5.552
Standard deviation of petal length|virginica 0.5518946956639834
Average of petal length 3.758
Standard deviation of petal length 1.7652982332594664
Average of petal width|setosa 0.246
Standard deviation of petal width|setosa 0.10538558938004565
Average of petal width|versicolor 1.326
Standard deviation of petal width|versicolor 0.19775268000454405
Average of petal width|virginica 2.026
Standard deviation of petal width|virginica 0.27465005563666733
Average of petal width 1.1993333333333334
Standard deviation of petal width 0.7622376689603465
```
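The per-class means and standard deviations printed above are precisely the quantities the Gaussian Naïve Bayes model later re-estimates from the training split: the `GD` helper implements the normal density

$$
\mathcal{N}(x \mid \mu, \sigma) = \frac{1}{\sigma\sqrt{2\pi}} \exp\!\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)
$$

and each continuous iris feature is modeled with one such density per class.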
3. Data Processing

   - Drop samples that contain a missing value

   1. Mushroom (only `stalk-root`, column 11, has missing values)

```python=
for r in range(len(rows)-1, -1, -1):
    row = rows[r]
    if row[11] == '?':
        del rows[r]
```

   2. Iris (the raw file contains blank lines, which are removed)

```python=
for r in range(len(rows)-1, -1, -1):
    row = rows[r]
    if len(row) == 0:
        del rows[r]
```

   - Shuffle

```python=
random.shuffle(rows)
```

4. Model Construction (the scoring rule implemented by these snippets is summarized right after this section)

   1. Mushroom

```python=
for i in range(1, 23):
    if 'e'+row[i] in feature_list[i]:
        p_edible += math.log((feature_list[i]['e'+row[i]] + k) / (feature_list[0]['e'] + (k*feature_count[i])))
    elif k != 0:
        p_edible += math.log(1/feature_count[i])
    if 'p'+row[i] in feature_list[i]:
        p_poisonous += math.log((feature_list[i]['p'+row[i]] + k) / (feature_list[0]['p'] + (k*feature_count[i])))
    elif k != 0:
        p_poisonous += math.log(1/feature_count[i])
p_edible += math.log(feature_list[0]['e'] / (feature_list[0]['e'] + feature_list[0]['p']))
p_poisonous += math.log(feature_list[0]['p'] / (feature_list[0]['e'] + feature_list[0]['p']))
```

   2. Iris

```python=
p_setosa += math.log(GD(float(row[0]), statistics.mean(sl[0]), statistics.stdev(sl[0])))
p_setosa += math.log(GD(float(row[1]), statistics.mean(sw[0]), statistics.stdev(sw[0])))
p_setosa += math.log(GD(float(row[2]), statistics.mean(pl[0]), statistics.stdev(pl[0])))
p_setosa += math.log(GD(float(row[3]), statistics.mean(pw[0]), statistics.stdev(pw[0])))
p_setosa += math.log(setosa / (setosa+versicolor+virginica))
p_versicolor += math.log(GD(float(row[0]), statistics.mean(sl[1]), statistics.stdev(sl[1])))
p_versicolor += math.log(GD(float(row[1]), statistics.mean(sw[1]), statistics.stdev(sw[1])))
p_versicolor += math.log(GD(float(row[2]), statistics.mean(pl[1]), statistics.stdev(pl[1])))
p_versicolor += math.log(GD(float(row[3]), statistics.mean(pw[1]), statistics.stdev(pw[1])))
p_versicolor += math.log(versicolor / (setosa+versicolor+virginica))
p_virginica += math.log(GD(float(row[0]), statistics.mean(sl[2]), statistics.stdev(sl[2])))
p_virginica += math.log(GD(float(row[1]), statistics.mean(sw[2]), statistics.stdev(sw[2])))
p_virginica += math.log(GD(float(row[2]), statistics.mean(pl[2]), statistics.stdev(pl[2])))
p_virginica += math.log(GD(float(row[3]), statistics.mean(pw[2]), statistics.stdev(pw[2])))
p_virginica += math.log(virginica / (setosa+versicolor+virginica))
```
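Both snippets score a test sample by its per-class log posterior. For the mushroom data, the loop above computes

$$
\log P(c \mid \mathbf{x}) \;\propto\; \log P(c) + \sum_{i=1}^{22} \log \frac{n_{c,x_i} + k}{n_c + k\,|V_i|}, \qquad c \in \{e, p\},
$$

where $n_{c,x_i}$ is the number of training samples of class $c$ whose feature $i$ has value $x_i$, $n_c$ is the class count, $|V_i|$ is the number of possible values of feature $i$ (`feature_count[i]`), and $k$ is the Laplace smoothing constant. A (class, value) pair never seen in training falls back to $\log(1/|V_i|)$ when $k \neq 0$, and is simply skipped when $k = 0$. For the iris data, the categorical likelihood term is replaced by the Gaussian density $GD(x_f;\, \mu_{c,f},\, \sigma_{c,f})$ fitted per class and feature, as shown in the second snippet.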
5. Train-Test-Split (a minimal sketch of both index schemes follows this list)

   - Holdout validation (70% train / 30% test)
   - K-fold cross-validation (k = 3)
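For reference, here is a compact sketch of how the two schemes carve test index ranges out of the shuffled rows, mirroring the full listings in the Codes section (the helper names are for illustration only and do not appear in the report code):

```python=
def holdout_ranges(n, test_ratio=0.3):
    # A single test block: the first 30% of the shuffled rows;
    # every row outside it is used for training.
    return [(0, int(n * test_ratio))]

def kfold_ranges(n, k=3):
    # k consecutive, non-overlapping test blocks; each fold trains
    # on all rows outside its own test block.
    ranges, end = [], 0
    for _ in range(k):
        start = end
        end = min(start + n // k, n)
        ranges.append((start, end))
    return ranges

# With the cleaned mushroom data (5644 rows):
# holdout_ranges(5644)  -> [(0, 1693)]
# kfold_ranges(5644, 3) -> [(0, 1881), (1881, 3762), (3762, 5643)]
```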
6. Results

   - Holdout validation

   1. Mushroom Result

```
Holdout Validation
--------------------------------------------------
Confusion matrix
                    Predict Edible   Predict Poisonous
Actual Edible       1043             9
Actual Poisonous    222              419
--------------------------------------------------
Accuracy               0.863555818074424
Precision (Edible)     0.9914448669201521
Precision (Poisonous)  0.6536661466458659
Recall (Edible)        0.824505928853755
Recall (Poisonous)     0.9789719626168224
--------------------------------------------------
data_count        5644
train_data_count  3951
test_data_count   1693
Laplace k         3
```

   2. Iris Result

```
Holdout validation
-----------------------------------------------------------------------
Confusion matrix
                    Predict setosa   Predict versicolor   Predict virginica
Actual setosa       13               0                    0
Actual versicolor   0                15                   0
Actual virginica    0                1                    16
-----------------------------------------------------------------------
Accuracy                0.9777777777777777
Precision (setosa)      1.0
Precision (versicolor)  1.0
Precision (virginica)   0.9411764705882353
Recall (setosa)         0.9285714285714286
Recall (versicolor)     0.9375
Recall (virginica)      1.0
-----------------------------------------------------------------------
data_count        150
train_data_count  105
test_data_count   45
```

   - K-fold cross-validation

   1. Mushroom Result

```
K-fold cross-validation
--------------------------------------------------
Confusion matrix
                    Predict Edible       Predict Poisonous
Actual Edible       1156.6666666666667   6
Actual Poisonous    251.33333333333334   467
--------------------------------------------------
Accuracy               0.8631933368775474
Precision (Edible)     0.9948394495412844
Precision (Poisonous)  0.6501160092807424
Recall (Edible)        0.8214962121212122
Recall (Poisonous)     0.9873150105708245
--------------------------------------------------
data_count              5644
train_data_count (avg)  3763
test_data_count (avg)   1881
Laplace k               3
```

   2. Iris Result

```
K-fold cross-validation
-----------------------------------------------------------------------
Confusion matrix
                    Predict setosa       Predict versicolor   Predict virginica
Actual setosa       16.666666666666668   0                    0
Actual versicolor   0                    15.333333333333334   1.3333333333333333
Actual virginica    0                    1.3333333333333333   15.333333333333334
-----------------------------------------------------------------------
Accuracy                0.9466666666666665
Precision (setosa)      1.0
Precision (versicolor)  0.9199999999999999
Precision (virginica)   0.9199999999999999
Recall (setosa)         0.8620689655172415
Recall (versicolor)     0.9199999999999999
Recall (virginica)      0.9199999999999999
-----------------------------------------------------------------------
data_count              150
train_data_count (avg)  100
test_data_count (avg)   50
```

7. Comparison & Conclusion

   1. Mushroom

   - With Laplace smoothing (k = 3, k-fold run)

```
K-fold cross-validation
--------------------------------------------------
Confusion matrix
                    Predict Edible       Predict Poisonous
Actual Edible       1156.6666666666667   6
Actual Poisonous    251.33333333333334   467
--------------------------------------------------
Accuracy               0.8631933368775474
Precision (Edible)     0.9948394495412844
Precision (Poisonous)  0.6501160092807424
Recall (Edible)        0.8214962121212122
Recall (Poisonous)     0.9873150105708245
--------------------------------------------------
data_count              5644
train_data_count (avg)  3763
test_data_count (avg)   1881
Laplace k               3
```

   - Without Laplace smoothing (k = 0, holdout run)

```
Holdout Validation
--------------------------------------------------
Confusion matrix
                    Predict Edible   Predict Poisonous
Actual Edible       984              44
Actual Poisonous    302              363
--------------------------------------------------
Accuracy               0.7956290608387477
Precision (Edible)     0.9571984435797666
Precision (Poisonous)  0.5458646616541354
Recall (Edible)        0.7651632970451011
Recall (Poisonous)     0.8918918918918919
--------------------------------------------------
data_count        5644
train_data_count  3951
test_data_count   1693
Laplace k         0
```

   - Conclusion: Laplace smoothing clearly improves accuracy on this dataset (0.863 with k = 3 versus 0.796 with k = 0). Without smoothing, a feature value that rarely or never co-occurs with a class in training drives that class's estimated likelihood toward zero, so a single such value can veto an otherwise well-supported prediction.
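To make the effect of smoothing concrete, here is a hypothetical worked example (the counts below are invented for exposition and are not taken from the dataset). Suppose a feature with $|V_i| = 9$ possible values never co-occurs with the edible class among $n_e = 2000$ edible training samples. The smoothed and unsmoothed likelihood estimates for that value are

$$
\frac{n_{e,v} + k}{n_e + k\,|V_i|} = \frac{0 + 3}{2000 + 27} \approx 1.5 \times 10^{-3}
\qquad\text{vs.}\qquad
\frac{0}{2000} = 0,
$$

so with smoothing an unseen value merely penalizes the class's log posterior instead of ruling the class out entirely.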
Codes
===

Mushroom dataset with Holdout validation
--

```python=
# Holdout validation
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import csv
import math
import random
import statistics

# Number of possible values for each column (index 0 is the label)
feature_count = [2, 6, 4, 10, 2, 9, 4, 3, 2, 12, 2, 6, 4, 4, 9, 9, 2, 4, 3, 8, 9, 6, 7]
k = 3  # Laplace smoothing constant

with open('Downloads/agaricus-lepiota.data', newline='') as csvfile:
    rows = list(csv.reader(csvfile))
random.shuffle(rows)

# feature_list[i] maps label+value to its training count for column i
feature_list = [{} for _ in range(23)]
data_count = 0
train_data_count = 0
test_data_count = 0
TE = 0
FE = 0
FP = 0
TP = 0

# Drop samples whose stalk-root (column 11) value is missing
for r in range(len(rows)-1, -1, -1):
    row = rows[r]
    if row[11] == '?':
        del rows[r]
data_count = len(rows)

random.shuffle(rows)
test_range_s = 0
test_range_e = test_range_s + int(data_count * 0.3)

# Train: count label and (label, feature value) occurrences outside the test range
for r in range(0, data_count):
    if r in range(test_range_s, test_range_e):
        continue
    row = rows[r]
    label = row[0]
    if label in feature_list[0]:
        feature_list[0][label] += 1
    else:
        feature_list[0][label] = 1
    for i in range(1, 23):
        label_feature = label + row[i]
        if label_feature in feature_list[i]:
            feature_list[i][label_feature] += 1
        else:
            feature_list[i][label_feature] = 1
    train_data_count += 1

# Test: compare the two per-class log posteriors
for r in range(test_range_s, test_range_e):
    row = rows[r]
    p_edible = 0
    p_poisonous = 0
    for i in range(1, 23):
        if 'e'+row[i] in feature_list[i]:
            p_edible += math.log((feature_list[i]['e'+row[i]] + k) / (feature_list[0]['e'] + (k*feature_count[i])))
        elif k != 0:
            p_edible += math.log(1/feature_count[i])
        if 'p'+row[i] in feature_list[i]:
            p_poisonous += math.log((feature_list[i]['p'+row[i]] + k) / (feature_list[0]['p'] + (k*feature_count[i])))
        elif k != 0:
            p_poisonous += math.log(1/feature_count[i])
    p_edible += math.log(feature_list[0]['e'] / (feature_list[0]['e'] + feature_list[0]['p']))
    p_poisonous += math.log(feature_list[0]['p'] / (feature_list[0]['e'] + feature_list[0]['p']))
    if p_edible >= p_poisonous:  # Predict edible
        if row[0] == 'e':
            TE += 1
        elif row[0] == 'p':
            FP += 1
    else:  # Predict poisonous
        if row[0] == 'e':
            FE += 1
        elif row[0] == 'p':
            TP += 1
    test_data_count += 1

# Results
print("Holdout Validation")
print("--------------------------------------------------")
print("Confusion matrix")
print("                   Predict Edible   Predict Poisonous")
print("Actual Edible     ", TE, "          ", FE)
print("Actual Poisonous  ", FP, "          ", TP)
print("--------------------------------------------------")
print("Accuracy              ", (TE+TP) / (TE+TP+FE+FP))
print("Precision (Edible)    ", (TE) / (TE+FE))
print("Precision (Poisonous) ", (TP) / (TP+FP))
print("Recall (Edible)       ", (TE) / (TE+FP))
print("Recall (Poisonous)    ", (TP) / (TP+FE))
print("--------------------------------------------------")
print("data_count       ", data_count)
print("train_data_count ", train_data_count)
print("test_data_count  ", test_data_count)
print("Laplace k        ", k)

###
#                    | Predict    | Predict
#                    | Edible     | Poisonous
# -------------------+------------+-------------
#  Actual Edible     |     TE     |     FE
# -------------------+------------+-------------
#  Actual Poisonous  |     FP     |     TP
#
###
```

Mushroom dataset with K-fold cross-validation
--

```python=
# K-fold cross-validation
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import csv
import math
import random
import statistics

# Number of possible values for each column (index 0 is the label)
feature_count = [2, 6, 4, 10, 2, 9, 4, 3, 2, 12, 2, 6, 4, 4, 9, 9, 2, 4, 3, 8, 9, 6, 7]
k = 3  # Laplace smoothing constant

with open('Downloads/agaricus-lepiota.data', newline='') as csvfile:
    rows = list(csv.reader(csvfile))
random.shuffle(rows)

data_count = 0
# One counter per fold
train_data_count = [0, 0, 0]
test_data_count = [0, 0, 0]
TE = [0, 0, 0]
FE = [0, 0, 0]
FP = [0, 0, 0]
TP = [0, 0, 0]

# Drop samples whose stalk-root (column 11) value is missing
for r in range(len(rows)-1, -1, -1):
    row = rows[r]
    if row[11] == '?':
        del rows[r]
data_count = len(rows)

test_range_e = 0
k_fold_k = 3
for k_fold_i in range(k_fold_k):
    test_range_s = test_range_e
    test_range_e = test_range_s + int(data_count / k_fold_k)
    test_range_e = min(test_range_e, data_count)
    feature_list = [{} for _ in range(23)]  # restart the counts for every fold
    # Train on everything outside this fold's test range
    for r in range(0, data_count):
        if r in range(test_range_s, test_range_e):
            continue
        row = rows[r]
        label = row[0]
        if label in feature_list[0]:
            feature_list[0][label] += 1
        else:
            feature_list[0][label] = 1
        for i in range(1, 23):
            label_feature = label + row[i]
            if label_feature in feature_list[i]:
                feature_list[i][label_feature] += 1
            else:
                feature_list[i][label_feature] = 1
        train_data_count[k_fold_i] += 1
    # Test on this fold's range
    for r in range(test_range_s, test_range_e):
        row = rows[r]
        p_edible = 0
        p_poisonous = 0
        for i in range(1, 23):
            if 'e'+row[i] in feature_list[i]:
                p_edible += math.log((feature_list[i]['e'+row[i]] + k) / (feature_list[0]['e'] + (k*feature_count[i])))
            elif k != 0:
                p_edible += math.log(1/feature_count[i])
            if 'p'+row[i] in feature_list[i]:
                p_poisonous += math.log((feature_list[i]['p'+row[i]] + k) / (feature_list[0]['p'] + (k*feature_count[i])))
            elif k != 0:
                p_poisonous += math.log(1/feature_count[i])
        p_edible += math.log(feature_list[0]['e'] / (feature_list[0]['e'] + feature_list[0]['p']))
        p_poisonous += math.log(feature_list[0]['p'] / (feature_list[0]['e'] + feature_list[0]['p']))
        if p_edible >= p_poisonous:  # Predict edible
            if row[0] == 'e':
                TE[k_fold_i] += 1
            elif row[0] == 'p':
                FP[k_fold_i] += 1
        else:  # Predict poisonous
            if row[0] == 'e':
                FE[k_fold_i] += 1
            elif row[0] == 'p':
                TP[k_fold_i] += 1
        test_data_count[k_fold_i] += 1

# Results: averages over the three folds
print("K-fold cross-validation")
print("--------------------------------------------------")
print("Confusion matrix")
print("                   Predict Edible   Predict Poisonous")
print("Actual Edible     ", statistics.mean(TE), " ", statistics.mean(FE))
print("Actual Poisonous  ", statistics.mean(FP), " ", statistics.mean(TP))
print("--------------------------------------------------")
print("Accuracy              ", (statistics.mean(TE)+statistics.mean(TP)) / (statistics.mean(TE)+statistics.mean(TP)+statistics.mean(FE)+statistics.mean(FP)))
print("Precision (Edible)    ", (statistics.mean(TE)) / (statistics.mean(TE)+statistics.mean(FE)))
print("Precision (Poisonous) ", (statistics.mean(TP)) / (statistics.mean(TP)+statistics.mean(FP)))
print("Recall (Edible)       ", (statistics.mean(TE)) / (statistics.mean(TE)+statistics.mean(FP)))
print("Recall (Poisonous)    ", (statistics.mean(TP)) / (statistics.mean(TP)+statistics.mean(FE)))
print("--------------------------------------------------")
print("data_count             ", data_count)
print("train_data_count (avg) ", statistics.mean(train_data_count))
print("test_data_count (avg)  ", statistics.mean(test_data_count))
print("Laplace k              ", k)

###
#                    | Predict    | Predict
#                    | Edible     | Poisonous
# -------------------+------------+-------------
#  Actual Edible     |     TE     |     FE
# -------------------+------------+-------------
#  Actual Poisonous  |     FP     |     TP
#
###
```
Iris dataset with Holdout validation
--

```python=
# Holdout validation
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import csv
import math
import random
import statistics

def GD(x, mu, sigma):
    # Gaussian (normal) probability density function
    N = 1/(sigma * np.sqrt(2 * np.pi)) * np.exp(-(x - mu)**2 / (2 * sigma**2))
    return N

with open('bezdekIris.data', newline='') as csvfile:
    rows = list(csv.reader(csvfile))
random.shuffle(rows)

# Per-class feature values: index 0 = setosa, 1 = versicolor, 2 = virginica
sl = [[], [], []]  # sepal length
sw = [[], [], []]  # sepal width
pl = [[], [], []]  # petal length
pw = [[], [], []]  # petal width
data_count = 0
train_data_count = 0
test_data_count = 0
setosa = 0
versicolor = 0
virginica = 0
# TS/TC/TG: correctly classified setosa/versicolor/virginica;
# the F** counters are misclassifications (see the matrix sketch at the end)
TS = 0
FSC = 0
FSG = 0
TG = 0
FGS = 0
FGC = 0
TC = 0
FCS = 0
FCG = 0

# Drop empty lines
for r in range(len(rows)-1, -1, -1):
    row = rows[r]
    if len(row) == 0:
        del rows[r]
data_count = len(rows)

random.shuffle(rows)
test_range_s = 0
test_range_e = test_range_s + int(data_count * 0.3)

# Train: collect per-class feature values and class counts
for r in range(0, data_count):
    if r in range(test_range_s, test_range_e):
        continue
    row = rows[r]
    iris = 0
    if row[4] == 'Iris-setosa':
        iris = 0
        setosa += 1
    elif row[4] == 'Iris-versicolor':
        iris = 1
        versicolor += 1
    elif row[4] == 'Iris-virginica':
        iris = 2
        virginica += 1
    sl[iris].append(float(row[0]))
    sw[iris].append(float(row[1]))
    pl[iris].append(float(row[2]))
    pw[iris].append(float(row[3]))
    train_data_count += 1

# Test: Gaussian log likelihoods plus log prior per class
for r in range(test_range_s, test_range_e):
    row = rows[r]
    p_setosa = 0
    p_versicolor = 0
    p_virginica = 0
    p_setosa += math.log(GD(float(row[0]), statistics.mean(sl[0]), statistics.stdev(sl[0])))
    p_setosa += math.log(GD(float(row[1]), statistics.mean(sw[0]), statistics.stdev(sw[0])))
    p_setosa += math.log(GD(float(row[2]), statistics.mean(pl[0]), statistics.stdev(pl[0])))
    p_setosa += math.log(GD(float(row[3]), statistics.mean(pw[0]), statistics.stdev(pw[0])))
    p_setosa += math.log(setosa / (setosa+versicolor+virginica))
    p_versicolor += math.log(GD(float(row[0]), statistics.mean(sl[1]), statistics.stdev(sl[1])))
    p_versicolor += math.log(GD(float(row[1]), statistics.mean(sw[1]), statistics.stdev(sw[1])))
    p_versicolor += math.log(GD(float(row[2]), statistics.mean(pl[1]), statistics.stdev(pl[1])))
    p_versicolor += math.log(GD(float(row[3]), statistics.mean(pw[1]), statistics.stdev(pw[1])))
    p_versicolor += math.log(versicolor / (setosa+versicolor+virginica))
    p_virginica += math.log(GD(float(row[0]), statistics.mean(sl[2]), statistics.stdev(sl[2])))
    p_virginica += math.log(GD(float(row[1]), statistics.mean(sw[2]), statistics.stdev(sw[2])))
    p_virginica += math.log(GD(float(row[2]), statistics.mean(pl[2]), statistics.stdev(pl[2])))
    p_virginica += math.log(GD(float(row[3]), statistics.mean(pw[2]), statistics.stdev(pw[2])))
    p_virginica += math.log(virginica / (setosa+versicolor+virginica))
    max_p = max(p_setosa, p_versicolor, p_virginica)
    if max_p == p_setosa:  # Predict setosa
        if row[4] == 'Iris-setosa':
            TS += 1
        elif row[4] == 'Iris-versicolor':
            FSC += 1
        elif row[4] == 'Iris-virginica':
            FSG += 1
    elif max_p == p_versicolor:  # Predict versicolor
        if row[4] == 'Iris-setosa':
            FCS += 1
        elif row[4] == 'Iris-versicolor':
            TC += 1
        elif row[4] == 'Iris-virginica':
            FCG += 1
    elif max_p == p_virginica:  # Predict virginica
        if row[4] == 'Iris-setosa':
            FGS += 1
        elif row[4] == 'Iris-versicolor':
            FGC += 1
        elif row[4] == 'Iris-virginica':
            TG += 1
    test_data_count += 1

# Results
print("Holdout validation")
print("-----------------------------------------------------------------------")
print("Confusion matrix")
print("                   Predict setosa   Predict versicolor   Predict virginica")
print("Actual setosa     ", TS, " ", FSC, " ", FSG)
print("Actual versicolor ", FCS, " ", TC, " ", FCG)
print("Actual virginica  ", FGS, " ", FGC, " ", TG)
print("-----------------------------------------------------------------------")
print("Accuracy               ", (TS+TC+TG) /
      (TS+TC+TG+FSC+FSG+FCS+FCG+FGS+FGC))
print("Precision (setosa)     ", (TS) / (TS+FSC+FSG))
print("Precision (versicolor) ", (TC) / (TC+FCS+FCG))
print("Precision (virginica)  ", (TG) / (TG+FGS+FGC))
print("Recall (setosa)        ", (TS) / (TS+FCS+FCG+FGS+FGC))
print("Recall (versicolor)    ", (TC) / (TC+FSC+FSG+FGS+FGC))
print("Recall (virginica)     ", (TG) / (TG+FSC+FSG+FCS+FCG))
print("-----------------------------------------------------------------------")
print("data_count       ", data_count)
print("train_data_count ", train_data_count)
print("test_data_count  ", test_data_count)

###
#                     | Predict    | Predict     | Predict
#                     | setosa     | versicolor  | virginica
# --------------------+------------+-------------+-------------
#  Actual setosa      |     TS     |     FSC     |     FSG
# --------------------+------------+-------------+-------------
#  Actual versicolor  |     FCS    |     TC      |     FCG
# --------------------+------------+-------------+-------------
#  Actual virginica   |     FGS    |     FGC     |     TG
#
###
```

Iris dataset with K-fold cross-validation
--

```python=
# K-fold cross-validation
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import csv
import math
import random
import statistics

def GD(x, mu, sigma):
    # Gaussian (normal) probability density function
    N = 1/(sigma * np.sqrt(2 * np.pi)) * np.exp(-(x - mu)**2 / (2 * sigma**2))
    return N

with open('bezdekIris.data', newline='') as csvfile:
    rows = list(csv.reader(csvfile))
random.shuffle(rows)

data_count = 0
# One counter per fold; see the matrix sketch at the end for the layout
train_data_count = [0, 0, 0]
test_data_count = [0, 0, 0]
setosa = [0, 0, 0]
versicolor = [0, 0, 0]
virginica = [0, 0, 0]
TS = [0, 0, 0]
FSC = [0, 0, 0]
FSG = [0, 0, 0]
TG = [0, 0, 0]
FGS = [0, 0, 0]
FGC = [0, 0, 0]
TC = [0, 0, 0]
FCS = [0, 0, 0]
FCG = [0, 0, 0]

# Drop empty lines
for r in range(len(rows)-1, -1, -1):
    row = rows[r]
    if len(row) == 0:
        del rows[r]
data_count = len(rows)

test_range_e = 0
k_fold_k = 3
for k_fold_i in range(k_fold_k):
    test_range_s = test_range_e
    test_range_e = test_range_s + int(data_count / k_fold_k)
    test_range_e = min(test_range_e, data_count)
    # Per-class feature values, collected anew for every fold
    sl = [[], [], []]
    sw = [[], [], []]
    pl = [[], [], []]
    pw = [[], [], []]
    # Train on everything outside this fold's test range
    for r in range(0, data_count):
        if r in range(test_range_s, test_range_e):
            continue
        row = rows[r]
        iris = 0
        if row[4] == 'Iris-setosa':
            iris = 0
            setosa[k_fold_i] += 1
        elif row[4] == 'Iris-versicolor':
            iris = 1
            versicolor[k_fold_i] += 1
        elif row[4] == 'Iris-virginica':
            iris = 2
            virginica[k_fold_i] += 1
        sl[iris].append(float(row[0]))
        sw[iris].append(float(row[1]))
        pl[iris].append(float(row[2]))
        pw[iris].append(float(row[3]))
        train_data_count[k_fold_i] += 1
    # Test on this fold's range
    for r in range(test_range_s, test_range_e):
        row = rows[r]
        p_setosa = 0
        p_versicolor = 0
        p_virginica = 0
        p_setosa += math.log(GD(float(row[0]), statistics.mean(sl[0]), statistics.stdev(sl[0])))
        p_setosa += math.log(GD(float(row[1]), statistics.mean(sw[0]), statistics.stdev(sw[0])))
        p_setosa += math.log(GD(float(row[2]), statistics.mean(pl[0]), statistics.stdev(pl[0])))
        p_setosa += math.log(GD(float(row[3]), statistics.mean(pw[0]), statistics.stdev(pw[0])))
        p_setosa += math.log(setosa[k_fold_i] / (setosa[k_fold_i]+versicolor[k_fold_i]+virginica[k_fold_i]))
        p_versicolor += math.log(GD(float(row[0]), statistics.mean(sl[1]), statistics.stdev(sl[1])))
        p_versicolor += math.log(GD(float(row[1]), statistics.mean(sw[1]), statistics.stdev(sw[1])))
        p_versicolor += math.log(GD(float(row[2]), statistics.mean(pl[1]), statistics.stdev(pl[1])))
        p_versicolor += math.log(GD(float(row[3]), statistics.mean(pw[1]), statistics.stdev(pw[1])))
        p_versicolor += math.log(versicolor[k_fold_i] /
              (setosa[k_fold_i]+versicolor[k_fold_i]+virginica[k_fold_i]))
        p_virginica += math.log(GD(float(row[0]), statistics.mean(sl[2]), statistics.stdev(sl[2])))
        p_virginica += math.log(GD(float(row[1]), statistics.mean(sw[2]), statistics.stdev(sw[2])))
        p_virginica += math.log(GD(float(row[2]), statistics.mean(pl[2]), statistics.stdev(pl[2])))
        p_virginica += math.log(GD(float(row[3]), statistics.mean(pw[2]), statistics.stdev(pw[2])))
        p_virginica += math.log(virginica[k_fold_i] / (setosa[k_fold_i]+versicolor[k_fold_i]+virginica[k_fold_i]))
        max_p = max(p_setosa, p_versicolor, p_virginica)
        if max_p == p_setosa:  # Predict setosa
            if row[4] == 'Iris-setosa':
                TS[k_fold_i] += 1
            elif row[4] == 'Iris-versicolor':
                FSC[k_fold_i] += 1
            elif row[4] == 'Iris-virginica':
                FSG[k_fold_i] += 1
        elif max_p == p_versicolor:  # Predict versicolor
            if row[4] == 'Iris-setosa':
                FCS[k_fold_i] += 1
            elif row[4] == 'Iris-versicolor':
                TC[k_fold_i] += 1
            elif row[4] == 'Iris-virginica':
                FCG[k_fold_i] += 1
        elif max_p == p_virginica:  # Predict virginica
            if row[4] == 'Iris-setosa':
                FGS[k_fold_i] += 1
            elif row[4] == 'Iris-versicolor':
                FGC[k_fold_i] += 1
            elif row[4] == 'Iris-virginica':
                TG[k_fold_i] += 1
        test_data_count[k_fold_i] += 1

# Results: averages over the three folds
print("K-fold cross-validation")
print("-----------------------------------------------------------------------")
print("Confusion matrix")
print("                   Predict setosa   Predict versicolor   Predict virginica")
print("Actual setosa     ", statistics.mean(TS), " ", statistics.mean(FSC), " ", statistics.mean(FSG))
print("Actual versicolor ", statistics.mean(FCS), " ", statistics.mean(TC), " ", statistics.mean(FCG))
print("Actual virginica  ", statistics.mean(FGS), " ", statistics.mean(FGC), " ", statistics.mean(TG))
print("-----------------------------------------------------------------------")
print("Accuracy               ", (statistics.mean(TS)+statistics.mean(TC)+statistics.mean(TG)) / (statistics.mean(TS)+statistics.mean(TC)+statistics.mean(TG)+statistics.mean(FSC)+statistics.mean(FSG)+statistics.mean(FCS)+statistics.mean(FCG)+statistics.mean(FGS)+statistics.mean(FGC)))
print("Precision (setosa)     ", statistics.mean(TS) / (statistics.mean(TS)+statistics.mean(FSC)+statistics.mean(FSG)))
print("Precision (versicolor) ", statistics.mean(TC) / (statistics.mean(TC)+statistics.mean(FCS)+statistics.mean(FCG)))
print("Precision (virginica)  ", statistics.mean(TG) / (statistics.mean(TG)+statistics.mean(FGS)+statistics.mean(FGC)))
print("Recall (setosa)        ", statistics.mean(TS) / (statistics.mean(TS)+statistics.mean(FCS)+statistics.mean(FCG)+statistics.mean(FGS)+statistics.mean(FGC)))
print("Recall (versicolor)    ", statistics.mean(TC) / (statistics.mean(TC)+statistics.mean(FSC)+statistics.mean(FSG)+statistics.mean(FGS)+statistics.mean(FGC)))
print("Recall (virginica)     ", statistics.mean(TG) / (statistics.mean(TG)+statistics.mean(FSC)+statistics.mean(FSG)+statistics.mean(FCS)+statistics.mean(FCG)))
print("-----------------------------------------------------------------------")
print("data_count             ", data_count)
print("train_data_count (avg) ", statistics.mean(train_data_count))
print("test_data_count (avg)  ", statistics.mean(test_data_count))

###
#                     | Predict    | Predict     | Predict
#                     | setosa     | versicolor  | virginica
# --------------------+------------+-------------+-------------
#  Actual setosa      |     TS     |     FSC     |     FSG
# --------------------+------------+-------------+-------------
#  Actual versicolor  |     FCS    |     TC      |     FCG
# --------------------+------------+-------------+-------------
#  Actual virginica   |     FGS    |     FGC     |     TG
#
###
```

###### tags: `NCTU` `ML` `Naïve Bayes`