# Data Mining project 2

###### tags: `Datamining`

> ### Goal
> Understand what classification systems do and the
> difference between the real behavior of a classification
> model and the observed data
> ### Description
> Construct a classification model to observe the
> difference between the real ‘right’ data and the modeled data

---

# Step 1 - rules

I fabricated this data from scratch. Only 4 positive and 4 negative rules are defined, because the attributes allow far too many combinations, and the rules must not conflict with each other. After generating the data, I noticed that the classes are not balanced, because my generation method simply enumerates every possible combination of attribute values permitted by each rule; a minimal sketch of this expansion follows the rule table below.

|education|famlily|children|salary|country|luck|healthy|drugs|gambling|intelligence|become|
|-|-|-|-|-|-|-|-|-|-|-|
|bachelor,master,doctoral|rich|0,1,2,3,4|none,low,middle,high|poor,war,under develop,developed|average,good|not good,average,good|No|No|medicore,smart|rich|
|bachelor,master,doctoral|middle|0,1|high|under develop,developed|bad,average,good|not good,average,good|No|No|medicore,smart|rich|
|associate,bachelor,master|none|0|middle|under develop,developed|bad,average,good|average,good|No|No|medicore,smart|rich|
|associate,bachelor,master,doctoral|rich|0,1,2,3,4|none,low,middle,high|poor,war,under develop,developed|good|dying|No|No|medicore,smart|rich|
|below,associate,bachelor,master,doctoral|rich|0,1,2,3,4|none,low,middle,high|poor,war,under develop,developed|bad,average|not good,average|Yes|No|stupid,medicore,smart|poor|
|below,associate,bachelor,master,doctoral|none,poor,middle,rich|0,1,2,3,4|none,low,middle,high|poor,war,under develop,developed|bad,average|dying|No|No|medicore,smart|poor|
|bachelor,master,doctoral|none,poor,middle|1,2,3,4|none,low,middle|under develop,developed|bad,average,good|average,good|any|any|stupid,medicore,smart|poor|
|below,associate,bachelor|middle|0,1,2,3,4|none,low|under develop,developed|bad,average,good|average,good|No|No|medicore,smart|poor|
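To make the generation method concrete, here is a minimal sketch (a simplified stand-in for the real code in the Source Code section; the two-attribute `rule` is hypothetical) of how one rule row, stored as per-attribute value indices or inclusive index ranges, expands into every matching sample:

```python
# Simplified sketch of the rule expansion used to generate the data.
# A rule row stores, per attribute, either one value index or an
# inclusive (low, high) range of value indices.
from itertools import product

# Hypothetical two-attribute rule:
# luck in {bad=0, average=1, good=2}, drugs in {No=0, Yes=1}
rule = [(1, 2), 0]  # luck: average..good, drugs: No

# Expand each entry into the list of allowed value indices.
choices = [range(i[0], i[1] + 1) if isinstance(i, tuple) else [i]
           for i in rule]

# Every combination of allowed values becomes one generated sample.
samples = list(product(*choices))
print(samples)  # [(1, 0), (2, 0)]
```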
---

# Step 2 - my model

### Decision tree

```graphviz
digraph Tree {
node [shape=box, style="filled", color="black"] ;
0 [label="famlily:none,poor,middle|rich,\ngini = 0.213\nsamples = 100.0%\nvalue = [0.879, 0.121]\nclass = poor", fillcolor="#e99254"] ;
1 [label="salary:none,low,middle|high,\ngini = 0.028\nsamples = 68.7%\nvalue = [0.986, 0.014]\nclass = poor", fillcolor="#e5833c"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="children:0|1,2,3,4,\ngini = 0.008\nsamples = 64.2%\nvalue = [0.996, 0.004]\nclass = poor", fillcolor="#e5813a"] ;
1 -> 2 ;
3 [label="healthy:dying|not good,average,good,\ngini = 0.151\nsamples = 3.0%\nvalue = [0.918, 0.082]\nclass = poor", fillcolor="#e78c4b"] ;
2 -> 3 ;
4 [label="gini = 0.0\nsamples = 2.3%\nvalue = [1.0, 0.0]\nclass = poor", fillcolor="#e58139"] ;
3 -> 4 ;
5 [label="famlily:none,poor,middle,rich,\ngini = 0.46\nsamples = 0.7%\nvalue = [0.642, 0.358]\nclass = poor", fillcolor="#f4c7a8"] ;
3 -> 5 ;
6 [label="gini = 0.0\nsamples = 0.2%\nvalue = [0.0, 1.0]\nclass = rich", fillcolor="#399de5"] ;
5 -> 6 ;
7 [label="gini = 0.0\nsamples = 0.4%\nvalue = [1.0, 0.0]\nclass = poor", fillcolor="#e58139"] ;
5 -> 7 ;
8 [label="gini = 0.0\nsamples = 61.2%\nvalue = [1.0, 0.0]\nclass = poor", fillcolor="#e58139"] ;
2 -> 8 ;
9 [label="healthy:dying|not good,average,good,\ngini = 0.266\nsamples = 4.5%\nvalue = [0.842, 0.158]\nclass = poor", fillcolor="#ea995e"] ;
1 -> 9 ;
10 [label="gini = 0.0\nsamples = 3.8%\nvalue = [1.0, 0.0]\nclass = poor", fillcolor="#e58139"] ;
9 -> 10 ;
11 [label="gini = 0.0\nsamples = 0.7%\nvalue = [0.0, 1.0]\nclass = rich", fillcolor="#399de5"] ;
9 -> 11 ;
12 [label="drugs:No|Yes,\ngini = 0.458\nsamples = 31.3%\nvalue = [0.645, 0.355]\nclass = poor", fillcolor="#f3c6a6"] ;
0 -> 12 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
13 [label="healthy:dying|not good,average,good,\ngini = 0.43\nsamples = 16.2%\nvalue = [0.313, 0.687]\nclass = rich", fillcolor="#93caf1"] ;
12 -> 13 ;
14 [label="luck:bad,average|good,\ngini = 0.41\nsamples = 7.1%\nvalue = [0.713, 0.287]\nclass = poor", fillcolor="#efb489"] ;
13 -> 14 ;
15 [label="gini = 0.0\nsamples = 5.1%\nvalue = [1.0, 0.0]\nclass = poor", fillcolor="#e58139"] ;
14 -> 15 ;
16 [label="gini = 0.0\nsamples = 2.0%\nvalue = [0.0, 1.0]\nclass = rich", fillcolor="#399de5"] ;
14 -> 16 ;
17 [label="gini = 0.0\nsamples = 9.1%\nvalue = [0.0, 1.0]\nclass = rich", fillcolor="#399de5"] ;
13 -> 17 ;
18 [label="gini = 0.0\nsamples = 15.1%\nvalue = [1.0, 0.0]\nclass = poor", fillcolor="#e58139"] ;
12 -> 18 ;
}
```

---

### Decision Tree with limited depth of 2

```graphviz
digraph Tree {
node [shape=box, style="filled", color="black"] ;
0 [label="famlily:none,poor,middle|rich,\ngini = 0.213\nsamples = 100.0%\nvalue = [0.879, 0.121]\nclass = poor", fillcolor="#e99254"] ;
1 [label="salary:none,low,middle|high,\ngini = 0.028\nsamples = 68.7%\nvalue = [0.986, 0.014]\nclass = poor", fillcolor="#e5833c"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="gini = 0.008\nsamples = 64.2%\nvalue = [0.996, 0.004]\nclass = poor", fillcolor="#e5813a"] ;
1 -> 2 ;
3 [label="gini = 0.266\nsamples = 4.5%\nvalue = [0.842, 0.158]\nclass = poor", fillcolor="#ea995e"] ;
1 -> 3 ;
4 [label="drugs:No|Yes,\ngini = 0.458\nsamples = 31.3%\nvalue = [0.645, 0.355]\nclass = poor", fillcolor="#f3c6a6"] ;
0 -> 4 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
5 [label="gini = 0.43\nsamples = 16.2%\nvalue = [0.313, 0.687]\nclass = rich", fillcolor="#93caf1"] ;
4 -> 5 ;
6 [label="gini = 0.0\nsamples = 15.1%\nvalue = [1.0, 0.0]\nclass = poor", fillcolor="#e58139"] ;
4 -> 6 ;
}
```

---

### MLP

$Output = \mathrm{ReLU}(Input_{\in \mathbb{R}^{10}} \times Wh_{\in \mathbb{R}^{10\times 3}} + Bh_{\in \mathbb{R}^{3}}) \times Wo_{\in \mathbb{R}^{3\times 1}} + Bo_{\in \mathbb{R}^{1}}$

```python
Wh=[[ 3.30698137e-02  9.93231857e-02 -1.87975755e-02]
    [ 2.38276586e+00  5.09336557e-01  7.40708291e-01]
    [ 4.83137138e-01  1.40158061e+00 -1.28560480e-01]
    [ 1.58102934e-01 -7.30234335e-01  3.88539077e-01]
    [-1.98011518e-01 -1.06428674e-01 -7.82481460e-02]
    [ 3.16568383e-03  1.08757128e-02 -1.86999401e+00]
    [-2.90751450e-01 -2.49305709e-01 -4.47896904e+00]
    [-3.13915207e+00  2.60989828e+00  2.26104008e+00]
    [-5.68377620e-01  2.01979476e+00 -5.08353326e-73]
    [ 1.39452531e-02 -1.03861545e-01  1.51969778e-01]]
Bh=[[ 6.72713614]
    [-3.19173265]
    [-8.44997175]]
Wo=[-3.73062445  2.17992212  1.84955227]
Bo=[1.03009322]
```
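For reference, this forward pass can be reproduced in plain NumPy. The following is a minimal sketch, assuming `Wh`, `Bh`, `Wo`, and `Bo` are re-entered as NumPy arrays with `Bh` flattened to shape `(3,)`; note that `MLPClassifier` additionally passes the output through a logistic sigmoid before thresholding, which the formula above leaves implicit:

```python
import numpy as np

def mlp_forward(x, Wh, Bh, Wo, Bo):
    """x: (n_samples, 10) integer-encoded attribute rows; returns 1 = rich, 0 = poor."""
    h = np.maximum(0, x @ Wh + Bh)   # hidden layer with ReLU, shape (n, 3)
    logit = h @ Wo + Bo              # output layer, shape (n,)
    prob = 1 / (1 + np.exp(-logit))  # logistic output unit used by MLPClassifier
    return (prob > 0.5).astype(int)  # equivalent to logit > 0
```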
---

|Model|train accuracy|test accuracy|parameters|
|-|-|-|-|
|Decision Tree (depth 5)|1.0|1.0|18|
|Decision Tree (depth 2)|0.9397|0.9396|6|
|MLP|1.0|1.0|37|

---

# Step 3 - Compare

The full-depth tree and the MLP achieve perfect classification, but the models still make some unexpected decisions along the way. Take the depth-2 decision tree as an example: the **healthy** attribute is ignored entirely, even though the original rules include one stating that dying people will most likely not become rich.

---

# Step 4 - Discuss

The reason behind the unexpected results can be put simply as a lack of training samples, or explained more generally: learning algorithms extract statistics from the data but ignore human preferences and the importance humans attach to specific decisions. In other words, any bias that cannot be obtained from the data itself will most likely be ignored by the model.
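One way to verify which attributes a fitted tree actually consults (e.g. that **healthy** is absent from the depth-2 tree) is to print its Gini feature importances. A minimal sketch, assuming the `model` and `cols` objects defined in the source code below:

```python
# Attributes the tree never splits on come out with importance 0.0.
for (name, _values), importance in zip(cols, model.feature_importances_):
    print(f'{name}: {importance:.3f}')
```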
---

# Source Code

The code contains the following:

* define rules
* generate and save rules
* generate and save data
* train model
* plot result and save as png
* save graphviz format decision tree

### requirements

* python 3
* sklearn
* matplotlib

```python
# In[] rules
# Each attribute maps to its ordered list of possible values; the rules
# below refer to values by index into these lists.
cols=[
    ('education',['below','associate','bachelor','master','doctoral']), # 0-4
    ('famlily',['none','poor','middle','rich']),                        # 0-3
    ('children',list(map(str,[0,1,2,3,4]))),                            # 0-4
    ('salary',['none','low','middle','high']),                          # 0-3
    ('country',['poor','war','under develop','developed']),             # 0-3
    ('luck',['bad','average','good']),                                  # 0-2
    ('healthy',['dying','not good','average','good']),                  # 0-3
    ('drugs',['No','Yes']),                                             # 0,1
    ('gambling',['No','Yes']),                                          # 0,1
    ('intelligence',['stupid','medicore','smart']),                     # 0-2
]
# A rule is one entry per attribute: either a single value index or an
# inclusive (low, high) index range.
#educa,famil,child,salar,count, luck,helth, drug,gambl,   iq
#    0,    1,    2,    3,    4,    5,    6,    7,    8,    9
rich=[
    [(2,4),    3,(0,4),(0,3),(0,3),(1,2),(1,3),    0,    0,(1,2)],
    [(2,4),    2,(0,1),    3,(2,3),(0,2),(1,3),    0,    0,(1,2)],
    [(1,3),    0,    0,    2,(2,3),(0,2),(2,3),    0,    0,(1,2)],
    [(1,4),    3,(0,4),(0,3),(0,3),    2,    0,    0,    0,(1,2)],
]
poor=[
    [(0,4),    3,(0,4),(0,3),(0,3),(0,1),(1,2),    1,    0,(0,2)],
    [(0,4),(0,3),(0,4),(0,3),(0,3),(0,1),    0,    0,    0,(1,2)],
    [(2,4),(0,2),(1,4),(0,2),(2,3),(0,2),(2,3),(0,1),(0,1),(0,2)],
    [(0,2),    2,(0,4),(0,1),(2,3),(0,2),(2,3),    0,    0,(1,2)],
]

# In[] generate data
# cvt: expand one rule row into per-attribute lists of allowed value indices.
cvt = lambda x : [range(i[0],i[1]+1) if isinstance(i,tuple) else [i] for i in x]
# r2t: render a row of value indices/ranges as human-readable value names
# ('any' when a range covers every value of the attribute).
r2t = lambda x : [(','.join([cols[idx][1][l] for l in range(i[0],i[1]+1)])
                   if len(i) != len(cols[idx][1]) else 'any')
                  if isinstance(i,tuple) else cols[idx][1][i]
                  for i, idx in zip(x,range(len(x)))]

from itertools import product
def gen(p, n):
    # Expand every positive rule (p) and negative rule (n) into all concrete
    # attribute combinations; label positives 1 and negatives 0.
    g = lambda x:[j for i in map(lambda i : list(product(*cvt(i))), x) for j in i]
    p = g(p)
    n = g(n)
    return p+n, [1 for _ in range(len(p))] + [0 for _ in range(len(n))]

# Save the rules themselves as a markdown table (rule.md).
open('rule.md','w').write(
    '|' + '|'.join(map(lambda x:x[0],cols)) +'|label|\n'+
    '|-'*(len(cols)+1)+'|\n'+
    '\n'.join(map(lambda x:'|'+'|'.join(x)+'|rich|', map(r2t, rich))) + '\n'+
    '\n'.join(map(lambda x:'|'+'|'.join(x)+'|poor|', map(r2t, poor)))
)
x, y = gen(rich,poor)

# In[] train model
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## In[] find subset
# Optional brute-force search for a random seed whose train/test split yields
# perfect test accuracy; flip the flag to rerun it.
if False:
    from tqdm import tqdm
    for rds in tqdm(range(0, 30000)):
        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=150, random_state=rds)
        model = DecisionTreeClassifier(max_depth=5, random_state=87)
        model.fit(x_train, y_train)
        if accuracy_score(y_test, model.predict(x_test)) == 1.0:
            print(rds)
            break

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=612)
# Save the training samples in readable form, sorted by label (data.txt).
open('data.txt','w').write('\n'.join(map(
    lambda i: '{}{}'.format(['poor:','rich:'][i[1]], r2t(i[0])),
    sorted(zip(x_train,y_train), key=lambda k:k[1]))))

# Flip the flag to switch between the decision tree and the MLP.
if True:
    model = DecisionTreeClassifier(max_depth=5, random_state=87)
    model.fit(x_train, y_train)
else:
    from sklearn.neural_network import MLPClassifier
    import numpy as np
    model = MLPClassifier(hidden_layer_sizes=(3,), random_state=5)
    model.fit(x_train, y_train)
    # Print the learned weights and biases (Wh, Wo, Bh, Bo).
    print('\n'.join(map(str, map(np.array, model.coefs_ + model.intercepts_))))

# In[] accuracy
pred = model.predict(x_train)
acc = accuracy_score(y_train, pred)
print(f'acc: {acc}')
pred = model.predict(x_test)
acc = accuracy_score(y_test, pred)
print(f'test acc: {acc}')

# In[] plot
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10,10), dpi=160)
plot_tree(model, ax=ax,
          feature_names=list(map(lambda x:x[0], cols)),
          class_names=['poor','rich'],
          proportion=True,
          filled=True)
plt.tight_layout()
plt.savefig('model.png')
plt.close()

# In[] graphviz
from sklearn.tree import export_graphviz
gviz = export_graphviz(model,
                       feature_names=list(map(lambda x:x[0], cols)),
                       class_names=['poor','rich'],
                       proportion=True,
                       filled=True)

import re
def replace_with_name(g):
    # Rewrite numeric split labels like 'famlily <= 2.5' into categorical
    # ones like 'famlily:none,poor,middle|rich', where '|' marks the split.
    pr = []
    mp = dict(cols)
    for i in re.finditer(r'\d+ \[label="(\S+) <= (\S+)\\n', g):
        ltxt = f'label="{i.group(1)}:'
        n = float(i.group(2))
        ls = mp[i.group(1)]
        for li in range(len(ls)):
            if (n < li) and (li < n+1):
                # First value index above the threshold: insert the separator.
                ltxt = ltxt[:-1] + f'|{ls[li]},'
            else:
                ltxt += f'{ls[li]},'
        pr.append((f'label="{i.group(1)} <= {i.group(2)}', ltxt))
    ret = f'{g}'
    for p, r in pr:
        ret = ret.replace(p, r)
    return ret

g = replace_with_name(gviz)
open('model.graphviz','w').write(g)
# import graphviz
# g = graphviz.Source(g)
# g.render()
```
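As a possible follow-up, `export_text` (imported in the training cell above but unused) prints the fitted tree as indented plain-text rules, which makes comparing it against the hand-written rules in `rule.md` easier:

```python
from sklearn.tree import export_text

# Dump the fitted decision tree as indented text rules
# (split thresholds stay numeric here, e.g. 'famlily <= 2.5').
print(export_text(model, feature_names=list(map(lambda x: x[0], cols))))
```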