# ML - decision trees
###### tags: `machine learning`

```
#!/usr/bin/env python3
import mglearn
import matplotlib.pyplot as plt

mglearn.plots.plot_animal_tree()
plt.show()

'''
plot_animal_tree() needs graphviz; install it into the conda env first:
    conda install graphviz python-graphviz
See https://github.com/amueller/introduction_to_ml_with_python/issues/69

Otherwise it fails like this:

(machine-learning) ycheng@nuc:~/machine-learning$ ./decision_trees.py
Traceback (most recent call last):
  File "./decision_trees.py", line 12, in <module>
    mglearn.plots.plot_animal_tree()
  File "/home/ycheng/anaconda3/envs/machine-learning/lib/python3.8/site-packages/mglearn/plot_animal_tree.py", line 6, in plot_animal_tree
    import graphviz
ModuleNotFoundError: No module named 'graphviz'
'''
```

#### output
![](https://i.imgur.com/1MPj86w.png)

### Decision tree on the Breast cancer data set

```
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)

# without pre-pruning: the tree is grown until every leaf is pure,
# so it memorizes the training set
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
print("[ without pre-pruning (Unpruned tree) ]")
print("- Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("- Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

# with pre-pruning: limiting the depth to 4 reduces overfitting and
# improves test accuracy
tree = DecisionTreeClassifier(max_depth=4, random_state=0)
tree.fit(X_train, y_train)
print("[ with pre-pruning ]")
print("- Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("- Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

# Analyzing decision trees: export the tree to .dot and render it
from sklearn.tree import export_graphviz
export_graphviz(tree, out_file="tree.dot",
                class_names=["malignant", "benign"],
                feature_names=cancer.feature_names,
                impurity=False, filled=True)

import graphviz
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph).view()

# Feature importance in trees
print("- Feature importances:\n{}".format(tree.feature_importances_))

import numpy as np
import matplotlib.pyplot as plt

def plot_feature_importances_cancer(model):
    n_features = cancer.data.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), cancer.feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()

plot_feature_importances_cancer(tree)
```

#### output
```
[ without pre-pruning (Unpruned tree) ]
- Accuracy on training set: 1.000
- Accuracy on test set: 0.937
[ with pre-pruning ]
- Accuracy on training set: 0.988
- Accuracy on test set: 0.951
```

![](https://i.imgur.com/r2nG7BP.png)

```
- Feature importances:
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.01019737 0.04839825
 0.         0.         0.0024156  0.         0.         0.
 0.         0.         0.72682851 0.0458159  0.         0.
 0.0141577  0.         0.018188   0.1221132  0.01188548 0.        ]
```

![](https://i.imgur.com/OfD4le1.png)

### Two-dimensional data set

```
import matplotlib.pyplot as plt
from IPython.display import display

print("[ Two-dimensional data set ]")
tree = mglearn.plots.plot_tree_not_monotone()
display(tree)  # does not display the image when run as a script
plt.show()     # this does show the image
plt.close()
```

#### output
```
[ Two-dimensional data set ]
Feature importances: [0. 1.]
```

![](https://i.imgur.com/VdfwJRm.png)
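The `Feature importances: [0. 1.]` output above means the tree bases its decisions entirely on the second feature. Below is a minimal, self-contained sketch of the same point, not the `mglearn` helper itself (the toy dataset is an assumption chosen for illustration): a feature can receive all of the importance even though its relationship with the class is non-monotone, and an importance of 0 does not mean a feature carries no information in general.

```
# Sketch only: assumed toy data, not what plot_tree_not_monotone uses.
import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X = rng.uniform(-3, 3, size=(200, 2))
# the class depends only on feature 1, and non-monotonically:
# class 1 for intermediate values, class 0 at both extremes
y = ((X[:, 1] > -1) & (X[:, 1] < 1)).astype(int)

clf = DecisionTreeClassifier(random_state=0).fit(X, y)
print("Feature importances:", clf.feature_importances_)  # roughly [0. 1.]
```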
### DecisionTreeRegressor on the RAM price data set

```
'''
download ram_price.csv from
https://github.com/amueller/introduction_to_ml_with_python/blob/master/data/ram_price.csv
'''

#### show data
import pandas as pd
ram_prices = pd.read_csv("data/ram_price.csv")

plt.semilogy(ram_prices.date, ram_prices.price)
plt.xlabel("Year")
plt.ylabel("Price in $/Mbyte")
plt.show()

#### Comparison of predictions by linear model and DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

# use historical data to forecast prices after the year 2000
data_train = ram_prices[ram_prices.date < 2000]
data_test = ram_prices[ram_prices.date >= 2000]

# predict prices based on date; reshape the date column into the
# (n_samples, 1) shape scikit-learn expects (indexing a Series with
# np.newaxis no longer works in recent pandas)
X_train = data_train.date.to_numpy()[:, np.newaxis]
# we use a log-transform to get a simpler relationship of data to target
y_train = np.log(data_train.price)

tree = DecisionTreeRegressor().fit(X_train, y_train)
linear_reg = LinearRegression().fit(X_train, y_train)

# predict on all data
X_all = ram_prices.date.to_numpy()[:, np.newaxis]
pred_tree = tree.predict(X_all)
pred_lr = linear_reg.predict(X_all)

# undo log-transform
price_tree = np.exp(pred_tree)
price_lr = np.exp(pred_lr)

plt.semilogy(data_train.date, data_train.price, label="Training data")
plt.semilogy(data_test.date, data_test.price, label="Test data")
plt.semilogy(ram_prices.date, price_tree, label="Tree prediction")
plt.semilogy(ram_prices.date, price_lr, label="Linear prediction")
plt.legend()
plt.show()
plt.close()

'''
The goal of this example is not to show that decision trees are a bad
model for time-based data. It is to illustrate a special characteristic
of how trees predict: they cannot extrapolate, so outside the range of
the training data the prediction stays constant (see the sketch at the
end of this note).
'''
```

#### output
![](https://i.imgur.com/gXlikFY.png)
![](https://i.imgur.com/lIJrRbX.png)
![](https://i.imgur.com/Cquvna8.png)
![](https://i.imgur.com/CUJIqz3.png)

### Random forest and gradient boosted trees on the Breast cancer data set

```
[ Random forest on Breast cancer data set ]
- Accuracy on training set: 1.000
- Accuracy on test set: 0.972
[ Gradient boosted regression trees on Breast cancer data set ]
- Accuracy on training set: 1.000
- Accuracy on test set: 0.965
[ Gradient boosted regression trees on Breast cancer data set, max_depth=1 ]
Accuracy on training set: 0.991
Accuracy on test set: 0.972
[ Gradient boosted regression trees on Breast cancer data set, learning_rate=0.01 ]
Accuracy on training set: 0.988
Accuracy on test set: 0.965
```

![](https://i.imgur.com/fhymdc7.png)

!!! Note: this feature-importance plot differs considerably from the original one in the book.
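The accuracies printed above for random forests and gradient boosted trees appear without the code that produced them. The following is a hedged reconstruction that follows the pattern of the earlier breast cancer example; the train/test split and the parameter values (`n_estimators=100`, `max_depth=1`, `learning_rate=0.01`) are assumptions read off the printed labels, so the exact numbers may not reproduce.

```
# Hedged reconstruction of the missing code; parameters are assumptions.
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

def report(title, model):
    # small helper (not in the original note) to avoid repeating boilerplate
    model.fit(X_train, y_train)
    print("[ {} ]".format(title))
    print("- Accuracy on training set: {:.3f}".format(model.score(X_train, y_train)))
    print("- Accuracy on test set: {:.3f}".format(model.score(X_test, y_test)))

report("Random forest on Breast cancer data set",
       RandomForestClassifier(n_estimators=100, random_state=0))
report("Gradient boosted regression trees on Breast cancer data set",
       GradientBoostingClassifier(random_state=0))
# depth-1 trees (decision stumps) are weaker learners and overfit less
report("Gradient boosted regression trees on Breast cancer data set, max_depth=1",
       GradientBoostingClassifier(random_state=0, max_depth=1))
# a lower learning rate shrinks each tree's contribution
report("Gradient boosted regression trees on Breast cancer data set, learning_rate=0.01",
       GradientBoostingClassifier(random_state=0, learning_rate=0.01))
```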
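As mentioned at the end of the RAM price example, here is a minimal sketch of the tree's inability to extrapolate, on made-up data (the quadratic target is an assumption, chosen only to make the flattening obvious):

```
# Sketch only: assumed toy data, not the RAM price series.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

X_train = np.arange(0, 10, 0.5)[:, np.newaxis]  # training range: [0, 9.5]
y_train = X_train.ravel() ** 2                  # steadily increasing target

tree = DecisionTreeRegressor().fit(X_train, y_train)

# every point at or beyond the edge of the training range falls into the
# same outermost leaf, so the prediction flattens instead of growing
print(tree.predict(np.array([[9.5], [10.0], [15.0], [100.0]])))
# -> [90.25 90.25 90.25 90.25]
```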