# ML - decision trees
###### tags: `machine learning`
```
#!/usr/bin/env python3
# Draw the mglearn animal-tree example plot and show it in a matplotlib window.
import mglearn
import matplotlib.pyplot as plt

mglearn.plots.plot_animal_tree()
plt.show()

'''
Troubleshooting: plot_animal_tree() imports graphviz. If the module is missing,
install it with

    conda install graphviz python-graphviz

See https://github.com/amueller/introduction_to_ml_with_python/issues/69

Error seen when graphviz is not installed:

(machine-learning) ycheng@nuc:~/machine-learning$ ./decision_trees.py
Traceback (most recent call last):
  File "./decision_trees.py", line 12, in <module>
    mglearn.plots.plot_animal_tree()
  File "/home/ycheng/anaconda3/envs/machine-learning/lib/python3.8/site-packages/mglearn/plot_animal_tree.py", line 6, in plot_animal_tree
    import graphviz
ModuleNotFoundError: No module named 'graphviz'
'''
```
#### output

### Decision tree on the breast cancer data set
```
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the data and hold out a test split stratified on the target.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42
)

# Fit the classifier twice: first with no depth limit, then pre-pruned to
# max_depth=4. Each entry is (report banner, extra constructor arguments).
settings = (
    ("[ without pre-pruning (Unpruned tree) ]", {}),
    ("[ with pre-pruning ]", {"max_depth": 4}),
)
for banner, extra_params in settings:
    tree = DecisionTreeClassifier(random_state=0, **extra_params)
    tree.fit(X_train, y_train)
    print(banner)
    print(f"- Accuracy on training set: {tree.score(X_train, y_train):.3f}")
    print(f"- Accuracy on test set: {tree.score(X_test, y_test):.3f}")
# After the loop `tree` is the pre-pruned (max_depth=4) model.
# Analyzing decision trees
from sklearn.tree import export_graphviz

# Write the fitted tree to tree.dot in Graphviz DOT format.
# class_names supplies the labels shown in the nodes; the first label was
# previously misspelled as "maligant".
export_graphviz(
    tree,
    out_file="tree.dot",
    class_names=["malignant", "benign"],
    feature_names=cancer.feature_names,
    impurity=False,
    filled=True,
)

import graphviz

# Read the DOT source back in and hand it to graphviz for rendering/viewing.
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph).view()
# "Feature importance in trees"
print("- Feature importances:\n{}".format(tree.feature_importances_))
import numpy as np
import matplotlib.pyplot as plt


def plot_feature_importances_cancer(model):
    """Show a horizontal bar chart of ``model.feature_importances_``.

    Draws one bar per column of the module-level ``cancer.data`` and labels
    the y axis with ``cancer.feature_names``, then displays the figure.

    Args:
        model: a fitted estimator exposing ``feature_importances_``.
    """
    n_features = cancer.data.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), cancer.feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    # Keep the long feature names from being clipped at the figure edge.
    plt.tight_layout()
    plt.show()


plot_feature_importances_cancer(tree)
```
#### output
```
[ without pre-pruning (Unpruned tree) ]
- Accuracy on training set: 1.000
- Accuracy on test set: 0.937
[ with pre-pruning ]
- Accuracy on training set: 0.988
- Accuracy on test set: 0.951
```

```
- Feature importances:
[0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0.01019737 0.04839825
0. 0. 0.0024156 0. 0. 0.
0. 0. 0.72682851 0.0458159 0. 0.
0.0141577 0. 0.018188 0.1221132 0.01188548 0. ]
```

### Two-dimensional data set
```
# Plot the mglearn "tree not monotone" example.
# NOTE: `mglearn` is not imported in this snippet; it relies on the
# `import mglearn` from the first snippet in these notes.
import matplotlib.pyplot as plt
from IPython.display import display
print("[ Two-dimensional data set ]")
# The "Feature importances: ..." line in the recorded output is presumably
# printed by this mglearn call, since this snippet prints nothing else -- TODO confirm.
tree = mglearn.plots.plot_tree_not_monotone()
display(tree) # author's note: this does not display the tree image here
plt.show() # author's note: this does show the matplotlib figure
plt.close()
```
#### output
```
[ Two-dimensional data set ]
Feature importances: [0. 1.]
```

### DecisionTreeRegressor on the RAM price data set
```
'''
download ram_price.csv
https://github.com/amueller/introduction_to_ml_with_python/blob/master/data/ram_price.csv
'''
# NOTE: `np` and `plt` are not imported in this snippet; it relies on the
# numpy / matplotlib imports from the earlier snippets in these notes.

#### show data
import pandas as pd

ram_prices = pd.read_csv("data/ram_price.csv")
# Log-scaled y axis for the price-over-time curve.
plt.semilogy(ram_prices.date, ram_prices.price)
plt.xlabel("Year")
plt.ylabel("Price in $/Mbyte")
plt.show()

#### Comparison of predictions by linear model and DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

# use historical data to forecast prices after the year 2000
data_train = ram_prices[ram_prices.date < 2000]
data_test = ram_prices[ram_prices.date >= 2000]

# predict prices based on date
# Convert the Series to a NumPy array before adding the feature axis:
# pandas 2.x no longer supports multi-dimensional indexing such as
# series[:, np.newaxis] directly on a Series.
X_train = data_train.date.to_numpy()[:, np.newaxis]
# we use a log-transform to get a simpler relationship of data to target
y_train = np.log(data_train.price)

tree = DecisionTreeRegressor().fit(X_train, y_train)
linear_reg = LinearRegression().fit(X_train, y_train)

# predict on all data
X_all = ram_prices.date.to_numpy()[:, np.newaxis]
pred_tree = tree.predict(X_all)
pred_lr = linear_reg.predict(X_all)

# undo log-transform
price_tree = np.exp(pred_tree)
price_lr = np.exp(pred_lr)

plt.semilogy(data_train.date, data_train.price, label="Training data")
plt.semilogy(data_test.date, data_test.price, label="Test data")
plt.semilogy(ram_prices.date, price_tree, label="Tree prediction")
plt.semilogy(ram_prices.date, price_lr, label="Linear prediction")
plt.legend()
plt.show()
plt.close()
'''
The goal of this example is not to show that a decision tree is a bad model
for time-based data. It is to illustrate a special characteristic of how a
decision tree makes its predictions.
'''
```
#### output




```
[ Random forest on Breast cancer data set ]
- Accuracy on training set: 1.000
- Accuracy on test set: 0.972
[ Gradient boosted regression trees on Breast cancer data set ]
- Accuracy on training set: 1.000
- Accuracy on test set: 0.965
[ Gradient boosted regression trees on Breast cancer data set, max_depth=1 ]
Accuracy on training set: 0.991
Accuracy on test set: 0.972
[ Gradient boosted regression trees on Breast cancer data set, learning_rate=0.01 ]
Accuracy on training set: 0.988
Accuracy on test set: 0.965
```

!!! These feature importances differ considerably from the original values in the book.