# End-to-end Learning of Deterministic Decision Trees
## Reference
- Qiita article (Japanese): https://qiita.com/itok_msi/items/b8b8b0e915533ece430a
- Paper: https://arxiv.org/pdf/1712.02743.pdf
## Outline
[This Qiita article](https://qiita.com/itok_msi/items/b8b8b0e915533ece430a) (in Japanese) explains the method clearly, so refer to it for the outline.

## Worked Example
For example, suppose that for some input $x$, evaluating the split probabilities $\sigma(f(x))$ gives the following tree: the root routes left with probability 0.9 and right with 0.1; the left child splits 0.2 / 0.8, and its 0.8 branch splits once more into 0.9 / 0.1; the right child splits 0.3 / 0.7. The leaves carry the class distributions $\pi_1, \dots, \pi_5$ from left to right.

Each leaf's contribution to the predicted distribution, in order from the leftmost node, is
$$
\begin{aligned}
&0.9\times0.2\times\pi_1 \\
&0.9\times0.8\times0.9\times\pi_2 \\
&0.9\times0.8\times0.1\times\pi_3 \\
&0.1\times0.3\times\pi_4 \\
&0.1\times0.7\times\pi_5
\end{aligned}
$$
Summing all of these gives
$$
\pi = 0.18\pi_1 + 0.648\pi_2 + 0.072\pi_3 + 0.03\pi_4 + 0.07\pi_5.
$$
The coefficients sum to 1, so $\pi$ is itself a probability distribution over the classes. Since this is the network's output, the model can be trained with cross-entropy or a similar loss.
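As a quick sanity check of this arithmetic, the mixture can be computed directly. A minimal NumPy sketch (the two-class leaf distributions below are made-up values, not from the example):

```python
import numpy as np

# path probability of each leaf, from the example above
path_prob = np.array([0.9 * 0.2, 0.9 * 0.8 * 0.9, 0.9 * 0.8 * 0.1, 0.1 * 0.3, 0.1 * 0.7])
assert np.isclose(path_prob.sum(), 1.0)  # the paths partition the tree, so they sum to 1

# hypothetical two-class distributions at the five leaves (each row sums to 1)
pi = np.array([[0.7, 0.3], [0.1, 0.9], [0.5, 0.5], [0.8, 0.2], [0.2, 0.8]])

# predicted distribution: path-probability-weighted mixture of the leaf distributions
pred = path_prob @ pi
assert np.isclose(pred.sum(), 1.0)  # the mixture is again a distribution
print(pred)
```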
I'm not sure how the regression case works (but presumably each leaf would hold a single value treated as a mean instead of a distribution, and training would use mean squared error or the like; a sketch of that idea follows).
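A minimal sketch of that regression idea, purely my assumption rather than anything from the paper: each leaf stores one learnable scalar, and the prediction is the path-probability-weighted sum of the leaf values.

```python
import torch
from torch import nn

class RegressionLeaves(nn.Module):
    """Hypothetical regression variant: one learnable scalar per leaf."""
    def __init__(self, n_leaves: int):
        super().__init__()
        self.values = nn.Parameter(torch.zeros(n_leaves))

    def forward(self, path_prob: torch.Tensor) -> torch.Tensor:
        # prediction = sum over leaves of (path probability * leaf value)
        return path_prob @ self.values  # shape: (batch,)

# usage: swap this in for the classification leaves and train with nn.MSELoss()
```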
## Implementation
Since I don't really understand the EM algorithm, I implemented the training with plain gradient descent (a self-contained toy example appears after the full script below).
I compared it with RandomForest on the Titanic data:

- acc nn: 0.752542372881356
- acc rf: 0.7932203389830509

Hmm. Accuracy aside, it seems to be working properly.
The implementation follows.
requirements.txt
```
absl-py==0.12.0
alembic==1.5.7
attrs==20.3.0
cachetools==4.2.1
certifi==2020.12.5
chardet==4.0.0
cliff==3.7.0
cmaes==0.8.2
cmd2==1.5.0
colorama==0.4.4
colorlog==4.7.2
cycler==0.10.0
google-auth==1.28.0
google-auth-oauthlib==0.4.3
greenlet==1.0.0
grpcio==1.36.1
idna==2.10
importlib-metadata==3.7.3
joblib==1.0.1
kiwisolver==1.3.1
kkutils @ git+https://github.com/kazukingh01/kkpackages.git@c13d205f14a4243cfc257daef39518ea235dab4b#subdirectory=kkutils/
Mako==1.1.4
Markdown==3.3.4
MarkupSafe==1.1.1
matplotlib==3.3.4
more-itertools==8.7.0
numpy==1.19.5
oauthlib==3.1.0
optuna==2.6.0
packaging==20.9
pandas==1.2.3
pbr==5.5.1
Pillow==8.1.2
prettytable==2.1.0
protobuf==3.15.6
pyasn1==0.4.8
pyasn1-modules==0.2.8
pyparsing==2.4.7
pyperclip==1.8.2
python-dateutil==2.8.1
python-editor==1.0.4
pytz==2021.1
PyYAML==5.4.1
requests==2.25.1
requests-oauthlib==1.3.0
rsa==4.7.2
scikit-learn==0.24.1
scipy==1.6.1
six==1.15.0
sklearn==0.0
SQLAlchemy==1.4.1
stevedore==3.3.0
tensorboard==2.4.1
tensorboard-plugin-wit==1.8.0
threadpoolctl==2.1.0
torch==1.7.1
torchvision==0.8.2
tqdm==4.59.0
typing-extensions==3.7.4.3
urllib3==1.26.4
wcwidth==0.2.5
Werkzeug==1.0.1
zipp==3.4.1
```
nn_tree.py
```python
import numpy as np
import pandas as pd
import torch
from torch import nn
# local package
from kkutils.lib.nn.model import BaseNN
from kkutils.lib.ml.base_model import MyModel
import kkutils.lib.ml.procs as kkpr
from kkutils.util.ml.utils import split_data_balance
from kkutils.util.torch.funcs import accuracy

class LeafNodes(nn.Module):
    def __init__(self, n_node: int, n_classes: int):
        super().__init__()
        # nn.Linear is used only as a parameter holder for the leaf-distribution logits
        self.dist_prob = nn.Linear(n_node, n_classes, bias=False)
        # softmax over classes, so each leaf holds a probability distribution over classes
        self.softmax = nn.Softmax(dim=1)

    def forward(self, _input):
        # prob[i, j]: probability of class j at leaf i
        prob = self.softmax(self.dist_prob.weight.T)
        # mix the leaf distributions, weighting each by its path probability in _input
        output = [(_input * prob[:, i]).sum(axis=1) for i in range(prob.shape[-1])]
        output = torch.cat([x.reshape(-1, 1) for x in output], axis=1)
        return output

class NNModule(nn.Module):
    def __init__(self, n_features: int):
        super().__init__()
        self.mod = nn.Linear(n_features, 1)
        self.sig = nn.Sigmoid()

    def forward(self, _input):
        output = self.mod(_input)
        output = self.sig(output)
        return output

class NNDecisionTree(nn.Module):
    def __init__(self, nn_func, n_classes: int, depth: int):
        """
        Params::
            nn_func: function. nn_func() -> nn.Module
            n_classes: number of classes.
            depth: tree depth.
        """
        super().__init__()
        self.n_classes = n_classes
        self.depth = depth
        self.list_nn = []
        # a full binary tree of depth d has 2^d - 1 decision nodes and 2^d leaves
        for i in range(2 ** depth - 1):
            self.add_module(str(i), nn_func())  # without add_module, .to() and .parameters() become a hassle
            self.list_nn.append(self.__getattr__(str(i)))
        self.add_module("leaf", LeafNodes(2 ** depth, n_classes))
        self.leaf_nodes = self.__getattr__("leaf")

    def forward(self, _input: torch.Tensor):
        output = _input.clone()
        list_output = []
        # compute the split probability at every decision node
        for mod in self.list_nn:
            _output = mod(output)
            list_output.append(
                torch.cat([_output, 1 - _output], axis=1).reshape(-1, 2, 1)
            )
        output = torch.cat(list_output, axis=2)  # shape: (batch, 2, n_decision_nodes)
        # compute the probability of the path to each leaf node
        list_output = []
        for i in range(2 ** self.depth):
            index = 2 ** self.depth - 1 + i  # heap id of the i-th leaf
            a, b = self.path_to_node(index)
            list_output.append(output[:, :, a][:, b].prod(axis=1).reshape(-1, 1))
        output = torch.cat(list_output, axis=1)  # shape: (batch, n_leaves)
        output = self.leaf_nodes(output)
        return output

    @classmethod
    def path_to_node(cls, node_id: int):
        """
        With root node id = 0 and ids assigned right-to-left within each level,
        the node ids and branch directions on the path to each leaf node (7 - 14)
        of a depth-3 tree are:
            0R 1R 3R 7
            0R 1R 3L 8
            0R 1L 4R 9
            0R 1L 4L 10
            0L 2R 5R 11
            0L 2R 5L 12
            0L 2L 6R 13
            0L 2L 6L 14
        """
        depth = int(np.log2(node_id + 1))
        # walk up from the leaf, collecting the ancestor node ids from the root down
        list_index = [int((node_id - 1) / 2)]
        for _ in range(depth - 1):
            list_index.insert(0, int((list_index[0] - 1) / 2))
        # boolean mask per step: [1, 0] picks the sigmoid output ("R"), [0, 1] its complement ("L")
        list_LR = []
        a = node_id + 1
        for _ in range(depth):
            list_LR.insert(0, [1, 0] if a % 2 == 0 else [0, 1])
            a = a // 2
        return list_index, torch.Tensor(list_LR).T.to(torch.bool)
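
    # e.g. path_to_node(7), the leftmost leaf of the depth-3 tree above, returns
    # ancestor ids [0, 1, 3] and a (2, 3) bool mask that selects index 0 (the
    # sigmoid output, i.e. branch "R") at all three steps.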

def pre_processing(df: pd.DataFrame):
    df = df.copy()
    # keep only the deck letter (the first character) of the Cabin column
    df["Cabin_x"] = np.nan
    df.loc[~(df["Cabin"].isna()), "Cabin_x"] = df.loc[~(df["Cabin"].isna()), "Cabin"].str[0].copy().values
    df = df.drop(columns=["Name", "PassengerId", "Cabin", "Ticket"])
    return df

if __name__ == "__main__":
    # data load
    df_learn = pd.read_csv("./input/train.csv")
    df_test  = pd.read_csv("./input/test.csv")
    # data pre-processing
    df_learn = pre_processing(df_learn)
    df_test  = pre_processing(df_test)
    # mymodel
    colname_answer = "Survived"
    mymodel = MyModel(
        "titanic",
        colname_explain=df_learn.columns[df_learn.columns != colname_answer].tolist(),
        colname_answer=[colname_answer],
    )
    mymodel.preproc.register(
        [
            kkpr.MyOneHotAuto(n_unique_first=30, n_unique_second=30),
            kkpr.MyAsType(np.float32),
            kkpr.MyReplaceValue(float("inf"),  float("nan")),
            kkpr.MyReplaceValue(float("-inf"), float("nan")),
            kkpr.MyStandardScaler(),
            kkpr.MyFillNa("mean"),
        ], type_proc="x"
    )
    mymodel.preproc.register(
        [
            kkpr.MyAsType(np.int64),
            kkpr.MyReshape(-1),
        ], type_proc="y"
    )
    mymodel.preproc.register(
        [
            kkpr.MyConvStrAuto(),
            kkpr.MyDropNa(mymodel.colname_answer),
        ], type_proc="row"
    )
    # cross validation
    df_pred = []
    indexes_train, indexes_valid = split_data_balance(df_learn[colname_answer].values, n_splits=5)
    for index_train, index_valid in zip(indexes_train, indexes_valid):
        df_train = df_learn.iloc[index_train].copy()
        df_valid = df_learn.iloc[index_valid].copy()
        # preproc fit
        mymodel.preproc.fit(df_train)
        x_train, y_train = mymodel.preproc(df_train, autofix=True)
        x_valid, y_valid = mymodel.preproc(df_valid, autofix=True)
        # nn model
        def nn_func() -> nn.Module: return NNModule(x_train.shape[1])
        mod = NNDecisionTree(nn_func, 2, 4)
        trainer = BaseNN(
            mod, mtype="cls",
            loss_funcs=[nn.CrossEntropyLoss()],
            loss_funcs_valid=[[nn.CrossEntropyLoss(), accuracy]],
            optimizer=torch.optim.SGD, optim_params={"lr": 1.0},
            valid_step=2, early_stopping_i_valid=1, early_stopping_rounds=100,
            epoch=10000,
        )
        trainer.to_cuda()
        # fit
        trainer.fit(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
        trainer.load(is_best=True)
        # predict
        _pred, _label = trainer.predict(_x=x_valid, _y=y_valid)
        dfwk = pd.DataFrame(_pred, columns=["pred_nn"])
        dfwk["answer"] = _label
        print(f'acc nn: {(dfwk["pred_nn"] == dfwk["answer"]).sum() / dfwk.shape[0]}')
        # random forest
        from sklearn.ensemble import RandomForestClassifier
        rf = RandomForestClassifier()
        rf.fit(x_train, y_train)
        dfwk["pred_rf"] = rf.predict(x_valid)
        print(f'acc rf: {(dfwk["pred_rf"] == dfwk["answer"]).sum() / dfwk.shape[0]}')
        df_pred.append(dfwk.copy())
    df_pred = pd.concat(df_pred, axis=0, ignore_index=True, sort=False)
    print(f'acc nn: {(df_pred["pred_nn"] == df_pred["answer"]).sum() / df_pred.shape[0]}')
    print(f'acc rf: {(df_pred["pred_rf"] == df_pred["answer"]).sum() / df_pred.shape[0]}')
```
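The script depends on the kkutils helper package. For a dependency-free check of the gradient-descent training itself, here is a minimal self-contained sketch on random toy data, reusing the NNModule and NNDecisionTree classes from nn_tree.py above (the toy data, learning rate, and epoch count are arbitrary assumptions of mine, not values from the experiment):

```python
import torch
from torch import nn

# NNModule and NNDecisionTree are the classes defined in nn_tree.py above
torch.manual_seed(0)
n_features, n_classes, depth = 8, 2, 3

# random toy data with a made-up linearly separable label
x = torch.randn(256, n_features)
y = (x[:, 0] + x[:, 1] > 0).to(torch.int64)

model = NNDecisionTree(lambda: NNModule(n_features), n_classes, depth)
optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
loss_func = nn.CrossEntropyLoss()  # same loss as in the experiment above

for epoch in range(200):
    optimizer.zero_grad()
    output = model(x)  # (256, n_classes) mixture of the leaf distributions
    loss = loss_func(output, y)
    loss.backward()
    optimizer.step()

with torch.no_grad():
    pred = model(x).argmax(dim=1)
print(f"toy accuracy: {(pred == y).float().mean().item():.3f}")
```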