# Define-By-Run

[Hyperband Slides](https://docs.google.com/presentation/d/1IKLkol21DR0tQnoYxtre4Lt2Q50zdJYxgmhaZtasPN8/edit?usp=sharing)

## tf.function

```python
W = tf.Variable(tf.glorot_uniform_initializer()((10, 10)))
b = tf.Variable(tf.zeros(10))
c = tf.Variable(0)
x = tf.placeholder(tf.float32)

ctr = c.assign_add(1)
with tf.control_dependencies([ctr]):
    y = tf.matmul(x, W) + b
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    print(sess.run(y, feed_dict={x: make_input_value()}))
    assert int(sess.run(c)) == 1
```

```python
W = tf.Variable(tf.glorot_uniform_initializer()((10, 10)))
b = tf.Variable(tf.zeros(10))
c = tf.Variable(0)

@tf.function
def f(x):
    c.assign_add(1)
    return tf.matmul(x, W) + b

print(f(make_input_value()))
assert int(c) == 1
```

## Exporting/Importing Models

### Checkpoint

```python
W = tf.get_variable("weights", shape=[10, 10])
train_op = W.assign_add(1.)
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(W.initializer)
    sess.run(train_op)
    saver.save(sess, "/tmp/checkpoint/")

with tf.Session() as sess:
    saver.restore(sess, "/tmp/checkpoint/")
    sess.run(train_op)
```

```python
W = tf.Variable(tf.glorot_uniform_initializer()((10, 10)))

@tf.function
def train():
    W.assign_add(1.)

train()
ckpt = tf.train.Checkpoint(W=W)
save_path = ckpt.save("/tmp/checkpoint")  # returns the actual prefix, e.g. /tmp/checkpoint-1
ckpt.restore(save_path)
```

### GraphDefs

```python
W = tf.get_variable("weights", shape=[10, 10])
x = tf.placeholder(tf.float32, shape=(None, 10))
y = tf.matmul(x, W)

graph = tf.get_default_graph()
graph_def = graph.as_graph_def()
with open("/tmp/graph.pb", "wb") as f:
    f.write(graph_def.SerializeToString())

tf.reset_default_graph()
graph_def = tf.GraphDef()
with open("/tmp/graph.pb", "rb") as f:
    graph_def.ParseFromString(f.read())
tf.import_graph_def(graph_def)
```

```python
W = tf.Variable(tf.glorot_uniform_initializer()((10, 10)))

@tf.function
def f(x):
    return tf.matmul(x, W)

graph = f.get_concrete_function(
    tf.TensorSpec((None, 10), tf.float32)).graph
graph_def = graph.as_graph_def()
with open("/tmp/graph.pb", "wb") as out:
    out.write(graph_def.SerializeToString())
```

### SavedModels

```python
def save_model():
    W = tf.get_variable("weights", shape=[10, 10])
    x = tf.placeholder(tf.float32, shape=(None, 10))
    y = tf.matmul(x, W)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf.saved_model.simple_save(
            sess, "/tmp/model", inputs={"x": x}, outputs={"y": y})

def load_model():
    sess = tf.Session()
    with sess.as_default():
        meta_graph = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], "/tmp/model")
    # simple_save registers its tensors under the "serving_default" signature.
    sig = meta_graph.signature_def["serving_default"]
    return sig.inputs, sig.outputs, sess
```

```python
class Model(tf.Module):
    def __init__(self):
        self.W = tf.Variable(...)

    @tf.function
    def f(self, x):
        return tf.matmul(x, self.W)

m = Model()
tf.saved_model.save(m, "/tmp/model")
m = tf.saved_model.load("/tmp/model")
```

## Optuna vs Hyperopt

[Optuna white paper](https://arxiv.org/pdf/1907.10902)

### Original

```python
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

def learn_mnist(layers):
    mnist = fetch_openml('mnist_784')
    (x_train, x_test,
     y_train, y_test) = train_test_split(mnist.data, mnist.target)
    clf = MLPClassifier(tuple(layers))
    clf.fit(x_train, y_train)
    return 1.0 - clf.score(x_test, y_test)
```

```python
import optuna

def objective(trial):
    n_layers = trial.suggest_int('n_layers', 1, 4)
    layers = []
    for i in range(n_layers):
        layers.append(trial.suggest_int('n_units_l{}'.format(i), 1, 128))
    return learn_mnist(layers)

study = optuna.create_study()
study.optimize(objective, n_trials=100)
```

```python
from hyperopt import hp, fmin, tpe

space = {
    'n_units_l1': hp.randint('n_units_l1', 128),
    'l2': hp.choice('l2', [{
        'has_l2': True,
        'n_units_l2': hp.randint('n_units_l2', 128),
        'l3': hp.choice('l3', [{
            'has_l3': True,
            'n_units_l3': hp.randint('n_units_l3', 128),
            'l4': hp.choice('l4', [{
                'has_l4': True,
                'n_units_l4': hp.randint('n_units_l4', 128),
            }, {'has_l4': False}]),
        }, {'has_l3': False}]),
    }, {'has_l2': False}]),
}

def objective(space):
    layers = [space['n_units_l1'] + 1]
    for i in range(2, 5):
        space = space['l{}'.format(i)]
        if not space['has_l{}'.format(i)]:
            break
        layers.append(space['n_units_l{}'.format(i)] + 1)
    return learn_mnist(layers)

fmin(fn=objective, space=space, max_evals=100, algo=tpe.suggest)
```

### Optuna

```python
import optuna

def objective(trial):
    n_layers = trial.suggest_int('n_layers', 1, 4)
    layers = [
        trial.suggest_int(f'n_units_l{i}', 1, 128)
        for i in range(n_layers)
    ]
    print(layers)
    return 0

study = optuna.create_study()
study.optimize(objective, n_trials=100)
```

### Hyperopt

```python
from hyperopt import hp, fmin, tpe
import sys

space = {
    f'n_units_l{i}': hp.randint(f'n_units_l{i}', 128) + 1
    for i in range(4)
}
space['n_layers'] = hp.randint('n_layers', 4) + 1

def objective(space):
    n_layers = space['n_layers']
    layers = [
        space[f'n_units_l{i}']
        for i in range(n_layers)
    ]
    print(layers, file=sys.stderr)
    return 0

fmin(fn=objective, space=space, max_evals=100, algo=tpe.suggest)
```

```python
from hyperopt import hp, fmin, tpe
import sys

def build(n, i=0):
    if i == n:
        return {f'has_l{i}': False}
    s = {
        f'has_l{i}': True,
        f'n_units_l{i}': hp.randint(f'n_units_l{i}', 128) + 1,
        f'l{i + 1}': hp.choice(f'l{i + 1}', [
            {f'has_l{i + 1}': False},
            build(n, i + 1),
        ]),
    }
    return s

space = build(4)

def objective(space):
    layers = []
    i = 0
    while space[f'has_l{i}']:
        layers.append(space[f'n_units_l{i}'])
        i = i + 1
        space = space[f'l{i}']
    print(layers, file=sys.stderr)
    return 0

fmin(fn=objective, space=space, max_evals=100, algo=tpe.suggest)
```

### Histogram

```bash
# Count the commas on each printed `layers` line (layer count minus one)
# and histogram the counts per results file.
for f in *.txt ; do
    echo $f
    tr -cd ',\n' < $f | awk '{ print length; }' | distribution --char='|' | sort
done
```

```
param-optuna.txt
0|17 (17.00%) ||||||||||||||||||||||||||||||
1|37 (37.00%) ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
2|23 (23.00%) |||||||||||||||||||||||||||||||||||||||||
3|23 (23.00%) |||||||||||||||||||||||||||||||||||||||||
param-hyperopt-new.txt
0|25 (25.00%) ||||||||||||||||||||||||||||||||||||||||||||||
1|36 (36.00%) ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
2|24 (24.00%) ||||||||||||||||||||||||||||||||||||||||||||
3|15 (15.00%) ||||||||||||||||||||||||||||
param-hyperopt-old.txt
0|61 (61.00%) ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
1|25 (25.00%) |||||||||||||||||||||||||||
2| 6 (6.00%) |||||||
3| 8 (8.00%) |||||||||
```

## Appendix

### Hyperband

$$
\gamma(x) = \max_{n=1,\dots,N} \sup_{t \geq x} \left| \ell_n(t) - \nu_n \right|
$$

$$
\gamma^{-1}(y) = \inf\{x \mid \gamma(x) \leq y\}
$$

$$
\nu_n = \lim_{t\to\infty} \ell_n(t) \quad\text{are iid r.v.s with cdf $F$}
$$

$$
\nu_* = \min_{n=1,\dots,N} \nu_n
$$

$$
H(F,\gamma,n,\delta,\epsilon) = 2n\int_{\nu_* + \epsilon/4}^\infty \gamma^{-1}\left( \frac{t-\nu_*}{4} \right) dF(t) + \left( \frac{4}{3}\log\frac{2}{\delta} + 2n F\left( \nu_* + \frac{\epsilon}{4} \right) \right) \gamma^{-1}\left(\frac{\epsilon}{16}\right)
$$

where $\delta\in(0,1)$, $p_n = \log(2/\delta)/n$, $\epsilon/4 \geq F^{-1}(p_n) - \nu_*$, and

$$
H(F,\gamma,n,\delta) = H(F,\gamma,n,\delta,4(F^{-1}(p_n) - \nu_*))
$$
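
Roughly, $H$ measures the budget successive halving needs before the envelope $\gamma$ shrinks enough to separate near-optimal arms from the rest. For intuition, here is a minimal sketch of the successive-halving loop that Hyperband runs inside each bracket; the `get_loss(config, budget)` callback, the bracket parameters, and the toy example are hypothetical illustrations, not APIs from the slides or paper.

```python
import math

def successive_halving(configs, get_loss, min_budget=1, eta=3):
    """Keep the best 1/eta of configs at each rung, giving survivors eta x budget."""
    budget = min_budget
    while len(configs) > 1:
        # Measure the intermediate losses l_n(t) at the current budget t.
        losses = [get_loss(c, budget) for c in configs]
        k = max(1, len(configs) // eta)
        configs = [c for _, c in
                   sorted(zip(losses, configs), key=lambda p: p[0])[:k]]
        budget *= eta
    return configs[0]

# Toy run: nine learning rates whose losses decay like 1/t toward their
# limits nu_n = |log10(lr) + 3|, so the best arm is lr = 1e-3.
best = successive_halving(
    [{'lr': 10.0 ** -i} for i in range(9)],
    lambda c, t: abs(math.log10(c['lr']) + 3) + 1.0 / t)
```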
### Accelerated Gradient Descent

Let $f$ be convex and $L$-smooth. Then,

$$
f(x_t) - f(x_*) \leq \frac{2L\|x_1 - x_*\|^2}{t^2}
$$

where

$$
x_{t+1} = y_t - \frac{\nabla f(y_t)}{L} ,\, y_{t+1} = (1-\gamma_t)x_{t+1} + \gamma_t x_t ,\, \gamma_t = \frac{1-\lambda_t}{\lambda_{t+1}} ,\, \lambda_{t+1} = \frac{1 + \sqrt{1+4\lambda_t^2}}{2}
$$

with $\lambda_0 = 0$ and $x_1 = y_1$ the starting point.
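
To make the recursion concrete, here is a minimal NumPy sketch; the quadratic objective, the value of $L$, the starting point, and the step count are made-up illustrations, not from the slides.

```python
import math
import numpy as np

def nesterov_agd(grad, L, x1, steps):
    x = y = x1
    lam = 1.0  # lambda_1 = (1 + sqrt(1 + 4 * lambda_0^2)) / 2 = 1, since lambda_0 = 0
    for _ in range(steps):
        x_next = y - grad(y) / L                          # x_{t+1} = y_t - grad f(y_t) / L
        lam_next = (1 + math.sqrt(1 + 4 * lam ** 2)) / 2  # lambda_{t+1}
        gamma = (1 - lam) / lam_next                      # gamma_t = (1 - lambda_t) / lambda_{t+1}
        y = (1 - gamma) * x_next + gamma * x              # y_{t+1}
        x, lam = x_next, lam_next
    return x

# f(x) = 0.5 * x^T A x is convex and L-smooth with L = max eigenvalue of A.
A = np.diag([1.0, 10.0])
print(nesterov_agd(lambda v: A @ v, L=10.0, x1=np.ones(2), steps=100))
```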