Define-By-Run

tf.function

# TF 1.x graph mode: the graph is built first, then executed in a Session.
W = tf.Variable(tf.glorot_uniform_initializer()((10, 10)))
b = tf.Variable(tf.zeros(10))
c = tf.Variable(0)

x = tf.placeholder(tf.float32)
ctr = c.assign_add(1)
# Ops created inside this scope run only after `ctr`, so every evaluation of
# `y` also increments the counter `c`.
with tf.control_dependencies([ctr]):
    y = tf.matmul(x, W) + b
# Built outside the control-dependency scope: the initializer must not depend
# on `ctr`, which would touch `c` before it has been initialized.
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    # NOTE(review): make_input_value() is not defined in this snippet.
    print(sess.run(y, feed_dict={x: make_input_value()}))
    assert int(sess.run(c)) == 1

# TF 2 style: the computation is an ordinary Python function wrapped by
# tf.function; no placeholder or Session is involved.
W = tf.Variable(tf.glorot_uniform_initializer()((10, 10)))
b = tf.Variable(tf.zeros(10))
c = tf.Variable(0)


@tf.function
def f(x):
    """Increment the counter `c`, then return `matmul(x, W) + b`."""
    c.assign_add(1)
    return tf.matmul(x, W) + b


# NOTE(review): make_input_value() is not defined in this snippet.
print(f(make_input_value()))
assert int(c) == 1

Exporting/Importing Models

Checkpoint

# TF 1.x checkpointing: a variable named "weights" with shape [10, 10].
W = tf.get_variable("weights", shape=[10, 10])

# Op that adds 1.0 to W each time it is run.
train_op = W.assign_add(1.)
saver = tf.train.Saver()

# First session: initialize W, run one update, then save.
with tf.Session() as sess:
    sess.run(W.initializer)
    sess.run(train_op)
    saver.save(sess, "/tmp/checkpoint/")

# Second session: no initializer is run here; restore is called before the
# next update instead.
with tf.Session() as sess:
    saver.restore(sess, "/tmp/checkpoint/")
    sess.run(train_op)

W = tf.Variable(tf.glorot_uniform_initializer()((10, 10)))


@tf.function
def train():
    """Add 1.0 to every element of `W`."""
    W.assign_add(1.)


train()
ckpt = tf.train.Checkpoint(W=W)
# save() treats its argument as a filename prefix and returns the full path
# of the checkpoint it wrote, so restore from that returned path rather than
# from the bare prefix.
save_path = ckpt.save("/tmp/checkpoint")
ckpt.restore(save_path)

GraphDefs

# TF 1.x: build a small graph, serialize its GraphDef to disk, then load it
# back into a fresh default graph.
W = tf.get_variable("weights", shape=[10, 10])
x = tf.placeholder(tf.float32, shape=(None, 10))
y = tf.matmul(x, W)

graph = tf.get_default_graph()
graph_def = graph.as_graph_def()
# SerializeToString() returns bytes, so the file must be opened in binary mode.
with open("/tmp/graph.pb", "wb") as f:
    f.write(graph_def.SerializeToString())

tf.reset_default_graph()

graph_def = tf.GraphDef()
# Read back the same binary file that was written above; ParseFromString()
# expects the serialized bytes, not the text (.pbtxt) format.
with open("/tmp/graph.pb", "rb") as f:
    graph_def.ParseFromString(f.read())

tf.import_graph_def(graph_def)

W = tf.Variable(tf.glorot_uniform_initializer()((10, 10)))


@tf.function
def f(x):
    """Return the matrix product of `x` and `W`."""
    return tf.matmul(x, W)


# Ask for the graph function for a float32 input of shape (None, 10) and take
# its underlying graph.
# NOTE(review): `graph_function` is the name used by this snippet; confirm it
# against the TensorFlow version in use.
graph = f.graph_function((tf.float32, (None, 10))).graph
graph_def = graph.as_graph_def()

# Binary mode because SerializeToString() returns bytes; the handle is not
# called `f` so that the function `f` above is not shadowed.
with open("/tmp/graph.pb", "wb") as out:
    out.write(graph_def.SerializeToString())

SavedModels

def save_model():
    """Build a one-matmul graph and save it to "/tmp/model".

    The saved model takes the placeholder under the input key "x" and exposes
    the matmul result under the output key "y".
    """
    weights = tf.get_variable("weights", shape=[10, 10])
    features = tf.placeholder(tf.float32, shape=(None, 10))
    product = tf.matmul(features, weights)

    with tf.Session() as sess:
        # Variables must be initialized before the model is saved.
        sess.run(tf.global_variables_initializer())
        tf.saved_model.simple_save(
            sess,
            "/tmp/model",
            inputs={"x": features},
            outputs={"y": product},
        )

def load_model():
    """Load the model stored at "/tmp/model" into a new session.

    Returns:
        A tuple `(inputs, outputs, sess)`. The session is left open, so the
        caller is responsible for closing it.
    """
    session = tf.Session()
    with session.as_default():
        loaded = tf.saved_model.simple_load(session, "/tmp/model")
        inputs, outputs = loaded
    return inputs, outputs, session

class Model(tf.train.Checkpointable):
    """Object holding one weight variable and a tf.function method using it."""

    def __init__(self):
        # `...` stands in for the variable's initial value in this example.
        self.W = tf.Variable(...)

    @tf.function
    def f(self, x):
        """Return the matrix product of `x` and `self.W`."""
        product = tf.matmul(x, self.W)
        return product

m = Model()
# `import` is a reserved word in Python, so `tf.saved_model.import(...)` cannot
# be parsed. Use the save/load pair from the released TensorFlow 2 API instead;
# note that `save` takes the object first and the export directory second.
tf.saved_model.save(m, "/tmp/model")
m = tf.saved_model.load("/tmp/model")

Optuna vs Hyperopt

Optuna white paper

Original

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

def learn_mnist(layers):
    """Fit an MLP on MNIST and return one minus its score on the held-out split.

    Args:
        layers: Iterable of hidden-layer sizes; converted to a tuple and passed
            as the first argument of `MLPClassifier`.

    Returns:
        `1.0 - clf.score(x_test, y_test)`, so lower is better.
    """
    dataset = fetch_openml('mnist_784')
    split = train_test_split(dataset.data, dataset.target)
    x_train, x_test, y_train, y_test = split
    hidden_sizes = tuple(layers)
    model = MLPClassifier(hidden_sizes)
    model.fit(x_train, y_train)
    held_out_score = model.score(x_test, y_test)
    return 1.0 - held_out_score

import optuna

def objective(trial):
    """Sample a layer count and per-layer widths, then train with them.

    Asks `trial` for `n_layers` in [1, 4] and then for one `n_units_l<i>` in
    [1, 128] per layer, and returns `learn_mnist` of the resulting list.
    """
    n_layers = trial.suggest_int('n_layers', 1, 4)
    layers = [
        trial.suggest_int('n_units_l{}'.format(idx), 1, 128)
        for idx in range(n_layers)
    ]
    return learn_mnist(layers)

# Create a study with no explicit options and run `objective` for 100 trials.
study = optuna.create_study()
study.optimize(objective, n_trials=100)

from hyperopt import hp, fmin, tpe

# Nested search space: layer 1 always exists; each deeper layer is behind an
# hp.choice between "present" (with its own unit count and the next choice)
# and "absent". Straight quotes are required around every label; typographic
# quotes are not valid Python string delimiters.
space = {
    'n_units_l1': hp.randint('n_units_l1', 128),
    'l2': hp.choice('l2', [{
        'has_l2': True,
        'n_units_l2': hp.randint('n_units_l2', 128),
        'l3': hp.choice('l3', [{
            'has_l3': True,
            'n_units_l3': hp.randint('n_units_l3', 128),
            'l4': hp.choice('l4', [{
                'has_l4': True,
                'n_units_l4': hp.randint('n_units_l4', 128),
            }, {'has_l4': False}]),
        }, {'has_l3': False}]),
    }, {'has_l2': False}]),
}

def objective(space):
    """Flatten a nested sample into a list of layer widths and train with it.

    Layer 1 is always present. For layers 2 to 4 the function descends into
    the `l<i>` sub-dict and stops at the first one whose `has_l<i>` flag is
    false. Every sampled unit count is shifted by one before use.
    """
    node = space
    layers = [node['n_units_l1'] + 1]
    for depth in (2, 3, 4):
        node = node['l{}'.format(depth)]
        present = node['has_l{}'.format(depth)]
        if not present:
            break
        units = node['n_units_l{}'.format(depth)]
        layers.append(units + 1)
    return learn_mnist(layers)

# Run `objective` over `space` for up to 100 evaluations using tpe.suggest.
fmin(fn=objective, space=space, max_evals=100, algo=tpe.suggest)

Optuna

import optuna

def objective(trial):
    """Sample a layer count and per-layer widths, print them, and return 0.

    Asks `trial` for `n_layers` in [1, 4], then for one `n_units_l<i>` in
    [1, 128] per layer (i starting at 0). The constant return value means only
    the printed samples are of interest.
    """
    depth = trial.suggest_int('n_layers', 1, 4)
    layers = []
    for index in range(depth):
        width = trial.suggest_int(f'n_units_l{index}', 1, 128)
        layers.append(width)
    print(layers)
    return 0

# Create a study with no explicit options and run `objective` for 100 trials.
study = optuna.create_study()
study.optimize(objective, n_trials=100)

Hyperopt

from hyperopt import hp, fmin, tpe
import sys

# Flat search space: four unit-count parameters are always declared, one per
# possible layer, each built as hp.randint(label, 128) + 1.
space = {
    f'n_units_l{i}': hp.randint(f'n_units_l{i}', 128) + 1
    for i in range(4)
}
# The layer count is declared the same way: hp.randint('n_layers', 4) + 1.
space['n_layers'] = hp.randint('n_layers', 4) + 1

def objective(space):
    """Print the widths of the first `n_layers` layers to stderr; return 0.

    `space` is a flat mapping with an integer `n_layers` entry and one
    `n_units_l<i>` entry per layer index; entries beyond `n_layers` are
    ignored.
    """
    layers = []
    for index in range(space['n_layers']):
        layers.append(space[f'n_units_l{index}'])
    print(layers, file=sys.stderr)
    return 0

# Run `objective` over `space` for up to 100 evaluations using tpe.suggest.
fmin(fn=objective, space=space, max_evals=100, algo=tpe.suggest)

from hyperopt import hp, fmin, tpe
import sys

def build(n, i=0):
    """Recursively build a nested search space for layers `i` up to `n - 1`.

    At depth `i == n` the result is just `{has_l<n>: False}`. Otherwise the
    result marks layer `i` as present, declares its unit count, and adds an
    hp.choice for layer `i + 1` between "absent" and the recursively built
    sub-space.
    """
    if i == n:
        return {f'has_l{i}': False}
    nxt = i + 1
    units = hp.randint(f'n_units_l{i}', 128) + 1
    deeper = hp.choice(f'l{nxt}', [
        {f'has_l{nxt}': False},
        build(n, nxt),
    ])
    return {
        f'has_l{i}': True,
        f'n_units_l{i}': units,
        f'l{nxt}': deeper,
    }

# Nested search space built by build() with n=4, starting at layer index 0.
space = build(4)

def objective(space):
    """Walk a nested sample, print the layer widths to stderr, and return 0.

    Starting at index 0, a layer's width is collected while its `has_l<i>`
    flag is true; the walk then descends into the `l<i+1>` sub-dict.
    """
    node = space
    depth = 0
    layers = []
    while True:
        if not node[f'has_l{depth}']:
            break
        layers.append(node[f'n_units_l{depth}'])
        depth += 1
        node = node[f'l{depth}']
    print(layers, file=sys.stderr)
    return 0

# Run `objective` over `space` for up to 100 evaluations using tpe.suggest.
fmin(fn=objective, space=space, max_evals=100, algo=tpe.suggest)

Histogram

# For each result file, print its name and a histogram of commas per line.
for f in *.txt ; do
    # Quote "$f" so file names containing spaces or glob characters survive.
    echo "$f"
    # Keep only commas and newlines, so each line's length is its comma count.
    tr -cd ',\n' < "$f" | awk '{ print length; }' | distribution --char='|' | sort
done

param-optuna.txt
0|17 (17.00%) ||||||||||||||||||||||||||||||
1|37 (37.00%) ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
2|23 (23.00%) |||||||||||||||||||||||||||||||||||||||||
3|23 (23.00%) |||||||||||||||||||||||||||||||||||||||||
param-hyperopt-new.txt
0|25 (25.00%) ||||||||||||||||||||||||||||||||||||||||||||||
1|36 (36.00%) ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
2|24 (24.00%) ||||||||||||||||||||||||||||||||||||||||||||
3|15 (15.00%) ||||||||||||||||||||||||||||
param-hyperopt-old.txt
0|61 (61.00%) ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
1|25 (25.00%) |||||||||||||||||||||||||||
2| 6  (6.00%) |||||||
3| 8  (8.00%) |||||||||

Appendix

Hyperband

γ (x) = max_{n = 1, \dots, N} sup_{t \geq x} | ℓ_{n} (t) - ν_{n} |

γ^{- 1} (y) = inf {x | γ (x) \leq y}

ν_{n} = lim_{t \to \infty} ℓ_{n} (t) are iid r.v.s with cdf F

ν_{*} = min_{n = 1, \dots, N} ν_{n}

H (F, γ, n, δ, ϵ) = 2 n \int_{ν_{*} + ϵ / 4}^{\infty} γ^{- 1} (\frac{t - ν_{*}}{4}) d F (t) + (\frac{4}{3} \log \frac{2}{δ} + 2 n F (ν_{*} + \frac{ϵ}{4})) γ^{- 1} (\frac{ϵ}{16})

where

δ \in (0, 1), p_{n} = \log (2 / δ) / n, ϵ / 4 \geq F^{- 1} (p_{n}) - ν_{*},

and

H (F, γ, n, δ) = H (F, γ, n, δ, 4 (F^{- 1} (p_{n}) - ν_{*}))

Accelerated Gradient Descent

Let

f

be convex and

L

-smooth. Then,

f (x_{t}) - f (x_{*}) \leq \frac{2 L \| x_{1} - x_{*} \|^{2}}{t^{2}}

where

x_{t + 1} = y_{t} - \frac{\nabla f (y_{t})}{L}, y_{t + 1} = (1 - γ_{t}) x_{t + 1} + γ_{t} x_{t}, γ_{t} = \frac{1 - λ_{t}}{λ_{t + 1}}, λ_{t + 1} = \frac{1 + \sqrt{1 + 4 λ_{t}^{2}}}{2}

with

λ_{0} = x_{1} = y_{1} = 0

Define-By-Run

tf.function

Exporting/Importing Models

Checkpoint

GraphDefs

SavedModels

Optuna vs Hyperopt

Original

Optuna

Hyperopt

Histogram

Appendix

Hyperband

Accelerated Gradient Descent

Read more

throwaway

Entropy and Irreversibility in the Quantum Realm

Series Convergence by Comparison

Report 4/27