Define-By-Run

Hyperband Slides

tf.function

# TF1-style graph: a 10x10 weight matrix, a bias vector and an integer
# counter that is incremented whenever `y` is evaluated.
W = tf.Variable(tf.glorot_uniform_initializer()((10, 10)))
b = tf.Variable(tf.zeros(10))
c = tf.Variable(0)

x = tf.placeholder(tf.float32)
ctr = c.assign_add(1)
# Only `y` should wait on the counter increment.
with tf.control_dependencies([ctr]):
    y = tf.matmul(x, W) + b
# Created outside the control-dependency scope: an initializer built inside
# it would also depend on `ctr`, i.e. it would touch `c` while initializing
# it and invalidate the `c == 1` check below.
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    print(sess.run(y, feed_dict={x: make_input_value()}))
    # `y` was fetched exactly once, so the counter must be exactly 1.
    assert int(sess.run(c)) == 1
# Same computation as the session-based version, written as a tf.function.
W = tf.Variable(tf.glorot_uniform_initializer()((10, 10)))
b = tf.Variable(tf.zeros(10))
c = tf.Variable(0)


@tf.function
def f(x):
    """Increment the counter `c`, then return `matmul(x, W) + b`."""
    c.assign_add(1)
    return tf.matmul(x, W) + b


# The original line was missing the closing parenthesis of print(...).
print(f(make_input_value()))
# `f` was called exactly once, so the counter must be exactly 1.
assert int(c) == 1

Exporting/Importing Models

Checkpoint

# Graph construction: one 10x10 variable and an op that adds 1.0 to it.
W = tf.get_variable("weights", shape=[10, 10])

train_op = W.assign_add(1.)
saver = tf.train.Saver()

# First session: initialize W, run one update step, then save a checkpoint.
with tf.Session() as sess:
    sess.run(W.initializer)
    sess.run(train_op)
    saver.save(sess, "/tmp/checkpoint/")

# Second session: no initializer is run; W comes from the checkpoint before
# another update step is applied.
with tf.Session() as sess:
    saver.restore(sess, "/tmp/checkpoint/")
    sess.run(train_op)
W = tf.Variable(tf.glorot_uniform_initializer()((10, 10)))


@tf.function
def train():
    """Run one update step: add 1.0 to every element of `W`."""
    W.assign_add(1.)


train()
ckpt = tf.train.Checkpoint(W=W)
# `save` treats its argument as a prefix and returns the path it actually
# wrote, so restore from the returned value rather than from the bare prefix.
save_path = ckpt.save("/tmp/checkpoint")
ckpt.restore(save_path)

GraphDefs

# Build a small graph: y = matmul(x, W) with a (None, 10) float32 input.
W = tf.get_variable("weights", shape=[10, 10])
x = tf.placeholder(tf.float32, shape=(None, 10))
y = tf.matmul(x, W)

graph = tf.get_default_graph()
graph_def = graph.as_graph_def()
# SerializeToString() returns bytes, so the file is opened in binary mode.
with open("/tmp/graph.pb", "wb") as f:
    f.write(graph_def.SerializeToString())

tf.reset_default_graph()

graph_def = tf.GraphDef()
# Read back the same binary file that was written above; ParseFromString
# expects the serialized bytes, not a text-format dump.
with open("/tmp/graph.pb", "rb") as f:
    graph_def.ParseFromString(f.read())

tf.import_graph_def(graph_def)
W = tf.Variable(tf.glorot_uniform_initializer()((10, 10)))


@tf.function
def f(x):
    """Return the matrix product `matmul(x, W)`."""
    return tf.matmul(x, W)


# NOTE(review): `graph_function` is the name used in the TF 2.0 design notes;
# released TensorFlow appears to expose this as `get_concrete_function` with a
# `tf.TensorSpec` argument — confirm before running.
graph = f.graph_function((tf.float32, (None, 10))).graph
graph_def = graph.as_graph_def()

# Binary mode because SerializeToString() returns bytes. The handle is named
# `out` so that it does not rebind the traced function `f`.
with open("/tmp/graph.pb", "wb") as out:
    out.write(graph_def.SerializeToString())

SavedModels

def save_model():
    """Build a tiny matmul graph and export it to ``/tmp/model``.

    Creates a 10x10 variable named ``weights`` and a float32 placeholder of
    shape ``(None, 10)``, runs the global variable initializer in a fresh
    session, and saves the model with input key ``"x"`` and output key ``"y"``.
    """
    weights = tf.get_variable("weights", shape=[10, 10])
    features = tf.placeholder(tf.float32, shape=(None, 10))
    product = tf.matmul(features, weights)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        tf.saved_model.simple_save(
            session,
            "/tmp/model",
            inputs={"x": features},
            outputs={"y": product},
        )

def load_model():
    """Load the model stored at ``/tmp/model`` into a new session.

    NOTE(review): ``tf.saved_model.simple_load`` is taken as-is from these
    notes; confirm that it exists in the TensorFlow version being used.

    Returns:
        A ``(inputs, outputs, sess)`` triple. The session is not closed here,
        so the caller owns it.
    """
    sess = tf.Session()
    with sess.as_default():
        model_inputs, model_outputs = tf.saved_model.simple_load(
            sess, "/tmp/model"
        )
    return model_inputs, model_outputs, sess
# NOTE(review): the design notes spell these APIs `tf.train.Checkpointable`,
# `tf.saved_model.export(path, obj)` and `tf.saved_model.import(path)`.
# `import` is a reserved word, so that last call cannot be parsed; this
# snippet uses the released names `tf.Module`, `tf.saved_model.save(obj, path)`
# and `tf.saved_model.load(path)` instead.
class Model(tf.Module):
    """Minimal trackable model holding a single weight matrix ``W``."""

    def __init__(self):
        super().__init__()
        # `...` is a placeholder from the slides: supply a real initial value.
        self.W = tf.Variable(...)

    @tf.function
    def f(self, x):
        """Return the matrix product ``matmul(x, W)``."""
        return tf.matmul(x, self.W)


m = Model()
tf.saved_model.save(m, "/tmp/model")
m = tf.saved_model.load("/tmp/model")

Optuna vs Hyperopt

Optuna white paper

Original

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

def learn_mnist(layers):
    """Train an MLP classifier on MNIST and return its test error.

    Args:
        layers: Iterable of hidden-layer sizes; it is converted to a tuple
            and passed as the first argument of ``MLPClassifier``.

    Returns:
        ``1.0`` minus the classifier's score on the held-out split.
    """
    dataset = fetch_openml('mnist_784')
    splits = train_test_split(dataset.data, dataset.target)
    x_train, x_test, y_train, y_test = splits

    model = MLPClassifier(tuple(layers))
    model.fit(x_train, y_train)
    test_score = model.score(x_test, y_test)
    return 1.0 - test_score
import optuna

def objective(trial):
    """Sample a layer count and per-layer widths, then score them on MNIST.

    Args:
        trial: Object providing ``suggest_int(name, low, high)``.

    Returns:
        Whatever ``learn_mnist`` returns for the sampled layer sizes.
    """
    depth = trial.suggest_int('n_layers', 1, 4)
    # One width is requested per layer, in layer order.
    layer_sizes = [
        trial.suggest_int('n_units_l{}'.format(i), 1, 128)
        for i in range(depth)
    ]
    return learn_mnist(layer_sizes)

# Create a study with no explicit options and evaluate `objective` 100 times.
study = optuna.create_study()
study.optimize(objective, n_trials=100)
from hyperopt import hp, fmin, tpe

# Nested (conditional) search space: the width of layer 1 is always sampled,
# and each deeper layer only carries a width in the branch whose `has_l<k>`
# flag is True. The `hp.choice('l3', ...)` label previously used typographic
# quotes (’l3’), which is a syntax error; it now uses plain ASCII quotes.
space = {
    'n_units_l1': hp.randint('n_units_l1', 128),
    'l2': hp.choice('l2', [{
        'has_l2': True,
        'n_units_l2': hp.randint('n_units_l2', 128),
        'l3': hp.choice('l3', [{
            'has_l3': True,
            'n_units_l3': hp.randint('n_units_l3', 128),
            'l4': hp.choice('l4', [{
                'has_l4': True,
                'n_units_l4': hp.randint('n_units_l4', 128),
            }, {'has_l4': False }]),
        }, {'has_l3': False }]),
    }, {'has_l2': False }]),
}

def objective(space):
    """Flatten a sampled nested space into layer sizes and score them.

    Args:
        space: Sampled point with key ``'n_units_l1'`` and nested ``'l2'``,
            ``'l3'``, ``'l4'`` dicts, each carrying a ``has_l<k>`` flag.

    Returns:
        Whatever ``learn_mnist`` returns for the collected layer sizes.
    """
    # Every sampled width is shifted up by one before use.
    layers = [space['n_units_l1'] + 1]
    node = space
    for depth in (2, 3, 4):
        node = node['l{}'.format(depth)]
        if node['has_l{}'.format(depth)]:
            layers.append(node['n_units_l{}'.format(depth)] + 1)
        else:
            # The first absent layer ends the network.
            break
    return learn_mnist(layers)

# Minimize `objective` over `space` with the TPE suggester, 100 evaluations.
fmin(fn=objective, space=space, max_evals=100, algo=tpe.suggest)

Optuna

import optuna

def objective(trial):
    """Sample a layer count and per-layer widths, print them, and return 0.

    Args:
        trial: Object providing ``suggest_int(name, low, high)``.

    Returns:
        Always ``0``; only the printed samples are of interest.
    """
    depth = trial.suggest_int('n_layers', 1, 4)
    layers = []
    for i in range(depth):
        layers.append(trial.suggest_int(f'n_units_l{i}', 1, 128))
    print(layers)
    return 0

# Create a study with no explicit options and evaluate `objective` 100 times.
study = optuna.create_study()
study.optimize(objective, n_trials=100)

Hyperopt

from hyperopt import hp, fmin, tpe
import sys

# Flat search space: one width parameter per possible layer (indices 0-3)
# plus a separate layer-count parameter; every draw is shifted up by one.
# NOTE(review): presumably hp.randint(label, k) samples from [0, k), which
# would make the shifted ranges 1..128 and 1..4 — confirm against hyperopt.
space = {
    f'n_units_l{i}': hp.randint(f'n_units_l{i}', 128) + 1
    for i in range(4)
}
space['n_layers'] = hp.randint('n_layers', 4) + 1

def objective(space):
    """Print the widths of the first ``n_layers`` layers and return 0.

    Args:
        space: Mapping with key ``'n_layers'`` and one ``'n_units_l<i>'``
            entry for each layer index below that count.

    Returns:
        Always ``0``; the sampled widths are written to stderr.
    """
    depth = space['n_layers']
    layers = []
    for i in range(depth):
        layers.append(space[f'n_units_l{i}'])
    print(layers, file=sys.stderr)
    return 0

# Minimize `objective` over `space` with the TPE suggester, 100 evaluations.
fmin(fn=objective, space=space, max_evals=100, algo=tpe.suggest)
from hyperopt import hp, fmin, tpe
import sys

def build(n, i=0):
    """Recursively build a nested conditional search space.

    Args:
        n: Level index at which the recursion stops.
        i: Index of the level being built (0 for the outermost call).

    Returns:
        ``{'has_l<i>': False}`` when ``i == n``; otherwise a dict with a True
        ``has_l<i>`` flag, a width expression for level ``i`` and an
        ``hp.choice`` between "no level i + 1" and the sub-space built for
        level ``i + 1``.
    """
    if i == n:
        return {f'has_l{i}': False}

    units = hp.randint(f'n_units_l{i}', 128) + 1
    # Either stop after this level or descend one level further.
    branches = [
        {f'has_l{i + 1}': False},
        build(n, i + 1),
    ]
    node = {f'has_l{i}': True}
    node[f'n_units_l{i}'] = units
    node[f'l{i + 1}'] = hp.choice(f'l{i + 1}', branches)
    return node

# Nested space with levels 0 through 3 (the recursion stops at index 4).
space = build(4)

def objective(space):
    """Walk a sampled nested space, print the layer widths, and return 0.

    Args:
        space: Sampled point whose level ``i`` dict has a ``has_l<i>`` flag
            and, when the flag is true, ``n_units_l<i>`` and ``l<i + 1>``.

    Returns:
        Always ``0``; the collected widths are written to stderr.
    """
    layers = []
    node = space
    i = 0
    while True:
        if not node[f'has_l{i}']:
            break
        layers.append(node[f'n_units_l{i}'])
        i += 1
        # Descend into the dict describing the next level.
        node = node[f'l{i}']
    print(layers, file=sys.stderr)
    return 0

# Minimize `objective` over `space` with the TPE suggester, 100 evaluations.
fmin(fn=objective, space=space, max_evals=100, algo=tpe.suggest)

Histogram

# For every .txt file in the current directory, print its name followed by a
# histogram of the number of commas on each line.
for f in *.txt ; do
    # With no matching file the glob stays literal; skip it.
    [ -e "$f" ] || continue
    printf '%s\n' "$f"
    # Keep only commas and newlines, so each line's length is its comma count.
    tr -cd ',\n' < "$f" | awk '{ print length; }' | distribution --char='|' | sort
done
param-optuna.txt
0|17 (17.00%) ||||||||||||||||||||||||||||||
1|37 (37.00%) ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
2|23 (23.00%) |||||||||||||||||||||||||||||||||||||||||
3|23 (23.00%) |||||||||||||||||||||||||||||||||||||||||
param-hyperopt-new.txt
0|25 (25.00%) ||||||||||||||||||||||||||||||||||||||||||||||
1|36 (36.00%) ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
2|24 (24.00%) ||||||||||||||||||||||||||||||||||||||||||||
3|15 (15.00%) ||||||||||||||||||||||||||||
param-hyperopt-old.txt
0|61 (61.00%) ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
1|25 (25.00%) |||||||||||||||||||||||||||
2| 6  (6.00%) |||||||
3| 8  (8.00%) |||||||||

Appendix

Hyperband

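$$\gamma(x) = \max_{n=1,\dots,N} \sup_{t \ge x} \ell_n(t)$$
$$\gamma^{-1}(y) = \inf\{x \mid \gamma(x) \le y\}$$
$\nu_n = \lim_{t \to \infty} \ell_n(t)$ are iid r.v.s with cdf $F$
$$\nu_* = \max_{n=1,\dots,N} \nu_n$$
$$H(F,\gamma,n,\delta,\epsilon) = 2n \int_{\nu_* + \epsilon/4}^{\infty} \gamma^{-1}\!\left(\frac{t - \nu_*}{4}\right) dF(t) + \left(\frac{4}{3}\log\frac{2}{\delta} + 2n\,F\!\left(\nu_* + \frac{\epsilon}{4}\right)\right) \gamma^{-1}\!\left(\frac{\epsilon}{16}\right)$$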

where
$$\delta \in (0,1), \quad p_n = \log(2/\delta)/n, \quad \epsilon/4 \ge F^{-1}(p_n) - \nu_*,$$
and
$$H(F,\gamma,n,\delta) = H\bigl(F,\gamma,n,\delta,\,4\,(F^{-1}(p_n) - \nu_*)\bigr)$$

Accelerated Gradient Descent

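Let $f$ be convex and $L$-smooth. Then,
$$f(x_t) - f(x^*) \le \frac{2L\,\|x_1 - x^*\|^2}{t^2}$$
where
$$x_{t+1} = y_t - \frac{\nabla f(y_t)}{L}, \quad y_{t+1} = (1-\gamma_t)\,x_{t+1} + \gamma_t x_t, \quad \gamma_t = \frac{1-\lambda_t}{\lambda_{t+1}}, \quad \lambda_{t+1} = \frac{1+\sqrt{1+4\lambda_t^2}}{2}$$
with $\lambda_0 = x_1 = y_1 = 0$.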