This post is a rebroadcast of R2RT's binary stochastic neuron (BSN) post. It walks through the straight-through (ST) and REINFORCE gradient estimators used to train networks with binary stochastic neurons.

import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import matplotlib.pyplot as plt
mnist = input_data.read_data_sets('MNIST_data/', one_hot=True)
from tensorflow.python.framework import ops
from enum import Enum
import seaborn as sns
sns.set(color_codes=True)
def reset_graph():
    if 'sess' in globals() and sess:
        sess.close()
    tf.reset_default_graph()

def layer_linear(inputs, shape, scope='linear_layer'):
    with tf.variable_scope(scope):
        w = tf.get_variable('w', shape)
        b = tf.get_variable('b', shape[-1:])
    return tf.matmul(inputs, w) + b

def layer_softmax(inputs, shape, scope='softmax_layer'):
    with tf.variable_scope(scope):
        w = tf.get_variable('w', shape)
        b = tf.get_variable('b', shape[-1:])
    return tf.nn.softmax(tf.matmul(inputs, w) + b)

def accuracy(y, pred):
    correct = tf.equal(tf.argmax(y,1), tf.argmax(pred,1))
    return tf.reduce_mean(tf.cast(correct,tf.float32))

def plot_n(data_and_labels, lower_y = 0., title="Learning Curves"):
    fig, ax = plt.subplots()
    for data, label in data_and_labels:
        ax.plot(range(0,len(data)*100,100),data, label=label)
    ax.set_xlabel('Training steps')
    ax.set_ylabel('Accuracy')
    ax.set_ylim([lower_y,1])
    ax.set_title(title)
    ax.legend(loc=4)
    plt.show()

class StochasticGradientEstimator(Enum):
    ST = 0
    REINFORCE = 1
        

Binary stochastic neuron with the straight-through (ST) estimator

def binaryRound(x):
    """
    Rounds a tensor whose values are in [0, 1] to a tensor with values in {0, 1},
    using the straight-through estimator for the gradient.
    """
    
    g = tf.get_default_graph()
    
    with ops.name_scope("BinaryRound") as name:
        with g.gradient_override_map({"Round": "Identity"}):
            return tf.round(x, name=name)
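
As a quick sanity check (my own snippet, not part of the original post), the override means tf.gradients treats the rounding as an identity on the backward pass:

x = tf.constant([0.1, 0.4, 0.6, 0.9])
rounded = binaryRound(x)
grad_x = tf.gradients(rounded, x)[0]  # Round's gradient has been overridden to Identity
with tf.Session() as sess:
    print(sess.run(rounded))  # [0. 0. 1. 1.]
    print(sess.run(grad_x))   # [1. 1. 1. 1.] -- the straight-through gradient
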
## I'm a bit confused here as well; I'm not sure whether BernoulliSample_ST is implemented correctly.
def bernoulliSample(x):
    """
    Uses a tensor whose values are in [0, 1] to sample a tensor with values in {0,1}.
    
    E.g.,:
    if x is 0.6, bernoulliSample(x) will be 1 with probability 0.6, and 0 otherwise.
    and the gradient is passed straight through (identity); the (x - tf.random_uniform) sub-expression still gets a gradient via the BernoulliSample_ST override below.
    """
    
    g = tf.get_default_graph()
    
    with ops.name_scope("BernoulliSample") as name:
        with g.gradient_override_map({"Ceil": "Identity", "Sub": "BernoulliSample_ST"}):
            return tf.ceil(x - tf.random_uniform(tf.shape(x)),name=name)
    
@ops.RegisterGradient("BernoulliSample_ST")
def bernoulliSample_ST(op, grad): 
    # 'grad' is d(loss)/d(output of this Sub op); since Ceil's gradient is overridden to Identity,
    # it is passed straight through to x (inputs[0]), while the uniform noise (inputs[1]) gets zero gradient.
    return [grad, tf.zeros(tf.shape(op.inputs[1]))]
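
To convince myself that BernoulliSample_ST does what it should (again my own check, not from the original post): Ceil's gradient is overridden to Identity and the Sub gradient routes everything to x, so d(bernoulliSample(x))/dx should be exactly 1 no matter how the coin flips land:

p = tf.constant([0.2, 0.5, 0.8])
samples = bernoulliSample(p)
grad_p = tf.gradients(samples, p)[0]
with tf.Session() as sess:
    b, g_val = sess.run([samples, grad_p])
    print(b)      # a random draw from {0, 1}, e.g. [0. 1. 1.]
    print(g_val)  # [1. 1. 1.] -- the gradient passes straight through to p
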

Combining passThroughSigmoid with bernoulliSample gives us the binary stochastic neuron:

def passThroughSigmoid(x, slope=1):
    """Sigmoid that uses identity function as its gradient"""
    g = tf.get_default_graph()
    with ops.name_scope("PassThroughSigmoid") as name:
        with g.gradient_override_map({"Sigmoid": "Identity"}):
            return tf.sigmoid(x, name=name)

def binaryStochastic_ST(x, slope_tensor=None, pass_through=True, stochastic=True):
    """
    bst_st v1 (pass_through=True, stochastic=True):
        x --> passThroughSigmoid --> bernoulliSample
        so d(bst_st)/dx = 1 (pure pass-through).
    bst_st v2 (pass_through=False, stochastic=False):
        x --> tf.sigmoid (with slope annealing) --> binaryRound
        so d(bst_st)/dx = d(sigmoid(slope*x))/dx.
        As the slope increases, bst_st v2 behaves more like a step function,
        which better matches the neuron's {0, 1} forward behavior.
    """
    if slope_tensor is None:
        slope_tensor = tf.constant(1.0)
    
    if pass_through:
        p = passThroughSigmoid(x)
    else:
        p = tf.sigmoid(slope_tensor*x)
        
    if stochastic:
        return bernoulliSample(p)
    else:
        return binaryRound(p)
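
A small comparison of the two variants' backward passes (my own sketch, not from the original post): the pure pass-through version sends a gradient of 1 everywhere, while the sigmoid-adjusted version scales it by the derivative of the sloped sigmoid, which is largest near x = 0:

x = tf.constant([-2.0, 0.0, 2.0])
slope = tf.constant(3.0)
v1 = binaryStochastic_ST(x)  # pass_through=True, stochastic=True
v2 = binaryStochastic_ST(x, slope_tensor=slope, pass_through=False, stochastic=False)
grad_v1 = tf.gradients(v1, x)[0]  # identity gradient: [1. 1. 1.]
grad_v2 = tf.gradients(v2, x)[0]  # slope * sigmoid'(slope * x), peaked at x = 0
with tf.Session() as sess:
    print(sess.run([grad_v1, grad_v2]))
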
    

Binary stochastic neuron with the REINFORCE estimator

def binaryStochastic_REINFORCE(x, stochastic = True, loss_op_name="loss_by_example"):
    """
    Sigmoid followed by a random sample from a bernoulli distribution according
    to the result (binary stochastic neuron). Uses the REINFORCE estimator.
    See https://arxiv.org/abs/1308.3432.

    NOTE: Requires a loss operation with name matching the argument for loss_op_name
    in the graph. This loss operation should be broken out by example (i.e., not a
    single number for the entire batch).
    """
    g = tf.get_default_graph()

    with ops.name_scope("BinaryStochasticREINFORCE"):
        with g.gradient_override_map({"Sigmoid": "BinaryStochastic_REINFORCE",
                                      "Ceil": "Identity"}):
            p = tf.sigmoid(x)

            reinforce_collection = g.get_collection("REINFORCE")
            if not reinforce_collection:
                g.add_to_collection("REINFORCE", {})
                reinforce_collection = g.get_collection("REINFORCE")
            reinforce_collection[0][p.op.name] = loss_op_name

            return tf.ceil(p - tf.random_uniform(tf.shape(x)))


# TODO: Debug this        

@ops.RegisterGradient("BinaryStochastic_REINFORCE")
def _binaryStochastic_REINFORCE(op, _):
    """Unbiased estimator for binary stochastic function based on REINFORCE."""
    loss_op_name = op.graph.get_collection("REINFORCE")[0][op.name]
    loss_tensor = op.graph.get_operation_by_name(loss_op_name).outputs[0]  # per-example loss, shape [batch]

    sub_tensor = op.outputs[0].consumers()[0].outputs[0] #subtraction tensor
    ceil_tensor = sub_tensor.consumers()[0].outputs[0] #ceiling tensor, both the same shape as x

    outcome_diff = (ceil_tensor - op.outputs[0])  # sample minus probability, shape [batch, num_units]

    # Provides an early out if we want to avoid variance adjustment for
    # whatever reason (e.g., to show that variance adjustment helps)
    if op.graph.get_collection("REINFORCE")[0].get("no_variance_adj"):
        return outcome_diff * tf.expand_dims(loss_tensor, 1)

    outcome_diff_sq = tf.square(outcome_diff)  # [batch, num_units]
    outcome_diff_sq_r = tf.reduce_mean(outcome_diff_sq, reduction_indices=0)  # [num_units]
    outcome_diff_sq_loss_r = tf.reduce_mean(outcome_diff_sq * tf.expand_dims(loss_tensor, 1),
                                            reduction_indices=0)  # [num_units]

    L_bar_num = tf.Variable(tf.zeros(outcome_diff_sq_r.get_shape()), trainable=False)
    L_bar_den = tf.Variable(tf.ones(outcome_diff_sq_r.get_shape()), trainable=False)

    #Note: we already get a decent estimate of the average from the minibatch
    decay = 0.95
    train_L_bar_num = tf.assign(L_bar_num, L_bar_num*decay +\
                                            outcome_diff_sq_loss_r*(1-decay))
    train_L_bar_den = tf.assign(L_bar_den, L_bar_den*decay +\
                                            outcome_diff_sq_r*(1-decay))

    # Shapes: L_bar is [num_units]; the per-example loss (shape [batch]) is tiled across
    # the neuron dimension, so L is [batch, num_units] and matches outcome_diff.
    with tf.control_dependencies([train_L_bar_num, train_L_bar_den]):
        L_bar = train_L_bar_num/(train_L_bar_den+1e-4)
        L = tf.tile(tf.expand_dims(loss_tensor,1),
                    tf.constant([1,L_bar.get_shape().as_list()[0]]))
        return outcome_diff * (L - L_bar)
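
My reading of this gradient (following Bengio et al. 2013; the original post does not spell this out): for each pre-activation a_i of the binary layer, it returns an estimate of

\frac{\partial L}{\partial a_i} \approx (b_i - \sigma(a_i)) \, (L - \bar{L})

where b_i in {0, 1} is the sampled output (ceil_tensor), sigma(a_i) is the Bernoulli probability (op.outputs[0]), L is the per-example loss, and L_bar is the running baseline L_bar_num / L_bar_den, an average of the loss weighted by (b_i - sigma(a_i))^2 that reduces the variance of the estimator without biasing it. The "no_variance_adj" early-out above simply drops the baseline and returns the raw (b_i - sigma(a_i)) * L term.
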
    

Wrapper to create a layer of binary stochastic neurons

def binary_wrapper(\
                  pre_activations_tensor,
                  estimator=StochasticGradientEstimator.ST,
                  stochastic_tensor=tf.constant(True),
                  pass_through=True,
                  slope_tensor=tf.constant(1.0)):
    """
    Turns a layer of pre-activations (logits) into a layer of binary stochastic neurons

    Keyword arguments:
    *estimator: either ST or REINFORCE
    *stochastic_tensor: a boolean tensor indicating whether to sample from a bernoulli
        distribution (True, default) or use a step_function (e.g., for inference)
    *pass_through: for ST only - boolean as to whether to substitute identity derivative on the
        backprop (True, default), or whether to use the derivative of the sigmoid
    *slope_tensor: for ST only - tensor specifying the slope for purposes of slope annealing
        trick
    """
    if estimator == StochasticGradientEstimator.ST:
        if pass_through:
            return tf.cond(stochastic_tensor,
                          lambda: binaryStochastic_ST(pre_activations_tensor),
                          lambda: binaryStochastic_ST(pre_activations_tensor, stochastic=False))
        else:
            return tf.cond(stochastic_tensor,
                    lambda: binaryStochastic_ST(pre_activations_tensor, slope_tensor = slope_tensor,
                                             pass_through=False),
                    lambda: binaryStochastic_ST(pre_activations_tensor, slope_tensor = slope_tensor,
                                             pass_through=False, stochastic=False))
    elif estimator == StochasticGradientEstimator.REINFORCE:
        # binaryStochastic_REINFORCE was designed to be stochastic only, so we fall back on the
        # deterministic ST version for the step function at evaluation time (it is not used for training)
        return tf.cond(stochastic_tensor,
                lambda: binaryStochastic_REINFORCE(pre_activations_tensor),
                lambda: binaryStochastic_ST(pre_activations_tensor, stochastic=False))
    else:
        raise ValueError("Unrecognized estimator.")
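
For illustration, a hypothetical usage sketch of mine (build_classifier below does the same thing in context): wiring a layer of logits through the wrapper with the sigmoid-adjusted / slope-annealed ST estimator looks roughly like this:

x = tf.placeholder(tf.float32, [None, 784])
logits = layer_linear(x, [784, 100], scope='demo_layer')  # 'demo_layer' is just an example name
stochastic = tf.placeholder_with_default(True, [])        # feed False at evaluation time
slope = tf.placeholder_with_default(1.0, [])              # increase over training to anneal the slope
binary_units = binary_wrapper(logits,
                              estimator=StochasticGradientEstimator.ST,
                              stochastic_tensor=stochastic,
                              pass_through=False,
                              slope_tensor=slope)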

Function to build the graph for an MNIST classifier

def build_classifier(hidden_dims=[100],
                     lr = 0.5,
                     pass_through = True,
                     non_binary = False,
                     estimator = StochasticGradientEstimator.ST,
                     no_var_adj = False):
    reset_graph()
    g = {}
    
    if no_var_adj:
        tf.get_default_graph().add_to_collection("REINFORCE", {"no_variance_adj": no_var_adj})
    
    g['x'] = tf.placeholder(tf.float32, [None, 784], name='x_placeholder')
    g['y'] = tf.placeholder(tf.float32, [None, 10], name='y_placeholder')
    g['stochastic'] = tf.constant(True)
    g['slope'] = tf.constant(1.0)
    
    g['layers'] = {0: g['x']}
    hidden_layers = len(hidden_dims)
    dims = [784] + hidden_dims
    
    for i in range(1, hidden_layers+1):
        with tf.variable_scope("layer_"+str(i)):
            pre_activations = layer_linear(g['layers'][i-1], dims[i-1:i+1], scope='layer_'+str(i))
            if non_binary:
                g['layers'][i] = tf.sigmoid(pre_activations)
            else:
                g['layers'][i] = binary_wrapper(pre_activations,
                                              estimator = estimator,
                                              pass_through = pass_through,
                                              stochastic_tensor = g['stochastic'],
                                              slope_tensor = g['slope'])
                
    g['pred'] = layer_softmax(g['layers'][hidden_layers], [dims[-1], 10])

    g['loss'] = -tf.reduce_mean(g['y'] * tf.log(g['pred']), reduction_indices=1)  # per-example cross-entropy (dense, since y is one-hot)
    # named loss_by_example necessary for REINFORCE estimator
    tf.identity(g['loss'], name="loss_by_example")

    g['ts'] = tf.train.GradientDescentOptimizer(lr).minimize(g['loss'])

    g['accuracy'] = accuracy(g['y'], g['pred'])

    g['init_op'] = tf.global_variables_initializer()
    return g

Train the classifier

def train_classifier(\
        hidden_dims=[100,100],
        estimator=StochasticGradientEstimator.ST,
        stochastic_train=True,
        stochastic_eval=True,
        slope_annealing_rate=None,
        epochs=10,
        lr=0.5,
        non_binary=False,
        no_var_adj=False,
        train_set = mnist.train,
        val_set = mnist.validation,
        verbose=False,
        label=None):
    if slope_annealing_rate is None:
        g = build_classifier(hidden_dims=hidden_dims, lr=lr, pass_through=True,
                                non_binary=non_binary, estimator=estimator, no_var_adj=no_var_adj)
    else:
        g = build_classifier(hidden_dims=hidden_dims, lr=lr, pass_through=False,
                                non_binary=non_binary, estimator=estimator, no_var_adj=no_var_adj)

    with tf.Session() as sess:
        sess.run(g['init_op'])
        slope = 1
        res_tr, res_val = [], []
        for epoch in range(epochs):
            feed_dict={g['x']: val_set.images,
                       g['y']: val_set.labels,
                       g['stochastic']: stochastic_eval,
                       g['slope']: slope}
            if verbose:
                print("Epoch", epoch, sess.run(g['accuracy'], feed_dict=feed_dict))

            accuracy = 0
            for i in range(1001):
                x, y = train_set.next_batch(50)
                feed_dict={g['x']: x, g['y']: y, g['stochastic']: stochastic_train}
                acc, _ = sess.run([g['accuracy'],g['ts']], feed_dict=feed_dict)
                accuracy += acc
                if i % 100 == 0 and i > 0:
                    res_tr.append(accuracy/100)
                    accuracy = 0
                    feed_dict={g['x']: val_set.images,
                               g['y']: val_set.labels,
                               g['stochastic']: stochastic_eval,
                               g['slope']: slope}
                    res_val.append(sess.run(g['accuracy'], feed_dict=feed_dict))

            if slope_annealing_rate is not None:
                slope = slope*slope_annealing_rate
                if verbose:
                    print("Sigmoid slope:", slope)

        feed_dict={g['x']: val_set.images, g['y']: val_set.labels,
                   g['stochastic']: stochastic_eval, g['slope']: slope}
        print("Epoch", epoch+1, sess.run(g['accuracy'], feed_dict=feed_dict))
        if label is not None:
            return (res_tr, label + " - Training"), (res_val, label + " - Validation")
        else:
            return [(res_tr, "Training"), (res_val, "Validation")]
res = train_classifier(hidden_dims=[100], epochs=20, lr=1.0, non_binary=True)  # 20 epochs * ~1000 steps, logged every 100 steps: ~200 points per curve
plot_n(res, lower_y=0.8, title="Logistic Sigmoid Baseline")
Epoch 20 0.9658

[Figure: learning curves for the logistic sigmoid baseline]

The non-stochastic, non-binary baseline

print("Variance-adjusted:")
res1 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.REINFORCE, epochs=3,
                       lr=0.3, verbose=True)
Variance-adjusted:
Epoch 0 0.0964
Epoch 1 0.0958
Epoch 2 0.0958
Epoch 3 0.0958
print("Not variance-adjusted:")
res2= train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.REINFORCE, epochs=3,
                       lr=0.3, no_var_adj=True, verbose=True)
Not variance-adjusted:
Epoch 0 0.0988
Epoch 1 0.0958
Epoch 2 0.0958
Epoch 3 0.0958
res1 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.REINFORCE, epochs=20,
                       lr=0.05, label = "Variance-adjusted")
res2= train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.REINFORCE, epochs=20,
                       lr=0.05, no_var_adj=True, label = "Not variance-adjusted")

plot_n(res1 + res2, lower_y=0.6, title="Experiment 1: REINFORCE variance adjustment")
Epoch 20 0.9262
Epoch 20 0.9264

[Figure: Experiment 1 learning curves, REINFORCE with vs. without variance adjustment]

At lr=0.3 both REINFORCE variants get stuck at chance accuracy, so the comparison is rerun at lr=0.05. There, the variance-adjusted estimator learns a bit faster, but the difference is not significant.

res1 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.ST, epochs=20,
                       lr=0.1, label = "Pass-through - 0.1")
res2 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.ST, epochs=20,
                       lr=0.1, slope_annealing_rate = 1.0, label = "Sigmoid-adjusted - 0.1")

res3 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.ST, epochs=20,
                       lr=0.3, label = "Pass-through - 0.3")
res4 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.ST, epochs=20,
                       lr=0.3, slope_annealing_rate = 1.0, label = "Sigmoid-adjusted - 0.3")

res5 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.ST, epochs=20,
                       lr=1.0, label = "Pass-through - 1.0")
res6 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.ST, epochs=20,
                       lr=1.0, slope_annealing_rate = 1.0, label = "Sigmoid-adjusted - 1.0")

plot_n(res1[1:] + res2[1:] + res3[1:] + res4[1:] + res5[1:] + res6[1:],
       lower_y=0.4, title="Experiment 2: Pass-through vs sigmoid-adjusted ST")
Epoch 20 0.823
Epoch 20 0.9572
Epoch 20 0.7454
Epoch 20 0.968
Epoch 20 0.0958
Epoch 20 0.9516

[Figure: Experiment 2 learning curves, pass-through vs. sigmoid-adjusted ST]

Sigmoid-adjusted ST beats pass-through ST by a wide margin; the pass-through run at lr=1.0 fails to train at all.

res1 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.ST, epochs=20,
                       lr=0.1, slope_annealing_rate = 1.0, label = "Sigmoid-adjusted - 0.1")
res2 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.ST, epochs=20,
                       lr=0.1, slope_annealing_rate = 1.1, label = "Slope-annealed - 0.1")

res3 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.ST, epochs=20,
                       lr=0.3, slope_annealing_rate = 1.0, label = "Sigmoid-adjusted - 0.3")
res4 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.ST, epochs=20,
                       lr=0.3, slope_annealing_rate = 1.1, label = "Slope-annealed - 0.3")

res5 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.ST, epochs=20,
                       lr=1.0, slope_annealing_rate = 1.0, label = "Sigmoid-adjusted - 1.0")
res6 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.ST, epochs=20,
                       lr=1.0, slope_annealing_rate = 1.1, label = "Slope-annealed - 1.0")

plot_n(res1[1:] + res2[1:] + res3[1:] + res4[1:] + res5[1:] + res6[1:],
       lower_y=0.6, title="Experiment 3: Sigmoid-adjusted vs slope-annealed ST")
Epoch 20 0.9606
Epoch 20 0.9702
Epoch 20 0.9648
Epoch 20 0.9748
Epoch 20 0.9614
Epoch 20 0.9714

[Figure: Experiment 3 learning curves, sigmoid-adjusted vs. slope-annealed ST]

The stochastic networks with the slope-annealed ST estimator even beat the baseline sigmoid model.

res1 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.ST, epochs=20,
                        lr=0.3, slope_annealing_rate = 1.1, label = "Stochastic, Stochastic")
res2 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.ST, epochs=20,
                        lr=0.3, slope_annealing_rate = 1.1, stochastic_eval=False, label = "Stochastic, Deterministic")
res3 = train_classifier(hidden_dims=[100], estimator=StochasticGradientEstimator.ST, epochs=20,
                        lr=0.3, slope_annealing_rate = 1.1, stochastic_train=False, stochastic_eval=False,
                        label = "Deterministic, Deterministic")

plot_n(res1 + res2 + res3,
       lower_y=0.6, title="Experiment 5: Stochastic vs Deterministic (Slope-annealed ST)")
Epoch 20 0.9746
Epoch 20 0.9726
Epoch 20 0.9748

[Figure: Experiment 5 learning curves, stochastic vs. deterministic slope-annealed ST]

The results show that deterministic neurons train the fastest, but also display more overfitting and may not achieve the best final results. Stochastic inference and deterministic inference, when combined with stochastic training, are closely comparable. Similar results hold for the REINFORCE estimator.

The effect of depth:

It turns out that the slope-annealed straight-through estimator is resilient to depth, even at a reasonable learning rate. The REINFORCE estimator, on the other hand, starts to fail as depth is introduced. However, if we lower the learning rate dramatically (25x), we can start to get the deeper networks to train with the REINFORCE estimator.
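
The depth experiments themselves are among the ones I skipped; a comparison along these lines (the exact settings here are my own guesses, not the original post's) would reproduce the flavor of that result:

# Hypothetical sketch, not run in this post: three hidden layers each.
res_st = train_classifier(hidden_dims=[100, 100, 100], estimator=StochasticGradientEstimator.ST,
                          epochs=20, lr=0.3, slope_annealing_rate=1.1,
                          label="Slope-annealed ST, 3 layers")
res_rf = train_classifier(hidden_dims=[100, 100, 100], estimator=StochasticGradientEstimator.REINFORCE,
                          epochs=20, lr=0.05/25,  # ~25x lower than the shallow REINFORCE runs above
                          label="REINFORCE, 3 layers")
plot_n(res_st + res_rf, lower_y=0.6, title="Effect of depth (sketch)")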

res1 = train_classifier(hidden_dims = [200], epochs=20, train_set=mnist.validation, val_set=mnist.test,
                        lr = 0.03, non_binary = True, label = "Deterministic sigmoid net")

res2 = train_classifier(hidden_dims = [200], epochs=20, stochastic_eval=False, train_set=mnist.validation,
                        val_set=mnist.test, slope_annealing_rate=1.1, estimator=StochasticGradientEstimator.ST,
                        lr = 0.3, label = "Binary stochastic net")

plot_n(res1 + res2, lower_y=0.8, title="Experiment 8: Using binary stochastic neurons as a regularizer")
Epoch 20 0.9306
Epoch 20 0.9435

[Figure: Experiment 8 learning curves, binary stochastic neurons as a regularizer]

Conclusion:

I skipped some of the original experiments because they did not seem relevant here. In any case, this post shows that we can improve on the performance of an overfitting multi-layer sigmoid net by turning its neurons into binary stochastic neurons with a straight-through estimator, that the slope-annealed straight-through estimator is better than the other straight-through variants, and that it is worth using the variance-adjusted REINFORCE estimator over the unadjusted one.

Original notes.

Commented code for BSN implemented in TensorFlow.