This is an implementation of a variable-length RNN with a single-layer GRU cell. Due to time limitations, it only showcases the basics of RNN models.

import pandas as pd, numpy as np, tensorflow as tf
import blogs_data # available at https://github.com/spitis/blogs_data
df = blogs_data.loadBlogs().sample(frac=1).reset_index(drop=True)
vocab, reverse_vocab = blogs_data.loadVocab()
train_len, test_len = int(len(df)*0.8), int(len(df)*0.2)
train, test = df.loc[:train_len-1], df.loc[train_len:train_len + test_len]
df = None
train.head()
post_id gender age_bracket string as_numbers length
0 144744 1 0 we listened to this creepy music we all were s... [32, 1968, 5, 29, 3623, 344, 32, 37, 88, 942, ... 30
1 84957 1 1 when a person <UNK> , the throat closes to pre... [56, 7, 211, 0, 1, 4, 2379, 8457, 5, 3071, 443... 15
2 134300 1 0 <UNK> ... guess those that stayed back in clas... [0, 24, 228, 161, 9, 1024, 93, 11, 320, 66, 64... 14
3 11751 1 0 speaking of money i got my atm card fixed today ! [973, 8, 314, 3, 89, 13, 7210, 983, 2062, 119,... 11
4 126685 0 1 as of now , around <#> hours from her phone ca... [38, 8, 68, 1, 146, 12, 309, 57, 61, 397, 260,... 18
class SimpleDataIterator():
    def __init__(self, df):
        self.df = df
        self.size = len(self.df)
        self.epochs = 0
        self.shuffle()
    
    def shuffle(self):
        self.df = self.df.sample(frac=1).reset_index(drop=True) # reset_index gives a fresh 0..size-1 index so the .loc slicing in next_batch works
        self.cursor = 0
    
    def next_batch(self, n):
        if self.cursor + n > self.size:
            self.epochs += 1
            self.shuffle()
        res = self.df.loc[self.cursor: self.cursor+n-1]
        self.cursor += n
        return res['as_numbers'], res['gender']*3 + res['age_bracket'], res['length'] # gender (0/1) and age bracket (0-2) combine into 6 target classes
data = SimpleDataIterator(train)
d = data.next_batch(3)
print('Input sequences\n', d[0], end='\n\n')
print('Target values\n', d[1], end='\n\n')
print('Sequence lengths\n', d[2])
Input sequences
 0    [0, 1, 0, 49, 0, 50, 200, 9, 465, 19, 2514, 13...
1    [723, 52, 153, 30, 771, 33, 2145, 33, 4073, 79...
2    [6, 1863, 14, 13, 2678, 2482, 32, 97, 843, 0, ...
Name: as_numbers, dtype: object

Target values
 0    0
1    3
2    3
dtype: int64

Sequence lengths
 0    16
1    11
2    26
Name: length, dtype: int64

Problem

The three sequences have different lengths, so they cannot be stacked into a single tensor.

Solution

Pad the shorter sequences with zeros so that all sequences in a batch have the same length and can fit into the same tensor.
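
As a minimal sketch of the idea (with made-up token ids; the iterator below does the same thing per batch):

seqs = [[5, 3, 8], [2, 9], [7, 1, 4, 6]] # three ragged sequences
maxlen = max(len(s) for s in seqs)
padded = np.zeros([len(seqs), maxlen], dtype=np.int32)
for i, s in enumerate(seqs):
    padded[i, :len(s)] = s # left-aligned, zero-padded on the right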

class PaddedDataIterator(SimpleDataIterator):
    def next_batch(self, n):
        if self.cursor+n > self.size:
            self.epochs += 1
            self.shuffle()
        res = self.df.loc[self.cursor: self.cursor+n-1]
        self.cursor += n
    
        # Pad the various sequences with zeroes to make them the same length
        maxlen = max(res['length'])
        x = np.zeros([n, maxlen], dtype=np.int32)
        for i, x_i in enumerate(x):
            x_i[:res['length'].values[i]] = res['as_numbers'].values[i]
        
        return x, res['gender']*3 + res['age_bracket'], res['length']
data = PaddedDataIterator(train)
d = data.next_batch(3)
print('Input sequences\n', d[0], end='\n\n')
print('Target values\n', d[1], end='\n\n')
print('Sequence lengths\n', d[2])
Input sequences
 [[ 286    1 5364   42  382  153   80   15  743  116    7 2925  742    1
    10   22   34   40   36  229   15    4 1819    8 2925   50]
 [   6   65   19  289 6197   42 5973    5  771    6 2708  151   25    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [ 386    0  390 1422   13  213 1079   16   61  382   54  474   40   12
     6    3  429 1213 5687    0 8235   14  800   25    0    0]]

Target values
 0    3
1    3
2    0
dtype: int64

Sequence lengths
 0    26
1    13
2    24
Name: length, dtype: int64

Basic model for sequence classification

Have the model guess the outcome at the very last step

def reset_graph():
    if 'sess' in globals() and sess:
        sess.close()
    tf.reset_default_graph()

def build_graph(
    vocab_size = len(vocab),
    state_size = 64,
    batch_size = 256,
    num_classes = 6):
    
    reset_graph()
    
    x = tf.placeholder(tf.int32, [batch_size, None]) # [batch_size, None]
    seqlen = tf.placeholder(tf.int32, [batch_size]) #[batch_size]
    y = tf.placeholder(tf.int32, [batch_size])
    keep_prob = tf.placeholder(tf.float32, [])
    
    # Embedding layer
    embeddings = tf.get_variable('embedding_matrix', [vocab_size, state_size])
    rnn_inputs = tf.nn.embedding_lookup(embeddings, x) #[batch_size, None, state_size]
    
    # RNN with GRU
    cell = tf.nn.rnn_cell.GRUCell(state_size)
    # Easier alternative: init_state = cell.zero_state(batch_size, tf.float32)
    init_state = tf.get_variable('init', [1, state_size]) # learnable initial state, [1, state_size]
    init_state = tf.tile(init_state, [batch_size, 1]) # replicate it batch_size times
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, sequence_length=seqlen, initial_state=init_state)
    
    # rnn_outputs: [batch_size, None, state_size]; final_state: [batch_size, state_size]
    # This is a single-layer GRU with dropout applied to its outputs
    rnn_outputs = tf.nn.dropout(rnn_outputs, keep_prob)
    
    # Obtain the last relevant output of each sequence, shape=(batch_size, state_size);
    # seqlen differs per row, so the index of the last valid step does too
    last_rnn_output = tf.gather_nd(rnn_outputs, tf.stack([tf.range(batch_size), seqlen-1], axis=1))
    
    # Add a softmax layer
    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [state_size, num_classes], initializer=tf.contrib.layers.xavier_initializer())
        b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
    logits = tf.matmul(last_rnn_output, W) + b
    preds = tf.nn.softmax(logits) # shape=(batch_size, num_classes)
    correct = tf.equal(tf.cast(tf.argmax(preds, 1), tf.int32), y) # shape=(batch_size,)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)
    
    return {
        'x': x,
        'seqlen': seqlen,
        'y': y,
        'dropout': keep_prob,
        'loss': loss,
        'ts': train_step,
        'preds': preds,
        'accuracy': accuracy
    }
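
To see what the tf.gather_nd call above does, here is an equivalent NumPy sketch with illustrative shapes (batch of 3, 5 time steps, state size 4):

outputs = np.random.randn(3, 5, 4)                 # [batch, time, state]
seqlen = np.array([5, 2, 4])
idx = np.stack([np.arange(3), seqlen - 1], axis=1) # [[0, 4], [1, 1], [2, 3]]
last = outputs[idx[:, 0], idx[:, 1]]               # (3, 4): the last valid output of each sequence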

def train_graph(g, batch_size = 256, num_epochs = 10, iterator = PaddedDataIterator, dropout = 0.6):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tr = iterator(train)
        te = iterator(test)
        
        step, accuracy = 0, 0
        tr_losses, te_losses = [], [] # note: despite the names, these accumulate accuracy, not loss
        current_epoch = 0
        while current_epoch < num_epochs:
            step += 1
            batch = tr.next_batch(batch_size)
            feed = {g['x']: batch[0], g['y']: batch[1], g['seqlen']: batch[2], g['dropout']: dropout}
            accuracy_, _ = sess.run([g['accuracy'], g['ts']], feed_dict=feed)
            accuracy += accuracy_
            
            if tr.epochs > current_epoch:
                current_epoch += 1
                tr_losses.append(accuracy / step)
                # reset for evaluation
                step, accuracy = 0, 0
                # eval test set
                te_epoch = te.epochs
                while te.epochs == te_epoch:
                    step += 1
                    batch = te.next_batch(batch_size)
                    feed = {g['x']: batch[0], g['y']: batch[1], g['seqlen']: batch[2], g['dropout']: 1.0}
                    accuracy_ = sess.run([g['accuracy']], feed_dict=feed)[0]
                    accuracy += accuracy_
                    
                te_losses.append(accuracy / step)
                # reset after the evaluation, to continue to evaluate training accuracy on next epoch
                step, accuracy = 0, 0
                print("Accuracy after epoch", current_epoch, " - tr:", tr_losses[-1], "- te:", te_losses[-1])
    
    return tr_losses, te_losses

Basic test

g = build_graph()
tr_losses, te_losses = train_graph(g)
Accuracy after epoch 1  - tr: 0.3160342651213897 - te: 0.34959466527196653
Accuracy after epoch 2  - tr: 0.354831745865606 - te: 0.35830192629815744
Accuracy after epoch 3  - tr: 0.3617226750575675 - te: 0.35916234819932996
Accuracy after epoch 4  - tr: 0.36376125183169356 - te: 0.35978394577051925
Accuracy after epoch 5  - tr: 0.3654105937303747 - te: 0.3596530831239531
Accuracy after epoch 6  - tr: 0.36659465276324055 - te: 0.36046443153266333
Accuracy after epoch 7  - tr: 0.36802484561440235 - te: 0.3609159076633166
Accuracy after epoch 8  - tr: 0.36934382850115133 - te: 0.361710898241206
Accuracy after epoch 9  - tr: 0.37092448189240107 - te: 0.36232922424623115
Accuracy after epoch 10  - tr: 0.37257627695206197 - te: 0.36191373534338356

Improving with bucketing

Since padding every batch to its longest sequence introduces many zeros, let’s measure the average number of padded zeros per batch.

tr = PaddedDataIterator(train)
padding = 0
for i in range(100):
    lengths = tr.next_batch(256)[2].values
    max_len = max(lengths)
    padding += np.sum(max_len - lengths)
print("Average padding / batch:", padding/100)
Average padding / batch: 3291.11
class BucketDataIterator():
    def __init__(self, df, num_buckets = 5):
        df = df.sort_values('length').reset_index(drop=True)
        self.size = len(df) // num_buckets # each bucket's size (integer division keeps the .loc labels integral)
        self.dfs = []
        for bucket in range(num_buckets):
            self.dfs.append(df.loc[bucket*self.size: (bucket+1)*self.size - 1])
            
        self.num_buckets = num_buckets
        
        self.cursor = np.array([0] * num_buckets)
        self.shuffle()
        self.epochs = 0
        
    def shuffle(self):
        for i in range(self.num_buckets):
            # each bucket holds sequences of similar length (sorted in __init__); shuffle rows within the bucket
            self.dfs[i] = self.dfs[i].sample(frac=1).reset_index(drop=True)
            self.cursor[i] = 0
            
    def next_batch(self, n):
        # if any bucket is about to run out of rows, count an epoch and reshuffle everything
        if np.any(self.cursor+n+1 > self.size):
            self.epochs += 1
            self.shuffle()
        
        i = np.random.randint(0, self.num_buckets)
        res = self.dfs[i].loc[self.cursor[i]: self.cursor[i]+n-1]
        self.cursor[i] += n
        # Pad sequences with 0s so they are all the same length
        maxlen = max(res['length'])
        x = np.zeros([n, maxlen], dtype=np.int32)
        for j, x_j in enumerate(x): # avoid shadowing the bucket index i above
            x_j[:res['length'].values[j]] = res['as_numbers'].values[j]

        return x, res['gender']*3 + res['age_bracket'], res['length']
tr = BucketDataIterator(df=train, num_buckets=5)
padding = 0
for i in range(100):
    lengths = tr.next_batch(256)[2].values
    max_len = max(lengths)
    padding += np.sum(max_len - lengths)
print("Average padding / batch:", padding/100)
Average padding / batch: 583.76
from time import time
g = build_graph()
t = time()
tr_losses, te_losses = train_graph(g, num_epochs=1, iterator=PaddedDataIterator)
print("Total time for 1 epoch with PaddedDataIterator:", time() - t)
Accuracy after epoch 1  - tr: 0.3123986239012139 - te: 0.3490258891213389
Total time for 1 epoch with PaddedDataIterator: 380.0432941913605
g = build_graph()
t = time()
tr_losses, te_losses = train_graph(g, num_epochs=1, iterator=BucketDataIterator)
print("Total time for 1 epoch with BucketedDataIterator:", time() - t)
Accuracy after epoch 1  - tr: 0.31257560483870966 - te: 0.34867950074701193
Total time for 1 epoch with BucketDataIterator: 316.3720586299896

Basic sequence to sequence learning

Have the model guess at every step! The target label is tiled across all time steps, and the loss is masked so that padded steps do not contribute.

def build_seq2seq_graph(
    vocab_size = len(vocab),
    state_size = 64,
    batch_size = 256,
    num_classes = 6):
    
    reset_graph()
    
    # Placeholders
    x = tf.placeholder(tf.int32, [batch_size, None]) # [batch_size, num_steps]
    seqlen = tf.placeholder(tf.int32, [batch_size])
    y = tf.placeholder(tf.int32, [batch_size])
    keep_prob = tf.placeholder(tf.float32, [])
    
    # Tile the target indices
    y_ = tf.tile(tf.expand_dims(y, 1), [1, tf.shape(x)[1]]) # [batch_size, num_steps]
    
    lower_triangular_ones = tf.constant(np.tril(np.ones([30,30])), dtype=tf.float32) # 30 is the maximum sequence length in the dataset
    # tf.gather returns [batch_size, 30]; tf.slice trims it to [batch_size, max length in the current batch]
    seqlen_mask = tf.slice(tf.gather(lower_triangular_ones, seqlen-1),\
                           [0, 0], [batch_size, tf.reduce_max(seqlen)])
    
    # Embedding layer
    embeddings = tf.get_variable('embedding_matrix', [vocab_size, state_size])
    rnn_inputs = tf.nn.embedding_lookup(embeddings, x)
    
    # RNN
    cell = tf.nn.rnn_cell.GRUCell(state_size)
    init_state = cell.zero_state(batch_size, tf.float32)
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, 
                                                 sequence_length=seqlen, 
                                                 initial_state=init_state)
    
    # Adds dropout
    rnn_outputs = tf.nn.dropout(rnn_outputs, keep_prob)
    
    # Reshape rnn_outputs and y
    rnn_outputs = tf.reshape(rnn_outputs, [-1, state_size])
    y_reshaped = tf.reshape(y_, [-1])
    
    # Softmax layer
    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [state_size, num_classes])
        b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
    logits = tf.matmul(rnn_outputs, W) + b
    
    preds = tf.nn.softmax(logits)
    
    # Count correct predictions, zeroing out the padded time steps via seqlen_mask
    correct = tf.cast(tf.equal(tf.cast(tf.argmax(preds,1),tf.int32), y_reshaped),tf.int32) *\
                tf.cast(tf.reshape(seqlen_mask, [-1]),tf.int32)
        
    # To calculate accuracy we want to divide by the number of non-padded time-steps,
    # rather than taking the mean
    accuracy = tf.reduce_sum(tf.cast(correct, tf.float32)) / tf.reduce_sum(tf.cast(seqlen, tf.float32))
    
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y_reshaped)
    loss = loss * tf.reshape(seqlen_mask, [-1])
    
    # To calculate average loss, we need to divide by number of non-padded time-steps,
    # rather than taking the mean
    loss = tf.reduce_sum(loss) / tf.reduce_sum(seqlen_mask)

    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)

    return {
        'x': x,
        'seqlen': seqlen,
        'y': y,
        'dropout': keep_prob,
        'loss': loss,
        'ts': train_step,
        'preds': preds,
        'accuracy': accuracy
    }
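
The seqlen_mask construction above is the trickiest part; here is the same trick as a NumPy sketch with illustrative lengths:

tril = np.tril(np.ones([30, 30]))
seqlen = np.array([3, 5])
mask = tril[seqlen - 1]       # row i of tril has i+1 leading ones, so this row has seqlen[i] ones
mask = mask[:, :seqlen.max()] # trim columns to the longest sequence in the batch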

Test seq2seq

It should perform worse than sequence classification, since the model must also make predictions at early time steps, where it has seen little of the sequence.

g = build_seq2seq_graph()
tr_losses, te_losses = train_graph(g, iterator=BucketDataIterator)
Accuracy after epoch 1  - tr: 0.29417641936604116 - te: 0.3165940021659805
Accuracy after epoch 2  - tr: 0.32053691482279945 - te: 0.32140361183230964
Accuracy after epoch 3  - tr: 0.3248470337743139 - te: 0.3230788176369774
Accuracy after epoch 4  - tr: 0.32654755128969826 - te: 0.32458225125352463
Accuracy after epoch 5  - tr: 0.3278110629056041 - te: 0.3248658520595809
Accuracy after epoch 6  - tr: 0.329185960110063 - te: 0.32527022973785763
Accuracy after epoch 7  - tr: 0.33046067700309273 - te: 0.3260545640350034
Accuracy after epoch 8  - tr: 0.33134092020356276 - te: 0.32620091709176663
Accuracy after epoch 9  - tr: 0.332484958309292 - te: 0.32587733625728177
Accuracy after epoch 10  - tr: 0.3326295395103957 - te: 0.3262810798193513