TensorFlow Deep Neural Network (Logistic Regression + 1 Hidden Layer) on MNIST

Posted 2018-10-21

Goal:

Build a deep neural network (in this case, one hidden layer beyond plain logistic regression) using TensorFlow to classify the handwritten digits 0-9.

Performance benchmarks of different approaches: https://en.wikipedia.org/wiki/MNIST_database#Classifiers

Reproduced from the Udacity Deep Learning Nanodegree (AIND): https://www.udacity.com/course/deep-learning-nanodegree--nd101

MNIST data set: https://www.tensorflow.org/api_docs/python/tf/keras/datasets/mnist
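
Concretely, the model built below computes $h = \mathrm{ReLU}(x W_1 + b_1)$ and $\hat{y} = \mathrm{softmax}(h W_2 + b_2)$, where $x \in \mathbb{R}^{784}$ is a flattened 28x28 image, $W_1 \in \mathbb{R}^{784 \times 800}$, $W_2 \in \mathbb{R}^{800 \times 10}$, and dropout is applied to $h$ during training.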

In [1]:
import math

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold
import tensorflow as tf
from tqdm import tqdm_notebook as tqdm

%matplotlib inline

Import MNIST dataset

In [2]:
def load_data():
    """Gets MNIST data. https://www.tensorflow.org/api_docs/python/tf/keras/datasets/mnist
    
    Returns:
        X_train (ndarray): training set data
        Y_train (ndarray): training set ground truth labels
        X_test (ndarray): test set data
        Y_test (ndarray): test set ground truth labels
    """
    mnist = tf.keras.datasets.mnist
    (X_train, Y_train), (X_test, Y_test) = mnist.load_data()
    print('-----Training Set Dimensions-----')
    print(X_train.shape)
    print(Y_train.shape)
    print('\n-----Test Set Dimensions-----')
    print(X_test.shape)
    print(Y_test.shape)
    return X_train, Y_train, X_test, Y_test

X_train, Y_train, X_test, Y_test = load_data()
-----Training Set Dimensions-----
(60000, 28, 28)
(60000,)

-----Test Set Dimensions-----
(10000, 28, 28)
(10000,)

Samples of the 28x28 input data

In [3]:
def show_sample_data(rows, cols):
    """Visualizes individual sample observations in a grid.
    
    Args:
        rows (int): rows in grid
        cols (int): columns in grid
    """
    # figsize is (width, height): width scales with columns, height with rows
    fig, axes = plt.subplots(rows, cols, figsize=(cols * 1.5, rows * 1.5))
    for i in range(rows):
        for j in range(cols):
            idx = np.random.randint(len(Y_train))
            axes[i, j].imshow(X_train[idx], cmap='Greys')
            axes[i, j].set_title('Label: {:d}'.format(Y_train[idx]))
            axes[i, j].set_axis_off()

show_sample_data(4, 4)

Preprocess the data

In [4]:
input_dim = X_train[0].flatten().shape[0]
n_classes = len(np.unique(Y_train))

# Flatten the 2D input images in X_train and X_test into 1D vectors
X_train = X_train.reshape([-1, input_dim])
X_test = X_test.reshape([-1, input_dim])

# Transform labels Y_train and Y_test to one-hot encodings
Y_train_onehot = np.zeros([Y_train.shape[0], n_classes])
for i in range(Y_train.shape[0]):
    Y_train_onehot[i][Y_train[i]] = 1
Y_train = Y_train_onehot

Y_test_onehot = np.zeros([Y_test.shape[0], n_classes])
for i in range(Y_test.shape[0]):
    Y_test_onehot[i][Y_test[i]] = 1
Y_test = Y_test_onehot

print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
(60000, 784) (10000, 784) (60000, 10) (10000, 10)
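
As an aside, the explicit one-hot loops above can be replaced with a vectorized NumPy equivalent. A minimal sketch, using the hypothetical names Y_train_int / Y_test_int for the original integer label arrays (since Y_train / Y_test are overwritten above):

# Row i of the 10x10 identity matrix is the one-hot vector for class i, so
# indexing it with an integer label array one-hot encodes every row at once
Y_train_onehot = np.eye(n_classes)[Y_train_int]
Y_test_onehot = np.eye(n_classes)[Y_test_int]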

Build Neural Network Model

Model Hyperparameters and Parameters

In [5]:
# Model Hyperparameters
learning_rate = 0.001
layer1_hidden_units = 800
train_keep_rate = 0.8  # tf.nn.dropout keep probability during training (drops 20% of hidden units)
test_keep_rate = 1.0   # keep everything (no dropout) at evaluation time
In [6]:
tf.reset_default_graph()

# Training data / label ground truth placeholders
X = tf.placeholder(tf.float32, [None, input_dim], name='input_X')
Y = tf.placeholder(tf.float32, [None, n_classes], name='output_Yhat')
keep_rate = tf.placeholder(tf.float32)

# Weights to train
w = {
    'layer_1_w': tf.Variable(tf.truncated_normal([input_dim, layer1_hidden_units]), name='layer_1_w'),
    'layer_2_w': tf.Variable(tf.truncated_normal([layer1_hidden_units, n_classes]), name='layer_2_w'),
}
b = {
    'layer_1_b': tf.Variable(tf.truncated_normal([layer1_hidden_units]), name='layer_1_b'),
    'layer_2_b': tf.Variable(tf.truncated_normal([n_classes]), name='layer_2_b'),
}
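
Note that tf.truncated_normal defaults to stddev=1.0, which is a large initial scale for 784 inputs and, together with the unnormalized 0-255 pixel values, accounts for the first-epoch losses in the thousands seen below. A common alternative (a sketch, not what was run here) shrinks the initial scale:

# Hypothetical smaller-scale initialization for the first layer's weights
layer_1_w = tf.Variable(tf.truncated_normal([input_dim, layer1_hidden_units], stddev=0.1))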

Build Neural Network Graph

In [7]:
# Training Hyperparameters
batch_size = 128
epochs = 50
display_epoch_step = 5
kFolds = 5
In [8]:
# Build the graph
with tf.name_scope('layer_1'):
    layer_1 = tf.add(tf.matmul(X, w['layer_1_w']), b['layer_1_b'])
    layer_1 = tf.nn.relu(layer_1, name='layer_1_relu')
    layer_1 = tf.nn.dropout(layer_1, keep_rate)

with tf.name_scope('layer_2'):
    Yhat_logits = tf.add(tf.matmul(layer_1, w['layer_2_w']), b['layer_2_b'])
In [9]:
# Loss function
train_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y, logits=Yhat_logits), 
                            name='train_loss')
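
For reference, softmax_cross_entropy_with_logits_v2 applies softmax to the logits internally and computes the per-example cross-entropy, which tf.reduce_mean then averages over the batch: $\ell(y, z) = -\sum_{c=0}^{9} y_c \log(\mathrm{softmax}(z)_c)$, where $z$ is a row of Yhat_logits and $y$ the matching one-hot label.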
In [10]:
# Optimizer 
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(train_loss)

Helper Functions

In [11]:
def _accuracy():
    """Builds an op that calculates accuracy based on Yhat_logits.
    
    Returns:
        (Tensor): fraction of predictions that match the ground truth labels
    """
    correct_predictions = tf.equal(tf.argmax(Yhat_logits, axis=1), tf.argmax(Y, axis=1))
    return tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
In [12]:
def _cross_validate(X_train, Y_train, kFolds):
    """Generator that splits training set into training and validation.  Done via cross-validation.
    
    Args:
        X_train (ndarray): training data 
        Y_train (ndarray): training labels
        kFolds (int): how many folds to split the data set into
    
    Yields:
        X_train[train_idx] (ndarray): training data fold
        Y_train[train_idx] (ndarray): training label fold
        X_train[test_idx] (ndarray): validation data fold
        Y_train[test_idx] (ndarray): validation label fold
    """
    kf = KFold(n_splits=kFolds)
    for train_idx, test_idx in kf.split(X_train):
        yield X_train[train_idx], Y_train[train_idx], X_train[test_idx], Y_train[test_idx]
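
Note that KFold defaults to shuffle=False, so the folds are contiguous slices of the training set. A quick sanity check of the generator (each of the 5 folds holds out 12,000 of the 60,000 training observations):

folds = _cross_validate(X_train, Y_train, kFolds)
X_tr, Y_tr, X_val, Y_val = next(folds)
print(X_tr.shape, X_val.shape)  # (48000, 784) (12000, 784)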
In [13]:
def _batched(X_train, Y_train, batch_size):
    """Generators that splits training data into batches.
    
    Args:
        X_train (ndarray): training data 
        Y_train (ndarray): training labels
        batch_size (int): batch size
    
    Yields:
        X_train[idx:idx+batch_size] (ndarray): next batch of training data
        Y_train[idx:idx+batch_size] (ndarray): next batch of training label data
    """
    idx = 0
    while idx < len(X_train):
        if len(X_train) - idx < batch_size:
            yield X_train[idx:], Y_train[idx:]
        else:
            yield X_train[idx:idx+batch_size], Y_train[idx:idx+batch_size]
        idx += batch_size
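
Continuing the sketch above: a 48,000-observation training fold with batch_size = 128 splits into exactly 375 batches (48000 / 128 = 375); fold sizes that don't divide evenly end with one smaller batch.

batches = _batched(X_tr, Y_tr, batch_size)
bX, bY = next(batches)
print(bX.shape, bY.shape)  # (128, 784) (128, 10)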

Train the Model (main loop)

In [14]:
# Run the model
init = tf.global_variables_initializer()
accuracy = _accuracy()  # build the accuracy op once so new nodes aren't added to the graph every epoch
sess = tf.InteractiveSession()
sess.run(init)

model_train_losses = []
model_train_acc = []
model_valid_losses = []
model_valid_acc = []
model_test_acc = []

for epoch in tqdm(range(epochs)):
    # Split training data into KFolds
    if epoch % kFolds == 0:
        kFolded = _cross_validate(X_train, Y_train, kFolds)
    X_train_fold, Y_train_fold, X_valid_fold, Y_valid_fold = next(kFolded)
    
    # Train using batches
    total_batches = math.ceil(X_train_fold.shape[0] / batch_size)
    batched_data = _batched(X_train_fold, Y_train_fold, batch_size)
    for batch in range(total_batches):
        batch_X_train, batch_Y_train = next(batched_data)
        sess.run(optimizer, feed_dict={X: batch_X_train, Y: batch_Y_train, keep_rate: train_keep_rate})
    
    # Record model performance at each epoch
    # (dropout stays active for the training-set metrics below, so they include dropout noise)
    e_train_loss, e_train_acc = sess.run([train_loss, accuracy], feed_dict={X: X_train, Y: Y_train, keep_rate: train_keep_rate})
    e_valid_loss, e_valid_acc = sess.run([train_loss, accuracy], feed_dict={X: X_valid_fold, Y: Y_valid_fold, keep_rate: test_keep_rate})
    e_test_acc = sess.run(accuracy, feed_dict={X: X_test, Y: Y_test, keep_rate: test_keep_rate})
    model_train_losses.append(e_train_loss)
    model_train_acc.append(e_train_acc)
    model_valid_losses.append(e_valid_loss)
    model_valid_acc.append(e_valid_acc)
    model_test_acc.append(e_test_acc)

    if epoch % display_epoch_step == 0:    
        # Display during training
        print('-----Epoch: {}-----'.format(epoch+1))
        print('tr_loss\t\t tr_acc \t v_loss\t\t v_acc\t\t test_acc')
        print('{0}\t {1:.4f}\t\t {2:.4f}\t {3:.4f}\t\t {4:.4f}\t\t'
              .format(str(e_train_loss), e_train_acc, e_valid_loss, e_valid_acc, e_test_acc))
    
sess.close()
-----Epoch: 1-----
tr_loss		 tr_acc 	 v_loss		 v_acc		 test_acc
3976.0513	 0.8179		 2230.8972	 0.8870		 0.8874		
-----Epoch: 6-----
tr_loss		 tr_acc 	 v_loss		 v_acc		 test_acc
800.1309	 0.9331		 476.3055	 0.9596		 0.9507		
-----Epoch: 11-----
tr_loss		 tr_acc 	 v_loss		 v_acc		 test_acc
327.8263	 0.9570		 179.5950	 0.9762		 0.9626		
-----Epoch: 16-----
tr_loss		 tr_acc 	 v_loss		 v_acc		 test_acc
151.67596	 0.9715		 84.0754	 0.9846		 0.9660		
-----Epoch: 21-----
tr_loss		 tr_acc 	 v_loss		 v_acc		 test_acc
85.37141	 0.9795		 29.5893	 0.9913		 0.9689		
-----Epoch: 26-----
tr_loss		 tr_acc 	 v_loss		 v_acc		 test_acc
50.999863	 0.9854		 11.8225	 0.9955		 0.9708		
-----Epoch: 31-----
tr_loss		 tr_acc 	 v_loss		 v_acc		 test_acc
32.42161	 0.9890		 5.6869	 0.9968		 0.9726		
-----Epoch: 36-----
tr_loss		 tr_acc 	 v_loss		 v_acc		 test_acc
23.166948	 0.9914		 2.2607	 0.9984		 0.9718		
-----Epoch: 41-----
tr_loss		 tr_acc 	 v_loss		 v_acc		 test_acc
17.396715	 0.9926		 1.5225	 0.9987		 0.9738		
-----Epoch: 46-----
tr_loss		 tr_acc 	 v_loss		 v_acc		 test_acc
17.490263	 0.9934		 1.3632	 0.9989		 0.9751		

Visualize Performance

In [16]:
plt.figure(figsize=(10,8))
plt.plot(model_train_acc, label='Training Max {0:.4f}'.format(max(model_train_acc)))
plt.plot(model_valid_acc, label='Validation Max {0:.4f}'.format(max(model_valid_acc)))
plt.plot(model_test_acc, label='Test Max {0:.4f}'.format(max(model_test_acc)))
plt.axhline(y=0.1, label='Random Guessing', linestyle='-.')

plt.title('Model Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.ylim([0.85,1])
plt.legend()
Out[16]:
<matplotlib.legend.Legend at 0x1a2b73d550>
In [17]:
plt.figure(figsize=(10,8))
plt.plot(model_train_losses, label='Training')
plt.plot(model_valid_losses, label='Validation')

plt.title('Model Losses')
plt.xlabel('Epochs')
plt.ylabel('Losses')
plt.ylim([0, 200])
plt.legend()
Out[17]:
<matplotlib.legend.Legend at 0x1a2b75af98>

Results

  • Though MNIST is an easy data set (all observations are the same input size, centered, greyscale, with no unusual orientations), we achieved under 2.4% test error with just 1 hidden layer, only a little behind the 1.6% error rate recorded for a comparable 2-layer NN: https://en.wikipedia.org/wiki/MNIST_database#Classifiers
  • Note that flattening each 28x28 image into a 1D vector discards the spatial relationships between neighboring pixels; a convolutional front end would preserve them (see the sketch after this list)
  • Dropout really helps with regularization
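
A minimal sketch of what preserving that 2D structure could look like, assuming the TF 1.x tf.layers API (hypothetical, not run in this notebook):

# Recover the 2D image structure before extracting features
X_img = tf.reshape(X, [-1, 28, 28, 1])  # NHWC: batch, height, width, channels
conv = tf.layers.conv2d(X_img, filters=32, kernel_size=5, padding='same', activation=tf.nn.relu)
pool = tf.layers.max_pooling2d(conv, pool_size=2, strides=2)  # downsample 28x28 -> 14x14
flat = tf.reshape(pool, [-1, 14 * 14 * 32])  # flatten only after spatial features are extracted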