tensorflow gradient-descent multilabel-classification

Multilabel Classification with Tensorflow

I have the code below for a multilabel classification:

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split

X, Y = make_multilabel_classification(n_samples=10000, n_features=200, n_classes=10, n_labels=2,
                                      allow_unlabeled=False, random_state=1)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=2)

#.........................................................................
learning_rate = 0.001
training_epochs = 5000
display_step = 50

num_input = x_train.shape[1]
num_classes = y_train.shape[1]

def init_weights(shape):
    return tf.Variable(tf.random_normal(shape, stddev=0.01))

def model(X, w_h, w_h2, w_o, p_keep_input, p_keep_hidden): 
    X = tf.nn.dropout(X, p_keep_input)

    h = tf.nn.relu(tf.matmul(X, w_h))
    h = tf.nn.dropout(h, p_keep_hidden)

    h2 = tf.nn.relu(tf.matmul(h, w_h2))
    h2 = tf.nn.dropout(h2, p_keep_hidden)

    h3 = tf.nn.relu(tf.matmul(h2, w_h3))
    h3 = tf.nn.dropout(h3, p_keep_hidden)

    return tf.nn.sigmoid(tf.matmul(h3, w_o))

x = tf.placeholder("float", [None, num_input])
y = tf.placeholder("float", [None, num_classes])

w_h = init_weights([num_input, 500])
w_h2 = init_weights([500, 500])
w_h3 = init_weights([500, 500])
w_o = init_weights([500, num_classes])

p_keep_input = tf.placeholder("float")
p_keep_hidden = tf.placeholder("float")
pred = model(x, w_h, w_h2, w_o, p_keep_input, p_keep_hidden)

#cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=pred, labels=y))

cost = -tf.reduce_sum( (  (y*tf.log(pred + 1e-9)) + ((1-y) * tf.log(1 - pred + 1e-9)) )  , name='xentropy' )

optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(cost)
#optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)

correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

#--------------------------------------------------------------------------------
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    sess.run(tf.local_variables_initializer())

    for epoch in range(training_epochs):
        sess.run(optimizer, feed_dict = {x : x_train, y : y_train, p_keep_input: 1.0, p_keep_hidden: 1.0})
        avg_cost = sess.run(cost, feed_dict = {x : x_train, y : y_train, p_keep_input: 1.0, p_keep_hidden: 1.0})

        if epoch % display_step == 0:
            training_acc = accuracy.eval({x : x_train, y : y_train, p_keep_input: 1.0, p_keep_hidden: 1.0})
            print("Epoch:", '%03d' % (epoch), "Training Accuracy:", '%.5f' % (training_acc), "cost=", "{:.10f}".format(avg_cost))

    print("Optimization Complete!")

    a = tf.cast(tf.argmax(pred, 1),tf.float32)
    b = tf.cast(tf.argmax(y,1),tf.float32)

    roc_score = tf.metrics.auc(b, a)
    cm = tf.confusion_matrix(b, a)

    sess.run(tf.local_variables_initializer())
    print(sess.run(cm, feed_dict={x : x_test, y : y_test, p_keep_input: 1.0, p_keep_hidden: 1.0}))
    print(sess.run(roc_score, feed_dict={x : x_test, y : y_test, p_keep_input: 1.0, p_keep_hidden: 1.0}))

And the output is below:

Epoch: 000 Training Accuracy: 0.31500 cost= 62297.6406250000
Epoch: 050 Training Accuracy: 0.30722 cost= 433502.8125000000
Epoch: 100 Training Accuracy: 0.30722 cost= 433502.8125000000
Epoch: 150 Training Accuracy: 0.30722 cost= 433502.8125000000
Epoch: 200 Training Accuracy: 0.30722 cost= 433502.8125000000
Epoch: 250 Training Accuracy: 0.30722 cost= 433502.8125000000
Epoch: 300 Training Accuracy: 0.30722 cost= 433502.8125000000
Epoch: 350 Training Accuracy: 0.30722 cost= 433502.8125000000
...
Epoch: 5000 Training Accuracy: 0.30722 cost= 433502.8125000000

As above, the training accuracy remains same almost all through the training process. I varied the number of hidden layers and learning rate from 0.001, 0.01 to 0.1 and the trend was still same.

I'd appreciate some help on what I may be doing wrong.

Solution

The main problem with your code is that you are not using mini-batch gradient descent, and instead you are using the whole training data for each gradient descent update. Additionally 5000 epochs is too many I think, and I guess 50-100 will be enough (you can verify by experiment). Also at the following lines, the second one is redundant and in fact you are running the graph two times in each iteration while you want to do this once:

sess.run(optimizer, feed_dict = {x : x_train, y : y_train, p_keep_input: 1.0, p_keep_hidden: 1.0})
avg_cost = sess.run(cost, feed_dict = {x : x_train, y : y_train, p_keep_input: 1.0, p_keep_hidden: 1.0})

Correct form:

_, avg_cost= sess.run([optimizer,cost], feed_dict = {x : x_train, y : y_train, p_keep_input: 1.0, p_keep_hidden: 1.0})

The following is modified code( There is comment at the of the lines that I have added # ADDED #):

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split

X, Y = make_multilabel_classification(n_samples=10000, n_features=200, n_classes=10, n_labels=2,
                                  allow_unlabeled=False, random_state=1)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=2)

batch_size = 100 # ADDED #
num_batches = x_train.shape[0]/batch_size # ADDED #

learning_rate = 0.001
training_epochs = 5000
display_step = 1

num_input = x_train.shape[1]
num_classes = y_train.shape[1]

def init_weights(shape):
    return tf.Variable(tf.random_normal(shape, stddev=0.01))

def model(X, w_h, w_h2, w_o, p_keep_input, p_keep_hidden): 
    X = tf.nn.dropout(X, p_keep_input)

    h = tf.nn.relu(tf.matmul(X, w_h))
    h = tf.nn.dropout(h, p_keep_hidden)

    h2 = tf.nn.relu(tf.matmul(h, w_h2))
    h2 = tf.nn.dropout(h2, p_keep_hidden)

    h3 = tf.nn.relu(tf.matmul(h2, w_h3))
    h3 = tf.nn.dropout(h3, p_keep_hidden)

    return tf.nn.sigmoid(tf.matmul(h3, w_o))

x = tf.placeholder("float", [None, num_input])
y = tf.placeholder("float", [None, num_classes])

w_h = init_weights([num_input, 500])
w_h2 = init_weights([500, 500])
w_h3 = init_weights([500, 500])
w_o = init_weights([500, num_classes])

p_keep_input = tf.placeholder("float")
p_keep_hidden = tf.placeholder("float")
pred = model(x, w_h, w_h2, w_o, p_keep_input, p_keep_hidden)
cost = -tf.reduce_sum( (  (y*tf.log(pred + 1e-9)) + ((1-y) * tf.log(1 - pred + 1e-9)) )  , name='xentropy' )

optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(cost)

correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    sess.run(tf.local_variables_initializer())

    for epoch in range(training_epochs):

        for i in xrange(num_batches):# ADDED #

            indices = xrange(i*batch_size, (i+1)*batch_size)# ADDED #
           _, avg_cost= sess.run([optimizer,cost], feed_dict = {x : x_train[indices], y : y_train[indices], p_keep_input: 1.0, p_keep_hidden: 1.0})# ADDED #

        if epoch % display_step == 0:
            training_acc = accuracy.eval({x : x_train, y : y_train, p_keep_input: 1.0, p_keep_hidden: 1.0})
            print("Epoch:", '%03d' % (epoch), "Training Accuracy:", '%.5f' % (training_acc), "cost=", "{:.10f}".format(avg_cost))

    print("Optimization Complete!")

    a = tf.cast(tf.argmax(pred, 1),tf.float32)
    b = tf.cast(tf.argmax(y,1),tf.float32)

    roc_score = tf.metrics.auc(b, a)
    cm = tf.confusion_matrix(b, a)

    sess.run(tf.local_variables_initializer())
    print(sess.run(cm, feed_dict={x : x_test, y : y_test, p_keep_input: 1.0, p_keep_hidden: 1.0}))
    print(sess.run(roc_score, feed_dict={x : x_test, y : y_test, p_keep_input: 1.0, p_keep_hidden: 1.0}))