Tags: neural-network, tensorflow

Categorical Variables in TensorFlow


I'm trying to use TensorFlow on a dataset that has a few categorical variables. I've encoded them as dummies, but it looks like this is causing trouble and TF is complaining that the dataset is not dense.

Or is the reason for the error something totally different?

I'm trying to run a simple neural network model with one hidden layer, trained with stochastic gradient descent. The code was working when the input consisted of numeric variables (images of digits from MNIST).
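
For context, the dummy encoding was done along these lines; this is a sketch with illustrative column names, not the exact preprocessing code (pd.get_dummies is the standard pandas route):

import pandas as pd

# pd.get_dummies expands each categorical column into one 0/1 indicator
# column per level (the input frame and column names here are illustrative)
raw = pd.DataFrame({
    'Day': ['Tuesday', 'Friday', 'Sunday'],
    'District': ['CENTRAL', 'MISSION', 'CENTRAL'],
})
encoded = pd.get_dummies(raw, columns=['Day', 'District'])
print(encoded.columns.tolist())
# ['Day_Friday', 'Day_Sunday', 'Day_Tuesday', 'District_CENTRAL', 'District_MISSION']

Here is the traceback: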

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-473-7517101e1879> in <module>()
     37     return(test_acc,round(l,5))
     38 
---> 39 define_batch(0.005)
     40 run_batch()

<ipython-input-472-48b4e30f8e9e> in define_batch(beta)
     11                                         shape=(batch_size, num_var))
     12       tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
---> 13       tf_valid_dataset = tf.constant(valid_dataset)
     14       tf_test_dataset = tf.constant(test_dataset)
     15 

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/tensorflow/python/ops/constant_op.pyc in constant(value, dtype, shape, name)
    159   tensor_value = attr_value_pb2.AttrValue()
    160   tensor_value.tensor.CopyFrom(
--> 161       tensor_util.make_tensor_proto(value, dtype=dtype, shape=shape))
    162   dtype_value = attr_value_pb2.AttrValue(type=tensor_value.tensor.dtype)
    163   const_tensor = g.create_op(

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/tensorflow/python/framework/tensor_util.pyc in make_tensor_proto(values, dtype, shape)
    320       nparray = np.array(values, dtype=np_dt)
    321       if list(nparray.shape) != _GetDenseDimensions(values):
--> 322         raise ValueError("Argument must be a dense tensor: %s" % values)
    323     # python/numpy default float type is float64. We prefer float32 instead.
    324     if (nparray.dtype == np.float64) and dtype is None:

ValueError: Argument must be a dense tensor:
        Tuesday  Wednesday  Thursday  Friday  Saturday  Sunday  CENTRAL  \
736114      0.0        0.0       0.0     0.0       1.0     0.0      0.0
437148      0.0        0.0       1.0     0.0       0.0     0.0      0.0
605041      0.0        0.0       0.0     0.0       0.0     0.0      0.0
444608      0.0        0.0       0.0     0.0       1.0     0.0      0.0
695549      0.0        0.0       0.0     0.0       1.0     0.0      0.0
662807      0.0        0.0       0.0     1.0       0.0     0.0      0.0
238635      0.0        0.0       0.0     0.0       0.0     1.0      0.0
549524      0.0        0.0       0.0     1.0       0.0     0.0      0.0
705478      1.0        0.0       0.0     0.0       0.0     0.0      0.0
557716      0.0        0.0       0.0     1.0       0.0     0.0      0.0
41808       0.0        0.0       0.0     0.0       0.0     1.0      0.0
227235      1.0        0.0       0.0     0.0       0.0     0.0      0.0
848719      0.0        0.0       0.0     0.0       0.0     0.0      0.0
731202      0.0        0.0       0.0     0.0       1.0     0.0      0.0
467516      1.0        0.0       0.0     0.0       0.0     0.0      1.0

Here is an excerpt from the code:

# Adding regularization to the 1 hidden layer network
def define_batch(beta):
    batch_size = 128
    num_RELU = 256
    graph1 = tf.Graph()
    with graph1.as_default():

      # Input data. For the training data, we use a placeholder that will be fed
      # at run time with a training minibatch.
      tf_train_dataset = tf.placeholder(tf.float32,
                                        shape=(batch_size, num_var))
      tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
      tf_valid_dataset = tf.constant(valid_dataset)
      tf_test_dataset = tf.constant(test_dataset)
  
      # Variables.
      weights_RELU = tf.Variable(
        tf.truncated_normal([num_var, num_RELU]))
      biases_RELU = tf.Variable(tf.zeros([num_RELU]))
      weights_layer1 = tf.Variable(
        tf.truncated_normal([num_RELU, num_labels]))
      biases_layer1 = tf.Variable(tf.zeros([num_labels]))
  
      # Training computation.
      logits_RELU = tf.matmul(tf_train_dataset, weights_RELU) + biases_RELU
      RELU_vec = tf.nn.relu(logits_RELU)
      logits_layer = tf.matmul(RELU_vec, weights_layer1) + biases_layer1                  
      # loss = tf.reduce_mean(
      #        tf.nn.softmax_cross_entropy_with_logits(logits_layer, tf_train_labels))
      cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits_layer, tf_train_labels, name="cross_entropy")
      l2reg = tf.reduce_sum(tf.square(weights_RELU)) + tf.reduce_sum(tf.square(weights_layer1))
      loss = tf.reduce_mean(cross_entropy + beta * l2reg)
  
      # Optimizer.
      optimizer = tf.train.GradientDescentOptimizer(0.3).minimize(loss)
  
      # Predictions for the training, validation, and test data.
      train_prediction = tf.nn.softmax(logits_layer)
      valid_prediction = tf.nn.softmax(
        tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights_RELU) + biases_RELU), weights_layer1) + biases_layer1)
                         
      test_prediction = tf.nn.softmax(
        tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights_RELU) + biases_RELU), weights_layer1) + biases_layer1)
                         
import datetime

startTime = datetime.datetime.now() 

num_steps = 301 # change to 3001

def run_batch():

    with tf.Session(graph=graph1) as session:
 
      tf.initialize_all_variables().run()
      print("Initialized")
      for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch. 
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions, logits = session.run(
          [optimizer, loss,train_prediction,logits_RELU], feed_dict=feed_dict)
        if (step % 500 == 0):
          print("Minibatch loss at step %d: %f" % (step, l))
          print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
          print("Validation accuracy: %.1f%%" % accuracy(
            valid_prediction.eval(), valid_labels))
      test_acc = accuracy(test_prediction.eval(), test_labels)
      print("Test accuracy: %.1f%%" % test_acc)

      print('loss=%s' % l)
    x = datetime.datetime.now() - startTime
    print(x)
    return(test_acc,round(l,5))
    
define_batch(0.005)
run_batch()

EDIT: @gdhal, thanks for looking at it.

train_dataset is a pandas DataFrame:

train_dataset.columns
Index([u'Tuesday', u'Wednesday', u'Thursday', u'Friday', u'Saturday',
       u'Sunday', u'CENTRAL', u'INGLESIDE', u'MISSION', u'NORTHERN', u'PARK',
       u'RICHMOND', u'SOUTHERN', u'TARAVAL', u'TENDERLOIN', u' 3H - 4H',
       u' 5H - 6H', u' 7H - 8H', u' 9H - 10H', u'11H - 12H', u'13H - 14H',
       u'15H - 16H', u'17H - 18H', u'19H - 20H', u'21H - 22H', u'23H - 0H',
       u'Xnorm', u'Ynorm', u'Hournorm'],
      dtype='object')

All the variables are dummies (taking 0 or 1 values) except the last 3 (Xnorm, Ynorm, and Hournorm), which are numerical values normalized to the [0, 1] interval. valid_dataset and test_dataset have the same format.
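
Since every column is numeric, the frame should flatten to a dense float array; a quick sanity check along these lines (my own illustrative snippet, not part of the original code):

import numpy as np

# An all-numeric DataFrame converts to a plain 2-D float array; an object
# dtype here would point to a column that did not encode cleanly.
arr = np.asarray(train_dataset, dtype=np.float32)
print(arr.shape, arr.dtype)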

train_labels is a pandas Series:

train_labels.describe()

count            790184
unique               39
top       LARCENY/THEFT
freq             157434
Name: Category, dtype: object

valid_labels and test_labels have the same format.
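
Note that run_batch slices train_labels as a 2-D array of shape (batch_size, num_labels), so the string Series presumably gets one-hot encoded before training; a minimal sketch of that step (my guess at the missing code, using pd.get_dummies):

import numpy as np
import pandas as pd

# One-hot encode the 39 string categories into a dense float32 array;
# pd.get_dummies on a Series yields one indicator column per category.
train_labels = pd.get_dummies(train_labels).values.astype(np.float32)
print(train_labels.shape)  # (790184, 39)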


Solution

  • Try feeding a numpy array instead of a pandas DataFrame, as sketched below. tf.constant (and feed_dict) expect a dense numpy array or a nested list; the shape check in make_tensor_proto (visible in the traceback) does not recognize a DataFrame as a nested sequence, so the dimensions it infers disagree with the shape of the converted array, and it raises "Argument must be a dense tensor". The dummy encoding itself is fine; the container type is the problem.
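
A minimal sketch of the conversion (variable names as in the question; np.asarray could equally be replaced by the DataFrame's .values attribute):

import numpy as np

# Convert the pandas containers to dense float32 numpy arrays before
# building the graph; tf.constant and feed_dict then receive plain arrays.
train_dataset = np.asarray(train_dataset, dtype=np.float32)
valid_dataset = np.asarray(valid_dataset, dtype=np.float32)
test_dataset = np.asarray(test_dataset, dtype=np.float32)

# The labels need the same treatment if they are still pandas objects
# after one-hot encoding:
# train_labels = np.asarray(train_labels, dtype=np.float32)

define_batch(0.005)
run_batch()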