Search code examples
python tensorflow word-embedding tfrecord tf.data.dataset

extracting numpy value from tensorflow object during transformation


I am trying to train word embeddings using TensorFlow, and I have created adjacent-word lists from my corpus.

The number of unique words in my vocabulary is about 8000, and the number of adjacent-word lists is around 1.6 million.

Word Lists sample photo

Since the data is very large, I am trying to write the word lists in batches to TFRecord files.

def save_tfrecords_wordlist(toprocess_word_lists, path):
    """Serialize word pairs into a single TFRecord file.

    Every element of ``toprocess_word_lists`` is read at index 0 and index 1;
    both entries are UTF-8 encoded and stored as one-element bytes features
    named ``word_list_X`` and ``word_list_Y`` of a ``tf.train.Example``.

    Args:
        toprocess_word_lists: Iterable of indexable items whose first two
            entries are ``str`` objects (they must support ``.encode``).
        path: Destination file name handed to ``tf.io.TFRecordWriter``.
    """
    # Using the writer as a context manager guarantees it is flushed and
    # closed even when encoding or writing a record raises part-way through.
    with tf.io.TFRecordWriter(path) as writer:
        for word_list in toprocess_word_lists:
            features = tf.train.Features(
                feature={
                    'word_list_X': tf.train.Feature(
                        bytes_list=tf.train.BytesList(
                            value=[word_list[0].encode('utf-8')])),
                    'word_list_Y': tf.train.Feature(
                        bytes_list=tf.train.BytesList(
                            value=[word_list[1].encode('utf-8')])),
                }
            )
            example = tf.train.Example(features=features)
            writer.write(example.SerializeToString())

defining batches

# Slice boundaries into ``word_lists``: each pair of consecutive entries
# delimits one chunk, and every chunk is written to its own TFRecord file.
# NOTE(review): the last boundary is presumably len(word_lists) - confirm.
batches = [0, 250000, 500000, 750000, 1000000, 1250000, 1500000, 1641790]

# Walk the boundaries pairwise instead of indexing with range(len(...)).
for i, (batches_start, batches_end) in enumerate(zip(batches, batches[1:])):
    print(str(batches_start) + " -- " + str(batches_end))

    toprocess_word_lists = word_lists[batches_start:batches_end]
    save_tfrecords_wordlist(
        toprocess_word_lists,
        path + "/TFRecords/data_" + str(i) + ".tfrecords",
    )

##############################

def _parse_function(example_proto):
    """Parse one serialized example into its two scalar string features.

    Args:
        example_proto: A serialized ``tf.train.Example`` record.

    Returns:
        A ``(word_list_X, word_list_Y)`` tuple of the parsed string tensors.
    """
    feature_spec = {
        "word_list_X": tf.io.FixedLenFeature((), tf.string),
        "word_list_Y": tf.io.FixedLenFeature((), tf.string),
    }
    parsed = tf.io.parse_single_example(example_proto, feature_spec)

    # Open question from the author, kept for context:
    #
    #   word_list_X = parsed['word_list_X'].numpy()
    #   word_list_Y = parsed['word_list_Y'].numpy()
    #
    # Help is needed to get the numpy values out of the parsed features so
    # that a one-hot encoding matrix can be built and sent directly to
    # TensorFlow for training.
    #
    # Sample word_list_X value:
    #   <tf.Tensor: shape=(10,), dtype=string, numpy=array([b'for', b'for',
    #   b'for', b'you', b'you', b'you', b'you', b'to', b'to', b'to'],
    #   dtype=object)>
    # Sample word_list_Y value:
    #   <tf.Tensor: shape=(10,), dtype=string, numpy=array([b'is', b'to',
    #   b'recommend', b'to', b'for', b'contact', b'is', b'contact', b'you',
    #   b'the'], dtype=object)>
    return parsed["word_list_X"], parsed["word_list_Y"]

# TFRecord file(s) to read back.
# NOTE(review): the writer loop earlier in this file saves to
# path + "/TFRecords/data_<i>.tfrecords"; confirm this path is the intended one.
filenames = [path + "/JustEat_TFRecords/data.tfrecords"]

# Read the records, parse each into an (X, Y) string pair, and group the
# pairs into batches of 10.
dataset = (
    tf.data.TFRecordDataset(filenames)
    .map(_parse_function)
    .batch(10)
)

# Defining the size of the embedding
embed_size = 100

# Width of the input vector and of the softmax output; previously the literal
# 7958 was repeated in both places.
vocab_size = 7958

# Defining the neural network: a linear projection down to the embedding size
# followed by a softmax over the vocabulary.
inp = tf.keras.Input(shape=(vocab_size,))
x = tf.keras.layers.Dense(units=embed_size, activation='linear')(inp)
x = tf.keras.layers.Dense(units=vocab_size, activation='softmax')(x)

model = tf.keras.Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Optimizing the network weights
# Earlier in-memory variant, kept for reference:
#model.fit( x=X, y=Y, batch_size=256,epochs= 100)
# NOTE(review): `dataset` as built from _parse_function yields pairs of string
# tensors, while this model takes vectors of width vocab_size - the strings
# presumably need to be one-hot encoded before this call can work.
model.fit(dataset, epochs=2)

Solution

  • It appears that you can't call the .numpy() function from inside the mapping function (1, 2), although I was able to work around this by using tf.py_function (doc).

    In the example below, I have mapped my parsed dataset to a function that converts my images to np.uint8 in order to plot them using matplotlib.

    records_path = data_directory+'TFRecords'+'/data_0.tfrecord'
    # Create a dataset
    dataset = tf.data.TFRecordDataset(filenames=records_path)
    # Map our dataset to the parsing function
    parsed_dataset = dataset.map(parsing_fn)
    # Wrap converting_function in tf.py_function so that it receives eager
    # tensors, on which .numpy() is available; Tout declares the dtypes of
    # the two values it returns.
    converted_dataset = parsed_dataset.map(
        lambda image, label: tf.py_function(func=converting_function,
                                            inp=[image, label],
                                            Tout=[tf.uint8, tf.int64]))

    # Iterate the dataset directly; take(5) stops cleanly when fewer than
    # five records exist, where repeated get_next() calls would raise.
    for image, label in converted_dataset.take(5):
        plt.imshow(image)
        plt.show()
        print('label: ', label)
    

    Output:

    enter image description here

    Parsing Function:

    def parsing_fn(serialized):
        """Turn one serialized example into a decoded image and its label.

        Args:
            serialized: A serialized example read from the TFRecords file.

        Returns:
            A tuple ``(image, label)``: the output of ``tf.io.decode_jpeg``
            applied to the 'image' feature, and the int64 'label' feature.
        """
        # Names and types of the features expected in each record.
        feature_spec = {
            'image': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
        }

        record = tf.io.parse_single_example(serialized=serialized,
                                            features=feature_spec)

        # The image is stored as raw bytes; decode it into a typed tensor.
        decoded_image = tf.io.decode_jpeg(record['image'])

        return decoded_image, record['label']
    

    Related issue: TF.data.dataset.map(map_func) with Eager Mode

    Update: I haven't actually checked it, but tf.shape() also seems to be a promising alternative.