I am having an issue with thus piece of code I wrote to perform a custom operation in Tensorflow ( 2.3 ) . The code generally works fine but sometimes throws unexpected errors and exceptions even through it worked fine in other executions with the same inputs.
I have tried to troubleshoot the issue and I'm almost convinced that it is an evaluation dependency issue. I tried to added some dependencies controls but that did not work. Apologies for the little bit lengthy code by I really could not reproduce the issue in a smaller example. Below is my code :
import tensorflow.compat.v1 as tf
myTensor_values = tf.placeholder(dtype=tf.float32)
myTensor_l2_splits = tf.placeholder(dtype=tf.int32)
myTensor_l1_splits = tf.placeholder(dtype=tf.int32)
def innerloop_processing(begin_index , end_index , input1) :
innerloop_counter = begin_index
ta = tf.TensorArray(tf.float32, size=0, dynamic_size=True, clear_after_read=False , infer_shape=False )
def innerloop_body(counter , begin_index , end_index , input1 , ta) :
inner_being_index = input1[1][counter]
inner_end_index = input1[1][counter+1]
row = tf.slice(input1[0] , [inner_being_index] , [inner_end_index-inner_being_index])
ta = ta.write(counter-begin_index , row)
counter = counter + 1
return counter , begin_index , end_index , input1 , ta
def innerloop_cond(counter , begin_index , end_index , input1 , ta ) :
return input1[1][counter] < input1[1][end_index] -1 #stop at the next pointer of the l2_splits
results = tf.while_loop(innerloop_cond , innerloop_body , [innerloop_counter , begin_index , end_index , input1 , ta] )
print_resutls = tf.print("this is the component result :" , results[4].stack())
return results[4].stack()
def generateL1Tensor_writeback(start_offest,step,num):
values = tf.TensorArray(tf.int32, size=0, dynamic_size=True, clear_after_read=False , infer_shape=False )
def cond(values , start_offest , num ,counter) :
return counter*step <= num*step
def body(values , start_offest , num ,counter) :
values = values.write(counter,[(counter*step)+start_offest])
counter = counter+1
return values , start_offest , num ,counter
final_values , _ , _ , _ = tf.while_loop(cond,body,[values , start_offest , num , counter])
final = final_values.concat()
#print_line = tf.print(" xxxxx This is the is the split : " , final)
return final
def multiply2n_ragged(tensor1 , tensor2) :
#this function multiplies two ragged tesnsors of rank 2 . the most outer ranks of the two tensros must be equal .
#setting variables and constats
outerloop_counter = tf.constant(0 , dtype=tf.int32)
carry_on = tf.constant(0 , dtype=tf.int32)
taValues = tf.TensorArray(tf.float32, size=0, dynamic_size=True, clear_after_read=False , infer_shape=False )
taL2Splits = tf.TensorArray(tf.int32, size=0, dynamic_size=True, clear_after_read=False , infer_shape=False )
taL1Splits = tf.TensorArray(tf.int32, size=0, dynamic_size=True, clear_after_read=False , infer_shape=False )
taL1Splits = taL1Splits.write(0,[0]) ## required intialization for L1 split only
innerloop_processing_graphed = tf.function(innerloop_processing)
generateL1Tensor_writeback_graphed = tf.function(generateL1Tensor_writeback)
def outerloop_cond(counter,input1,input2 ,taValues ,taL2Splits , taL1Splits , carry_on ) :
value = tf.shape(input1[2])[0]-1
return counter < value ## this is the length of the outermost dimision , stop of this
def outloop_body(counter,input1,input2, taValues ,taL2Splits , taL1Splits , carry_on) :
l1_comp_begin = input1[2][counter] ## this is begin position of the current row in the outer split ( ie. the ith value in the outer row split tensor )
l1_comp_end = input1[2][counter+1] ## this is end position of the current row in the outer split (ie. the ith + 1 value in the outer row split tensor)
l1_comp2_begin = input2[2][counter] ## we do the same for the second components
l1_comp2_end = input2[2][counter+1] ## we do the same for the second components
comp = innerloop_processing_graphed(l1_comp_begin ,l1_comp_end ,input1 ) ## now retrive the data to be procesed for the selected rows from vector1
comp2 =innerloop_processing_graphed(l1_comp2_begin ,l1_comp2_end ,input2 ) ## do the same for vector 2
comp2 = tf.transpose(comp2) ### desired operation
multiply =tf.matmul(comp , comp2) #### This is the desired operation
myshape= tf.shape(multiply) ## calculate the shape of the result in order to prepare to write the result in a ragged tensor format.
offset = tf.cond( taValues.size() >0 ,lambda: tf.shape(taValues.concat())[0] , lambda : [0]) ### this is a hack, TensorArray.concat returns an error if the array is empty. Thus we check before calling this.
l2v = generateL1Tensor_writeback_graphed(offset,myshape[1],myshape[0]) # generate the inner row split of the result for the current element
taL2Splits=taL2Splits.write(counter,l2v) # write back the inner rowlplit to a TensorArray
taValues=taValues.write(counter,tf.reshape(multiply , [-1])) # wirte back the actual ragged tensor elemnts in a another TensorArray
carry_on=carry_on+myshape[0] ## required to calculate the outer row splite
taL1Splits=taL1Splits.write(counter+1,[carry_on]) ## This is the outmost row split.
counter = counter+1
return counter , input1,input2, taValues ,taL2Splits , taL1Splits , carry_on
outerloop_finalcounter , _ , _ , ta1,ta2,ta3,_ = tf.while_loop(outerloop_cond,outloop_body,[outerloop_counter , tensor1 , tensor2 ,taValues ,taL2Splits , taL1Splits,carry_on])
uinquie_ta2 , _ = tf.unique(ta2.concat()) # this is required since some values might be duplicate in the row split itself
final_values = ta1.concat() , uinquie_ta2 ,ta3.concat()
return final_values
t = myTensor_values , myTensor_l2_splits , myTensor_l1_splits
oo =multiply2n_ragged(t,t)
new_oo = multiply2n_ragged(oo,oo)
sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True)))
vals =np.array([1.0, 2.2 , 1.1 , 4.0, 5.0 , 1.1 , 6.0, 7.0 , 1.1 , 8.0, 9.0 , 1.1 ,10.0, 11.0 , 1.1 ])
l2_splits = np.array([0,3,6,9,12,15])
l1_splits = np.array([0, 2, 5 ])
re = sess.run([new_oo ] , feed_dict={myTensor_values:vals ,myTensor_l1_splits:l1_splits ,myTensor_l2_splits:l2_splits } )
As I said the code works fine many times , however it some times generates the below errors for the same inputs . stack traces of the some of different errors that I get :
this is the component result : [[1 2.2 1.1]
[4 5 1.1]]
this is the component result : [[1 2.2 1.1]
[4 5 1.1]]
this is the component result : [[6 7 1.1]
[8 9 1.1]
[10 11 1.1]]
this is the component result : [[6 7 1.1]
[8 9 1.1]
[10 11 1.1]]
this is the component result : [[7.05 16.21]
[16.21 42.21]]
this is the component result : [[7.05 16.21]
[16.21 42.21]]
InvalidArgumentError Traceback (most recent call last)
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
1364 try:
-> 1365 return fn(*args)
1366 except errors.OpError as e:
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1349 return self._call_tf_sessionrun(options, feed_dict, fetch_list,
-> 1350 target_list, run_metadata)
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1442 fetch_list, target_list,
-> 1443 run_metadata)
InvalidArgumentError: {{function_node __inference_innerloop_processing_13658}} {{function_node __inference_innerloop_processing_13658}} Expected size[0] in [0, 0], but got 3
[[{{node while/body/_1/while/Slice}}]]
During handling of the above exception, another exception occurred:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-18-238a2ce9a03a> in <module>
94 l2_splits = np.array([0,3,6,9,12,15])
95 l1_splits = np.array([0, 2, 5 ])
---> 96 re = sess.run([new_oo ] , feed_dict={myTensor_values:vals ,myTensor_l1_splits:l1_splits ,myTensor_l2_splits:l2_splits } )
97 print(re)
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata)
956 try:
957 result = self._run(None, fetches, feed_dict, options_ptr,
--> 958 run_metadata_ptr)
959 if run_metadata:
960 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1179 if final_fetches or final_targets or (handle and feed_dict_tensor):
1180 results = self._do_run(handle, final_targets, final_fetches,
-> 1181 feed_dict_tensor, options, run_metadata)
1182 else:
1183 results = []
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1357 if handle is None:
1358 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1359 run_metadata)
1360 else:
1361 return self._do_call(_prun_fn, handle, feeds, fetches)
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
1382 '\nsession_config.graph_options.rewrite_options.'
1383 'disable_meta_optimizer = True')
-> 1384 raise type(e)(node_def, op, message)
1386 def _extend_graph(self):
InvalidArgumentError: Expected size[0] in [0, 0], but got 3
[[{{node while/body/_1/while/Slice}}]]
and the below error as well :
CancelledError Traceback (most recent call last)
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
1364 try:
-> 1365 return fn(*args)
1366 except errors.OpError as e:
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1349 return self._call_tf_sessionrun(options, feed_dict, fetch_list,
-> 1350 target_list, run_metadata)
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1442 fetch_list, target_list,
-> 1443 run_metadata)
CancelledError: {{function_node __inference_innerloop_processing_11240}} {{function_node __inference_innerloop_processing_11240}} [_Derived_]Loop execution was cancelled.
[[{{node while/LoopCond/_20}}]]
During handling of the above exception, another exception occurred:
CancelledError Traceback (most recent call last)
<ipython-input-15-238a2ce9a03a> in <module>
94 l2_splits = np.array([0,3,6,9,12,15])
95 l1_splits = np.array([0, 2, 5 ])
---> 96 re = sess.run([new_oo ] , feed_dict={myTensor_values:vals ,myTensor_l1_splits:l1_splits ,myTensor_l2_splits:l2_splits } )
97 print(re)
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata)
956 try:
957 result = self._run(None, fetches, feed_dict, options_ptr,
--> 958 run_metadata_ptr)
959 if run_metadata:
960 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1179 if final_fetches or final_targets or (handle and feed_dict_tensor):
1180 results = self._do_run(handle, final_targets, final_fetches,
-> 1181 feed_dict_tensor, options, run_metadata)
1182 else:
1183 results = []
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1357 if handle is None:
1358 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1359 run_metadata)
1360 else:
1361 return self._do_call(_prun_fn, handle, feeds, fetches)
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
1382 '\nsession_config.graph_options.rewrite_options.'
1383 'disable_meta_optimizer = True')
-> 1384 raise type(e)(node_def, op, message)
1386 def _extend_graph(self):
CancelledError: [_Derived_]Loop execution was cancelled.
[[{{node while/LoopCond/_20}}]]
I believe all errors are thrown inside innerloop_processing
. I have also opened an issue in Tensorflow github here .
it looks like the issue was from tf.Cond and this was fortunately is was reimplemented in tensorflow2. Thus removing the call :
fixes the issue.