Search code examples
tensorflow nan tensor

How can I impute NaN values with the mean of a column in Keras?


I am trying to impute the NaN values in a tensor with the mean of the corresponding column of that tensor. I know that this can easily be done with, for example, SimpleImputer() in sklearn; however, I want to implement all of my feature engineering in Keras or TensorFlow so that I can add it as a Lambda layer in a neural network.

I currently have a function like this; however, I am getting an error:

s = tf.convert_to_tensor(df_train)

# Intended to replace the NaN entries of `tensor` with its mean.
# As written it fails; see the traceback that follows.
def impute_mean(tensor):
    tensor = tf.dtypes.cast(tensor, tf.float32)
    mean = tft.mean(tensor)  # the RuntimeError in the traceback is raised from this call
    # NOTE(review): `mean` is passed as a second argument to tf.math.is_nan
    # here, so tf.where itself receives only a single argument.
    tensor = tf.where(tf.math.is_nan(tensor, mean))
    return tensor

d = impute_mean(s)
d
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_37096\4009563778.py in <module>
     12     return tensor
     13 
---> 14 d = impute_mean(s)
     15 d

~\AppData\Local\Temp\ipykernel_37096\4009563778.py in impute_mean(tensor)
      8 def impute_mean(tensor):
      9     tensor = tf.dtypes.cast(tensor, tf.float32)
---> 10     mean = tft.mean(tensor)
     11     tensor = tf.where(tf.math.is_nan(tensor, mean))
     12     return tensor

~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow_transform\common.py in wrapped_fn(*args, **kwargs)
     71             collection.append(collections.Counter())
     72           collection[0][fn.__name__] += 1
---> 73           return fn(*args, **kwargs)
     74       else:
     75         return fn(*args, **kwargs)

~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow_transform\analyzers.py in mean(x, reduce_instance_dims, name, output_dtype)
    842   """
    843   with tf.compat.v1.name_scope(name, 'mean'):
--> 844     return _mean_and_var(x, reduce_instance_dims, output_dtype)[0]
    845 
    846 

~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow_transform\analyzers.py in _mean_and_var(x, reduce_instance_dims, output_dtype)
    909     x_mean, x_var = _apply_cacheable_combiner(
    910         WeightedMeanAndVarCombiner(output_dtype.as_numpy_dtype, output_shape),
--> 911         *combine_inputs)
    912 
    913   return x_mean, x_var

~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow_transform\analyzers.py in _apply_cacheable_combiner(combiner, *tensor_inputs)
    170   outputs_value_nodes = apply_cacheable_combine_operation(
    171       combiner, *tensor_inputs)
--> 172   return tuple(map(analyzer_nodes.wrap_as_tensor, outputs_value_nodes))  # pytype: disable=bad-return-type
    173 
    174 

~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow_transform\analyzer_nodes.py in wrap_as_tensor(output_value_node)
    320   return bind_future_as_tensor(
    321       output_value_node,
--> 322       analyzer_def.output_tensor_infos[output_value_node.value_index])
    323 
    324 

~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow_transform\analyzer_nodes.py in bind_future_as_tensor(future, tensor_info, name)
    310     return _bind_future_as_tensor_v2(future, tensor_info, name)
    311   else:
--> 312     return _bind_future_as_tensor_v1(future, tensor_info, name)
    313 
    314 

~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow_transform\analyzer_nodes.py in _bind_future_as_tensor_v1(future, tensor_info, name)
    140                               name: Optional[str] = None) -> tf.Tensor:
    141   """Bind a future value as a tensor to a TF1 graph."""
--> 142   result = tf.compat.v1.placeholder(tensor_info.dtype, tensor_info.shape, name)
    143   is_asset_filepath = tensor_info.temporary_asset_info is not None
    144   tf.compat.v1.add_to_collection(TENSOR_REPLACEMENTS,

~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow\python\ops\array_ops.py in placeholder(dtype, shape, name)
   3341   """
   3342   if context.executing_eagerly():
-> 3343     raise RuntimeError("tf.placeholder() is not compatible with "
   3344                        "eager execution.")
   3345 

RuntimeError: tf.placeholder() is not compatible with eager execution.

Solution

  • Replicating sklearn.impute.SimpleImputer in tensorflow:

    sklearn:

    from sklearn.impute import SimpleImputer
    
    # Reference behaviour to replicate: NaN entries are replaced by a per-column mean.
    # NOTE: assumes numpy is already imported as `np`.
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    s = np.array([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
    # Fit on `s`: the means of the non-NaN entries per column are 7, 3.5 and 6.
    imp_mean.fit(s)
    # Transform a different array `X`; its NaNs are filled with the means learned from `s`.
    X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
    print(imp_mean.transform(X))
    #output
    [[ 7.   2.   3. ]
     [ 4.   3.5  6. ]
     [10.   3.5  9. ]]
    

    Tensorflow:

    # Compute the imputer mean: per-column mean of `s` over its non-NaN entries.
    is_missing = tf.math.is_nan(s)
    # Zero out the NaNs so they do not contribute to the column sums.
    mask = tf.where(is_missing, 0., s)
    # Count the observed entries per column from the NaN mask itself.
    # Deriving the count from the values (clipping them to [0, 1]) is only
    # correct when every observed value is >= 1; zeros, negatives and values
    # in (0, 1) would be miscounted.
    mask_norm = tf.reduce_sum(
        tf.cast(tf.math.logical_not(is_missing), mask.dtype), axis=0)
    # A column with no observed entries gives 0 / 0 = NaN, i.e. it stays unfilled.
    imp_mean = tf.math.divide(tf.reduce_sum(mask, axis=0), mask_norm)
    
    # Transform: replace each NaN in `X` with the mean of its column
    # (`imp_mean` broadcasts across the rows of `X`).
    tf.where(tf.math.is_nan(X) , imp_mean , X)
    
    #output
    [[ 7. ,  2. ,  3. ],
     [ 4. ,  3.5,  6. ],
     [10. ,  3.5,  9. ]],
    

    2.09 ms ± 120 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)