
Calculate gradients of variables used in assignment of other variable using tf.GradientTape


How can one calculate the gradient of a variable with respect to another variable that is used in a linear combination assigned to it? The following code is executed in TensorFlow eager mode.

After some more digging through older questions, a similar question showed up. However, it does not make clear how to solve this issue. Another related question is this one, but there the same variable is reused and it targets TensorFlow v1.

I also read in this question that tf.assign (TF v1?) does not support gradients, and a potential workaround is provided there. However, I would like to apply it to the internal weights of a neural network, and I don't know how to apply that tensor-based approach in practice.

import tensorflow as tf

a = tf.Variable(1.0, name='a')
b = tf.Variable(2.0, name='b')
c = tf.Variable(3.0, name='c')

with tf.GradientTape() as tape:
    c.assign(a + b)
    loss = tf.reduce_mean(c**2)

print(tape.gradient(loss, b))  # prints None

# or another attempt
with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch([b, c])
    c.assign(a + b)
    loss = tf.reduce_mean(c**2)

print(tape.gradient(loss, b))  # also outputs None

# Working, but c is a variable in my use case
with tf.GradientTape() as tape:
    c = a + b
    loss = tf.reduce_mean(c**2)

print(tape.gradient(loss, b))  # Works

Extension:

import tensorflow as tf
a = [tf.Variable(1.0, name='a'), tf.Variable(4.0, name='aa')]
b = [tf.Variable(2.0, name='b'), tf.Variable(9.0, name='bb')]
c = [tf.Variable(3.0, name='c'), tf.Variable(0.0, name='cc')]
x = tf.Variable(0.01)

with tf.GradientTape(persistent=True) as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1-x)*_a+ x*_b, a, b)
    tf.nest.map_structure(lambda x, y: x.assign(y), c, c_)
    loss = tf.norm(c) # scalar

# This works as expected
print(tape.gradient(loss,c,output_gradients=tape.gradient(c_,b)))
# [<tf.Tensor: shape=(), dtype=float32, numpy=0.0024197185>, <tf.Tensor: shape=(), dtype=float32, numpy=0.009702832>]
# Here I would expect a single scalar gradient for x, to use with gradient descent?
print(tape.gradient(loss,c,output_gradients=tape.gradient(c_,x)))
# [<tf.Tensor: shape=(), dtype=float32, numpy=1.4518311>, <tf.Tensor: shape=(), dtype=float32, numpy=5.8216996>]

# Example of what I'd like to achieve:
with tf.GradientTape() as tape:
  c_ = tf.nest.map_structure(lambda _a, _b: (1-x)*_a+ x*_b, a, b)
  loss = tf.norm(c_) # scalar

print(tape.gradient(loss,x)) 
# tf.Tensor(5.0933886, shape=(), dtype=float32)

A more sophisticated issue:

import tensorflow as tf

a = [tf.Variable([1.0, 2.0], name='a'), tf.Variable([5.0], name='aa'), tf.Variable(7.0, name='aaa')]
b = [tf.Variable([3.0, 4.0], name='b'), tf.Variable([6.0], name='bb'), tf.Variable(8.0, name='bbb')]
c = [tf.Variable([1.0, 1.0], name='c'), tf.Variable([1.0], name='cc'), tf.Variable(1.0, name='ccc')]
x = tf.Variable(0.5, name='x')

with tf.GradientTape(persistent=True) as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1-x)*_a+ x*_b, a, b)

    tf.nest.map_structure(lambda x, y: x.assign(y), c, c_)

    loss = tf.norm(tf.nest.map_structure(lambda e: tf.norm(e), c))
    loss_without_assign = tf.norm(tf.nest.map_structure(lambda e: tf.norm(e), c_))

print(loss, loss_without_assign)
# tf.Tensor(9.974969, shape=(), dtype=float32) tf.Tensor(9.974969, shape=(), dtype=float32)

# Gives same result
#partial_grads = tf.nest.map_structure(lambda d, e: tf.nest.map_structure(lambda f, g: tape.gradient(loss, f, output_gradients=tape.gradient(g, x)), d, e), c, c_)
partial_grads = tf.nest.map_structure(lambda d, e: tape.gradient(loss, d, output_gradients=tape.gradient(e, x)), c, c_)

# Should I not use the mean here?
print(tf.reduce_sum(tf.nest.map_structure(lambda z: tf.reduce_mean(z), partial_grads)))
print(tape.gradient(loss_without_assign, x))
# Rather close
# tf.Tensor(2.3057716, shape=(), dtype=float32)
# tf.Tensor(2.3057709, shape=(), dtype=float32)

Solution

  • Maybe you can try the following:

    import tensorflow as tf
    a = tf.Variable(1.0, name='a')
    b = tf.Variable(2.0, name='b')
    c = tf.Variable(3.0, name='c')
    
    with tf.GradientTape(persistent=True) as tape:
      c_ = a + 2*b
      c.assign(c_)
      loss = tf.reduce_mean(c**2)
    
    print(tape.gradient(loss,c,output_gradients=tape.gradient(c_,b))) 
    # tf.Tensor(20.0, shape=(), dtype=float32)
    

    P.S. output_gradients is a parameter of tf.GradientTape.gradient that is tucked away and rarely noticed; it can be used to manually build a cascaded (chain-rule) differentiation.
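
    A minimal sketch of the chaining that output_gradients enables (a toy example with simple scalar functions, just for illustration): feeding tape.gradient(z, y) as output_gradients into tape.gradient(y, x) reproduces the full chain-rule gradient dz/dx.

    import tensorflow as tf

    x = tf.Variable(3.0)
    with tf.GradientTape(persistent=True) as tape:
        y = x**2       # dy/dx = 2x = 6
        z = 2.0 * y    # dz/dy = 2

    # Chain manually: dz/dx = dz/dy * dy/dx
    dz_dy = tape.gradient(z, y)                          # 2.0
    print(tape.gradient(y, x, output_gradients=dz_dy))   # tf.Tensor(12.0, shape=(), dtype=float32)
    print(tape.gradient(z, x))                           # tf.Tensor(12.0, shape=(), dtype=float32), same value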

    • For the Extension:
    import tensorflow as tf
    a = [tf.Variable(1.0, name='a'), tf.Variable(4.0, name='aa')]
    b = [tf.Variable(2.0, name='b'), tf.Variable(9.0, name='bb')]
    c = [tf.Variable(3.0, name='c'), tf.Variable(0.0, name='cc')]
    x = tf.Variable(0.0, name='x')
    
    with tf.GradientTape(persistent=True) as tape:
        c_ = tf.nest.map_structure(lambda _a, _b: (1-x)*_a+ x*_b, a, b)
        tf.nest.map_structure(lambda x, y: x.assign(y), c, c_)
        loss = tf.norm(c) # scalar
    print(tape.gradient(loss,c[0],output_gradients=tape.gradient(c_[0],x))+\
          tape.gradient(loss,c[1],output_gradients=tape.gradient(c_[1],x)))
    # tf.Tensor(5.0932484, shape=(), dtype=float32)  (slightly different from the question's 5.0933886 because x is 0.0 here instead of 0.01)
    

    Explanation:

    tf.GradientTape is based on matrix differential calculus, but .gradient() collects all derivative contributions to the same variable and adds them into a single whole. For example, when differentiating a vector with respect to a scalar, matrix calculus gives a vector of derivatives, whereas tf.GradientTape applies a reduce_sum-like step and returns one summed scalar.

    Here tape.gradient(loss, c, output_gradients=tape.gradient(c_, x)) actually does:

    tape.gradient(loss, c[0], output_gradients=tape.gradient(c_, x)[0])
    tape.gradient(loss, c[1], output_gradients=tape.gradient(c_, x)[1])

    but

    tape.gradient(c_, x)[0] != tape.gradient(c_[0], x)
    tape.gradient(c_, x)[1] != tape.gradient(c_[1], x)
    

    So tape.gradient(loss, c, output_gradients=tape.gradient(c_, x)) runs contrary to our original intention.
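
    A minimal sketch of this summation behaviour (reusing the Extension's values a=[1, 4], b=[2, 9], x=0.01): with a list target and a scalar source, tape.gradient adds the per-element derivatives together, while indexing the list first keeps them separate.

    import tensorflow as tf

    a = [tf.Variable(1.0), tf.Variable(4.0)]
    b = [tf.Variable(2.0), tf.Variable(9.0)]
    x = tf.Variable(0.01)

    with tf.GradientTape(persistent=True) as tape:
        c_ = tf.nest.map_structure(lambda _a, _b: (1-x)*_a + x*_b, a, b)

    # Whole list as target: contributions are summed, (b[0]-a[0]) + (b[1]-a[1]) = 1 + 5
    print(tape.gradient(c_, x))     # tf.Tensor(6.0, shape=(), dtype=float32)
    # Per-element targets keep the individual derivatives
    print(tape.gradient(c_[0], x))  # tf.Tensor(1.0, shape=(), dtype=float32)
    print(tape.gradient(c_[1], x))  # tf.Tensor(5.0, shape=(), dtype=float32)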

    • For the more sophisticated issue: tape.jacobian is needed
    import tensorflow as tf
    
    tf.keras.utils.set_random_seed(0)
    a = [tf.Variable(tf.random.normal(shape=[2])),tf.Variable(tf.random.normal(shape=[1])),tf.Variable(tf.random.normal(shape=[]))]
    b = [tf.Variable(tf.random.normal(shape=[2])),tf.Variable(tf.random.normal(shape=[1])),tf.Variable(tf.random.normal(shape=[]))]
    c = [tf.Variable(tf.random.normal(shape=[2])),tf.Variable(tf.random.normal(shape=[1])),tf.Variable(tf.random.normal(shape=[]))]
    x = tf.Variable(tf.random.normal(shape=[]), name='x')
    
    with tf.GradientTape(persistent=True) as tape:
        c_ = tf.nest.map_structure(lambda _a, _b: (1-x)*_a+ x*_b, a, b)
    
        tf.nest.map_structure(lambda x, y: x.assign(y), c, c_)
    
        loss = tf.norm(tf.nest.map_structure(lambda e: tf.norm(e), c))
        loss_without_assign = tf.norm(tf.nest.map_structure(lambda e: tf.norm(e), c_))
    
    print(loss, loss_without_assign)
    print(tf.reduce_sum([
        tf.reduce_sum(tape.jacobian(c_[0],x)*tape.gradient(loss,c[0])),
        tf.reduce_sum(tape.jacobian(c_[1],x)*tape.gradient(loss,c[1])),
        tf.reduce_sum(tape.jacobian(c_[2],x)*tape.gradient(loss,c[2]))
        ]))
    # tf.Tensor(0.7263656, shape=(), dtype=float32)
    print(tape.gradient(loss_without_assign, x))
    # tf.Tensor(0.7263656, shape=(), dtype=float32)
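
    A minimal sketch of why tape.jacobian is needed here (a toy vector example): tape.jacobian keeps one derivative per element of the target, whereas tape.gradient sums them into a single scalar, which would break the element-wise products used above.

    import tensorflow as tf

    x = tf.Variable(0.5)
    a = tf.constant([1.0, 2.0])
    b = tf.constant([3.0, 4.0])

    with tf.GradientTape(persistent=True) as tape:
        c_ = (1 - x)*a + x*b    # vector target, d c_/dx = b - a = [2., 2.]

    print(tape.jacobian(c_, x))   # tf.Tensor([2. 2.], shape=(2,), dtype=float32)
    print(tape.gradient(c_, x))   # tf.Tensor(4.0, shape=(), dtype=float32), summed over elements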