python, pytorch

Derivative of Neural Network in Pytorch


I have implemented and trained a neural network in PyTorch; however, I am interested in the derivative of the neural network parameters with respect to the input. I have searched extensively for a procedure that would allow evaluating the derivative of the weights with respect to a given input, but I did not find anything. I know that I can compute the gradients of a function in the following way.

external_grad = torch.tensor([1., 1.])
Q.backward(gradient=external_grad)
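
For reference, this snippet becomes self-contained once Q is defined, for example Q = 3*a**3 - b**2 on two leaf tensors a and b (the example from the PyTorch autograd tutorial):

import torch

a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)
Q = 3*a**3 - b**2

external_grad = torch.tensor([1., 1.])
Q.backward(gradient=external_grad)

print(a.grad)   # dQ/da = 9*a**2  -> tensor([36., 81.])
print(b.grad)   # dQ/db = -2*b    -> tensor([-12., -8.])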

But how would I do that with a trained neural network instead of a function Q?

Thanks in advance.

#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
from scipy.stats import norm
from numpy import linalg as la
import numpy.random as npr
from tabulate import tabulate
from matplotlib import pyplot as plt
import random
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
#from torchvision import datasets, transforms
from torch.autograd import Variable
from torch import optim


# In[3]:


nSimul = 32768
T1 = 1.0
T2 = 2.0
K = 110.0

spot = 100.0
vol = 0.2
vol0 = 0.5 # vol is increased over the 1st period so we have more points in the wings


# simulate all Gaussian returns (N1, N2) first
# returns: matrix of shape [nSimul, TimeSteps=2]
returns = np.random.normal(size=[nSimul,2])

# generate paths, step by step, and not path by path as customary
# this is to avoid slow Python loops, using NumPy's optimized vector functions instead

# generate the vector of all scenarios for S1, of shape [nSimul]
S1 = spot * np.exp(-0.5*vol0*vol0*T1 + vol0*np.sqrt(T1)*returns[:,0])

# generate the vector of all scenarios for S2, of shape [nSimul]
S2 = S1 * np.exp(-0.5*vol*vol*(T2-T1) + vol*np.sqrt(T2-T1)*returns[:,1])

# training set, X and Y are both vectors of shape [nSimul]
X = S1
Y = np.maximum(0, S2 - K)
xAxis = np.linspace(20, 200, 100)

xAxis=xAxis.reshape(-1,1)


# In[4]:


#Normalization of the simulated data:

meanX = np.mean(X)
stdX = np.std(X)
meanY = np.mean(Y)
stdY = np.std(Y)

normX = (X - meanX) / stdX
normY = (Y - meanY) / stdY


normX=normX.reshape(-1,1)

normY=normY.reshape(-1,1)


# In[5]:


class NeuralNetwork(nn.Module):
    def __init__(self,inputsize,outputsize):
        super(NeuralNetwork, self).__init__()
        #self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(inputsize,3),
            nn.ELU(),
            nn.Linear(3, 5),
            nn.ELU(),
            nn.Linear(5,3), 
            nn.ELU(),
            nn.Linear(3,outputsize),
        )
        # note: this tensor is unused; it does not initialize the weights of the layers above
        w = torch.empty(0, 1)
        nn.init.normal_(w)
    
    def forward(self, x):
        #x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


# In[6]:


inputDim = 1       # takes variable 'x' 
outputDim = 1       # takes variable 'y'
learningRate = 0.05
epochs = 10000
#weight=torch.empty(3)
model = NeuralNetwork(inputDim, outputDim)
##### For GPU #######
if torch.cuda.is_available():
    model.cuda()
    


# In[7]:


#criterion = torch.nn.MSELoss() 
#optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)


# In[ ]:


def ridge_loss(outputs, labels):
    # mean squared error (equivalent to torch.nn.MSELoss without a regularization term)
    return torch.mean((outputs - labels)**2)


# In[9]:


#Adam optmization
criterion = torch.nn.MSELoss() 
#optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = optim.Adam(model.parameters(), lr=0.05)


# In[10]:


for epoch in range(epochs):
    # Converting inputs and labels to Variable
    if torch.cuda.is_available():
        inputs = Variable(torch.from_numpy(normX).cuda().float())
        labels = Variable(torch.from_numpy(normY).cuda().float())
    else:
        inputs = Variable(torch.from_numpy(normX).float())
        labels = Variable(torch.from_numpy(normY).float())

    # Clear gradient buffers so gradients from the previous epoch don't carry forward; we don't want to accumulate gradients
    optimizer.zero_grad()

    # get output from the model, given the inputs
    outputs = model(inputs)

    # get loss for the predicted output
    loss = criterion(outputs, labels)
    # get gradients w.r.t to parameters
    loss.backward()

    # update parameters
    optimizer.step()

    print('epoch {}, loss {}'.format(epoch, loss.item()))


# In[11]:


def predict(xs):
    # first, normalize
    nxs = (xs - meanX) / stdX
    # forward feed through ANN
    # we don't need gradients in the testing phase
    with torch.no_grad():
        if torch.cuda.is_available():
            nys = model(Variable(torch.from_numpy(nxs.reshape(-1,1)).cuda().float())).cpu().data.numpy()
        else:
            nys = model(Variable(torch.from_numpy(nxs.reshape(-1,1))).float()).data.numpy()
    
    # de-normalize output
    ys = meanY + stdY * nys
    # we get a matrix of shape [size of xs][1], which we reshape as vector [size of xs]
    return np.reshape(ys, [-1])


# In[13]:


def BlackScholes(S0,r,sigma,T,K):
    d1 =  1 / (sigma * np.sqrt(T)) * (np.log(S0/K) + (r+sigma**2/2)*T)
    d2 = d1 - sigma * np.sqrt(T)
    return norm.cdf(d1) * S0 - norm.cdf(d2) * K * np.exp(-r*T)

def BlackScholesCallDelta(S0,r,sigma,T,K):
    d1 =  1 / (sigma * np.sqrt(T)) * (np.log(S0/K) + (r+sigma**2/2)*T)
    return norm.cdf(d1)

BlackScholes_vec=np.vectorize(BlackScholes)

BlackScholesCallDelta_vec=np.vectorize(BlackScholesCallDelta)


# In[14]:


BS_price = BlackScholes_vec(S0=xAxis, r=0, sigma=0.2, T=1.0, K=110.0)
predicted=predict(xAxis)

S1=1
#line_learn = plt.plot(Sval,y,label="Deep Neural Net")
line_learn = plt.plot(xAxis,predicted,label="Neural Regression")
line_BS = plt.plot(xAxis,BS_price, label="Black-Scholes")

plt.xlabel("Spot Price")
plt.ylabel("Option Price")
#plt.title(r'Time: %1.1f' % time, loc='left', fontsize=11)
plt.title(r'Strike: %1.2f' % K, loc='right', fontsize=11)
plt.title(r'Initial price: %1.2f' % S1, loc='center', fontsize=11)
plt.legend()
#plt.savefig("deephedge.png", dpi=150)
plt.savefig("deephedge.pdf")   # save before plt.show() so the saved figure is not blank
plt.show()


# In[15]:


Prices_rg_mc_diff=[]

for i in range(len(xAxis)-1):
    delta=(predicted[i+1]-predicted[i])/(xAxis[i+1]-xAxis[i])
    Prices_rg_mc_diff.append(delta) 


# In[16]:


BS_delta=BlackScholesCallDelta(S0=xAxis,r=0,sigma=0.2,T=1.0,K=110.0)
predicted=predict(xAxis)

S1=1
#line_learn = plt.plot(Sval,y,label="Deep Neural Net")
line_learn = plt.plot(xAxis[1:],Prices_rg_mc_diff,label="Neural Regression")
line_BS = plt.plot(xAxis[1:],BS_delta[1:], label="Black-Scholes")

plt.xlabel("Spot Price")
plt.ylabel("Option Delta")
#plt.title(r'Time: %1.1f' % time, loc='left', fontsize=11)
plt.title(r'Strike: %1.2f' % K, loc='right', fontsize=11)
plt.title(r'Initial price: %1.2f' % S1, loc='center', fontsize=11)
plt.legend()
#plt.savefig("deephedge.png", dpi=150)
plt.savefig("deephedge_delta.pdf")   # save before plt.show() so the saved figure is not blank
plt.show()


# In[17]:


model.backward(retain_graph=True)


# In[ ]:


print(NeuralNetwork.weight.grad)


# In[21]:


# note: torch.from_numpy() does not accept a requires_grad argument, so these calls fail
c3=torch.from_numpy((predicted.reshape(-1,1)), requires_grad=True) 
c4=torch.from_numpy(xAxis, requires_grad=True) 
#c5=torch.Tensor(c3) 
#c6=torch.Tensor(c4)  
loss = criterion(c3,c4) # calculating loss   

loss.backward()


# In[28]:


torch.tensor(predicted.reshape(-1,1), requires_grad=True)
torch.tensor(xAxis, requires_grad=True)

criterion(torch.tensor(predicted.reshape(-1,1), requires_grad=True),torch.tensor(xAxis, requires_grad=True))


loss.backward()


Solution

  • You need to explicitly set requires_grad = True when creating a tensor. And to calculate a gradient, you first need to apply some operation to the tensor.

    Here is an example:

    import torch
    
    x = torch.rand(2, 2, requires_grad=True)
    y = x + 2
    z = y * y * 3
    out = z.mean()
    
    out.backward()
    
    print(x.grad)
    

    Output:

    tensor([[3.3720, 3.4302],
            [3.4030, 3.3605]])
    

    In this way you are using torch.autograd to calculate the gradient for the tensor x. See the autograd documentation for more.
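
    If you only need the gradient with respect to one particular tensor, torch.autograd.grad is an equivalent alternative to backward(); here is a minimal sketch that reproduces the same gradient as above without populating x.grad:

    import torch

    x = torch.rand(2, 2, requires_grad=True)
    out = (3 * (x + 2) ** 2).mean()

    # returns d(out)/dx as a tuple instead of accumulating it into x.grad
    grads = torch.autograd.grad(out, x)
    print(grads[0])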

    And for a neural network, you can simply run the network forward and call backward() on the result afterward.

    A neural network example:

    import torch
    import torch.nn as nn
    import torch.nn.functional as f
    
    x = torch.rand(2, 2)
    
    # define a neural network
    network = nn.Sequential(
        nn.Linear(2, 100),
        nn.Linear(100, 2)
    )
    
    pred = network(x)
    
    loss = f.l1_loss(pred, x)  # calculating loss (mean absolute error)
    
    loss.backward()
    
    # Update weights with the gradients (one manual SGD-style step)
    with torch.no_grad():
        network[0].weight -= 0.1 * network[0].weight.grad
        network[1].weight -= 0.1 * network[1].weight.grad
    

    Note: I didn't put any activation function in the network for the sake of simplicity.
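
    In practice you would normally let an optimizer apply that update rather than modifying the weights by hand. A short sketch with torch.optim.SGD, reusing the network, x and f from the example above:

    import torch.optim as optim

    optimizer = optim.SGD(network.parameters(), lr=0.1)

    optimizer.zero_grad()                    # clear old gradients
    loss = f.l1_loss(network(x), x)          # recompute the loss
    loss.backward()                          # compute fresh gradients
    optimizer.step()                         # apply w <- w - lr * grad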

    Example of backward() using torch.nn.MSELoss():

    import torch
    from torch.nn import MSELoss
    
    criterion = MSELoss()
    
    a = torch.tensor([1.,2.], requires_grad=True)
    b = a**2
    
    loss = criterion(b, a)
    
    loss.backward()
    
    print(a.grad)
    

    Output:

    tensor([0., 6.])
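
    Applied to the setting in the question, the derivative of the trained network's output with respect to its input (the delta) is obtained the same way: make the input tensor require gradients, run a forward pass, and call backward(). A sketch, assuming the trained model and the normalization constants meanX, stdX, meanY, stdY from the question, and assuming the model lives on the CPU:

    import numpy as np
    import torch

    xs = np.linspace(20, 200, 100)                 # spot prices to differentiate at
    nxs = (xs - meanX) / stdX                      # same normalization as in training

    inputs = torch.tensor(nxs.reshape(-1, 1), dtype=torch.float32, requires_grad=True)
    outputs = model(inputs)

    # each output row depends only on its own input row, so the gradient of the
    # sum gives d(output_i)/d(input_i) for every row at once
    outputs.sum().backward()

    # undo the normalization with the chain rule: dY/dX = (stdY / stdX) * d(normY)/d(normX)
    dydx = (stdY / stdX) * inputs.grad.numpy().reshape(-1)
    print(dydx)   # comparable to the Black-Scholes delta computed above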