I am trying to apply reiforcement learning mechanism to classification tasks. I know it is useless thing to do because deep learning can overperform rl in the tasks. Anyway in research purposes I am doing.
I reward agent if he's correct positive 1 or not negative -1
and computate loss FUNC with predicted_action(predicted_class)
and reward.
But I get an error:
element 0 of tensors does not require grad and does not have a grad_fn
# creating model
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.pipe = nn.Sequential(nn.Linear(9, 120),
nn.ReLU(),
nn.Linear(120, 64),
nn.ReLU(),
nn.Linear(64,2),
nn.Softmax()
)
def forward(self, x):
return self.pipe(x)
def env_step(action, label, size):
total_reward = []
for i in range(size):
reward = 0
if action[i] == label[i]:
total_reward.append(reward+1)
continue
else:
total_reward.append(reward-1)
continue
return total_reward
if __name__=='__main__':
epoch_size = 100
net = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.01)
total_loss = deque(maxlen = 50)
for epoch in range(epoch_size):
batch_index = 0
for i in range(13):
# batch sample
batch_xs = torch.FloatTensor(train_state[batch_index: batch_index+50]) # make tensor
batch_ys = torch.from_numpy(train_label[batch_index: batch_index+50]).type('torch.LongTensor') # make tensor
# action_prob; e.g classification prob
actions_prob = net(batch_xs)
#print(actions_prob)
action = torch.argmax(actions_prob, dim=1).unsqueeze(1)
#print(action)
reward = np.array(env_step(action, batch_ys, 50))
#print(reward)
reward = torch.from_numpy(reward).unsqueeze(1).type('torch.FloatTensor')
#print(reward)
action = action.type('torch.FloatTensor')
optimizer.zero_grad()
loss = criterion(action, reward)
loss.backward()
optimizer.step()
batch_index += 50
action
is produced by the argmax funtion, which is not differentiable. You instead want take the loss between the reward and the responsible probability for the action taken.
Often, the "loss" chosen for the policy in reinfocement learning is the so called score function:
Which is the product of the log of the responsible probablity for the action a
taken times the reward gained.