I am trying to do batch processing with nn.lstm
From the documentation https://pytorch.org/docs/master/generated/torch.nn.LSTM.html I get that h0 and c0 should be of dimension:(num_layers * num_directions, batch, hidden_size).
But when I am trying to give input tensor with batch size>1 and h0 , c0 batch size>1. It is giving me error stating: "RuntimeError: Expected hidden[0] size (1, 1, 256), got (1, 611, 256)"
Here is my code: it contains 1 memory buffer, Actor, Critic, TD3, ENV classes and main training is in TD3 which has actor and critic objects.
Can someone please help a check what am i missing here.
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.nn.functional as F
from random import random as rndm
from torch.autograd import Variable
from collections import deque
import pandas_datareader.data as pdr
import datetime
os.chdir('C:\\Users\\granthjain\\Desktop\\startup_code')
torch.set_default_tensor_type('torch.DoubleTensor')
f = open('lstm_with_noise_batch.txt', 'w+')
class ReplayBuffer(object):
def __init__(self, max_size=1e6):
self.storage = []
self.max_size = max_size
self.ptr = 0
def add(self, transition):
if len(self.storage) == self.max_size:
self.storage[int(self.ptr)] = transition
else:
self.storage.append(transition)
self.ptr = (self.ptr + 1) % self.max_size
def sample(self, batch_size):
ind = np.random.randint(0, self.ptr, size=batch_size)
ind = np.random.randint(self.ptr)
(batch_states, batch_next_states, batch_actions, batch_rewards,
batch_dones) = ([], [], [], [], [])
for i in range(ind - batch_size, ind):
(state, next_state, action, reward, done) = self.storage[i]
if state is None:
continue
elif next_state is None:
continue
elif action is None:
continue
elif reward is None:
continue
elif done is None:
continue
batch_states.append(np.array(state, copy=False))
batch_next_states.append(np.array(next_state, copy=False))
batch_actions.append(np.array(action, copy=False))
batch_rewards.append(np.array(reward, copy=False))
batch_dones.append(np.array(done, copy=False))
return (np.array(batch_states, dtype=object).astype(float),
np.array(batch_next_states,
dtype=object).astype(float), np.array(batch_actions,
dtype=object).astype(float), np.array(batch_rewards,
dtype=object).astype(float), np.array(batch_dones,
dtype=object).astype(float))
class Actor(nn.Module):
def __init__(
self,
state_dim,
action_dim,
max_action,
):
super(Actor, self).__init__()
self.lstm = nn.LSTM(state_dim, 256)
self.layer_1 = nn.Linear(256, 400)
self.layer_2 = nn.Linear(400, 300)
self.layer_3 = nn.Linear(300, action_dim)
self.max_action = max_action
def forward(self, x, hx):
(hx, cx) = hx
(output, (hx, cx)) = self.lstm(x, (hx, cx))
x = F.relu(self.layer_1(output))
x = F.relu(self.layer_2(x))
x = self.max_action * torch.tanh(self.layer_3(x))
# print("inside forward type cx:",len(output))
return (x, hx, cx)
class Critic(nn.Module):
def __init__(self, state_dim, action_dim):
super(Critic, self).__init__()
# Defining the first Critic neural network
self.lstm1 = nn.LSTM(state_dim + action_dim, 256)
self.layer_1 = nn.Linear(256, 400)
self.layer_2 = nn.Linear(400, 300)
self.layer_3 = nn.Linear(300, 1)
# Defining the second Critic neural network
self.lstm2 = nn.LSTM(state_dim + action_dim, 256)
self.layer_4 = nn.Linear(256, 400)
self.layer_5 = nn.Linear(400, 300)
self.layer_6 = nn.Linear(300, 1)
def forward(
self,
x,
u,
hx,
):
xu = torch.cat([x, u], 1)
# Forward-Propagation on the first Critic Neural Network
xu = torch.reshape(xu, (xu.shape[0], 1, 6))
(hx1, cx1) = hx
(hx2, cx2) = hx
(output, (hx1, cx1)) = self.lstm1(xu, (hx1, hx2))
x1 = F.relu(self.layer_1(output))
x1 = F.relu(self.layer_2(x1))
x1 = self.layer_3(x1)
# Forward-Propagation on the second Critic Neural Network
(output, (hx2, cx2)) = self.lstm2(xu, (hx2, cx2))
x2 = F.relu(self.layer_4(output))
x2 = F.relu(self.layer_5(x2))
x2 = self.layer_6(x2)
return (
x1,
x2,
hx1,
hx2,
cx1,
cx2,
)
def Q1(
self,
x,
u,
hx1,
):
xu = torch.cat([x, u], 1)
xu = torch.reshape(xu, (xu.shape[0], 1, 6))
(hx1, cx1) = hx1
(output, (hx1, cx1)) = self.lstm1(xu, (hx1, cx1))
x1 = F.relu(self.layer_1(output))
x1 = F.relu(self.layer_2(x1))
x1 = self.layer_3(x1)
return (x1, hx1, cx1)
class TD3(object):
def __init__(
self,
state_dim,
action_dim,
max_action,
):
self.actor = Actor(state_dim, action_dim, max_action).to(device)
self.actor_target = Actor(state_dim, action_dim,
max_action).to(device)
self.actor_target.load_state_dict(self.actor.state_dict())
self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
self.critic = Critic(state_dim, action_dim).to(device)
self.critic_target = Critic(state_dim, action_dim).to(device)
self.critic_target.load_state_dict(self.critic.state_dict())
self.critic_optimizer = \
torch.optim.Adam(self.critic.parameters())
self.max_action = max_action
def select_action(self, state, hx1):
(hx, cx) = hx1
x = self.actor(state, hx1)
return x
def train(
self,
replay_buffer,
iterations,
batch_size=50,
discount=0.99,
tau=0.005,
policy_noise=0.2,
noise_clip=0.5,
policy_freq=2,
):
b_state = torch.Tensor([])
b_next_state = torch.Tensor([])
b_done = torch.Tensor([])
b_reward = torch.Tensor([])
b_action = torch.Tensor([])
for it in range(iterations):
# print ('it: ', it, ' iterations: ', iterations)
# Step 4: We sample a batch of transitions (s, s’, a, r) from the memory
(batch_states, batch_next_states, batch_actions,
batch_rewards, batch_dones) = \
replay_buffer.sample(batch_size)
batch_states = batch_states.astype(float)
batch_next_states = batch_next_states.astype(float)
batch_actions = batch_actions.astype(float)
batch_rewards = batch_rewards.astype(float)
batch_dones = batch_dones.astype(float)
state = torch.from_numpy(batch_states)
next_state = torch.from_numpy(batch_next_states)
action = torch.from_numpy(batch_actions)
reward = torch.from_numpy(batch_rewards)
done = torch.from_numpy(batch_dones)
b_size = 1
seq_len = state.shape[0]
batch = b_size
input_size = state_dim
state = torch.reshape(state, (seq_len, 1, state_dim))
next_state = torch.reshape(next_state, (seq_len, 1,
state_dim))
done = torch.reshape(done, (seq_len, 1, 1))
reward = torch.reshape(reward, (seq_len, 1, 1))
action = torch.reshape(action, (seq_len, 1, action_dim))
b_state = torch.cat((b_state, state),dim=1)
b_next_state = torch.cat((b_next_state, next_state),dim=1)
b_done = torch.cat((b_done, done),dim=1)
b_reward = torch.cat((b_reward, reward),dim=1)
b_action = torch.cat((b_action, action),dim=1)
print("dim state:",b_state.shape)
print("dim next_state:",b_next_state.shape)
print("dim done:",b_done.shape)
print("dim reward:",b_reward.shape)
print("dim action:",b_action.shape)
# for h and c shape (num_layers * num_directions, batch, hidden_size)
h0 = torch.zeros(1, b_state.shape[1], 256)
c0 = torch.zeros(1, b_state.shape[1], 256)
# Step 5: From the next state s’, the Actor target plays the next action a’
next_action = self.actor_target(next_state, (h0, c0))
next_action = next_action[0]
# Step 6: We add Gaussian noise to this next action a’ and we clamp it in a range of values supported by the environment
noise = torch.Tensor(next_action).data.normal_(0,
policy_noise).to(device)
noise = noise.clamp(-noise_clip, noise_clip)
next_action = (next_action + noise).clamp(-self.max_action,
self.max_action)
# Step 7: The two Critic targets take each the couple (s’, a’) as input and return two Q-values Qt1(s’,a’) and Qt2(s’,a’) as outputs
result = self.critic_target(next_state, next_action, (h0,
c0))
target_Q1 = result[0]
target_Q2 = result[1]
# Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
target_Q = torch.min(target_Q1, target_Q2).double()
# Step 9: We get the final target of the two Critic models, which is: Qt = r + γ * min(Qt1, Qt2), where γ is the discount factor
target_Q = reward + (1 - done) * discount * target_Q
# Step 10: The two Critic models take each the couple (s, a) as input and return two Q-values Q1(s,a) and Q2(s,a) as outputs
action = torch.reshape(action, next_action.shape)
result = self.critic(state, action, (h0, c0))
current_Q1 = result[0]
current_Q2 = result[1]
# Step 11: We compute the loss coming from the two Critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
critic_loss = F.mse_loss(current_Q1, target_Q) \
+ F.mse_loss(current_Q2, target_Q)
# Step 12: We backpropagate this Critic loss and update the parameters of the two Critic models with a SGD optimizer
self.critic_optimizer.zero_grad()
critic_loss.backward()
self.critic_optimizer.step()
# Step 13: Once every two iterations, we update our Actor model by performing gradient ascent on the output of the first Critic model
out = self.actor(state, (h0, c0))
out = out[0]
(actor_loss, hx, cx) = self.critic.Q1(state, out, (h0,
c0))
actor_loss = -1 * actor_loss.mean()
self.actor_optimizer.zero_grad()
actor_loss.backward()
self.actor_optimizer.step()
# Step 14: Still once every two iterations, we update the weights of the Actor target by polyak averaging
for (param, target_param) in zip(self.actor.parameters(),
self.actor_target.parameters()):
target_param.data.copy_(tau * param.data + (1 - tau)
* target_param.data)
# Step 15: Still once every two iterations, we update the weights of the Critic target by polyak averaging
for (param, target_param) in zip(self.critic.parameters(),
self.critic_target.parameters()):
target_param.data.copy_(tau * param.data + (1 - tau)
* target_param.data)
# Making a save method to save a trained model
def save(self, filename, directory):
torch.save(self.actor.state_dict(), '%s/%s_actor.pth'
% (directory, filename))
torch.save(self.critic.state_dict(), '%s/%s_critic.pth'
% (directory, filename))
# Making a load method to load a pre-trained model
def load(self, filename, directory):
self.actor.load_state_dict(torch.load('%s/%s_actor.pth'
% (directory, filename)))
self.critic.load_state_dict(torch.load('%s/%s_critic.pth'
% (directory, filename)))
class ENV:
def __init__(
self,
state_dim,
action_dim,
data,
):
self.state_dim = state_dim
self.state = torch.zeros(self.state_dim)
self.state[state_dim - 1] = 100000.0
self.next_state = torch.zeros(self.state_dim)
self.next_state[state_dim - 1] = 100000.0
self.action_dim = action_dim
self.data = data
self.idx = 0
self._max_episode_steps = 200
self.state[1] = self.data[self.idx]
self.next_state[1] = self.data[self.idx]
self.buy = 0
def reset(self):
self.next_state = torch.zeros(self.state_dim)
self.next_state[state_dim - 1] = 100000.0
self.state = torch.zeros(self.state_dim)
self.state[state_dim - 1] = 100000.0
self.state[1] = self.data[self.idx]
self.next_state[1] = self.data[self.idx]
ch = self.state[0]
cp = self.state[1]
cc = self.state[2]
st = torch.tensor([ch, cp, cc])
self.buy = 0
return st
def step(self, action):
done = False
act_t = torch.argmax(action)
self.idx += 1
if act_t == 0:
cp = 1.0003 * self.state[1]
num_s = int(self.state[2] / cp)
self.next_state[0] += num_s
self.next_state[2] = self.state[2] % cp
self.next_state[1] = self.data[self.idx]
self.buy = 1
elif act_t == 1:
self.next_state[1] = self.data[self.idx]
elif act_t == 2:
self.next_state[2] = self.state[2] + self.state[1] * (1
- 0.0023) * self.state[0]
self.next_state[0] = 0
self.next_state[1] = self.data[self.idx]
if self.buy == 1:
done = True
self.buy = 0
reward = self.next_state[2] - self.state[2] \
+ self.next_state[1] * self.next_state[0] - self.state[1] \
* self.state[0] - 1
self.state[0] = self.next_state[0]
self.state[1] = self.next_state[1]
self.state[2] = self.next_state[2]
ch = self.state[0]
cp = self.state[1]
cc = self.state[2]
st = torch.tensor([ch, cp, cc])
return (st, reward, done)
# Selecting the device (CPU or GPU)
device = torch.device(('cuda' if torch.cuda.is_available() else 'cpu'))
# set the parameters
start_timesteps = 1e3 # Number of iterations/timesteps before which the model randomly chooses an action, and after which it starts to use the policy network
eval_freq = 5e1 # How often the evaluation step is performed (after how many timesteps)
max_timesteps = 5e3 # Total number of iterations/timesteps
save_models = True # Boolean checker whether or not to save the pre-trained model
expl_noise = 0.1 # Exploration noise - STD value of exploration Gaussian noise
batch_size = 200 # Size of the batch
discount = 0.99 # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005 # Target network update rate
policy_noise = 0.2 # STD of Gaussian noise added to the actions for the exploration purposes
noise_clip = 0.5 # Maximum value of the Gaussian noise added to the actions (policy)
policy_freq = 2 # Number of iterations to wait before the policy network (Actor model) is updated
state_dim = 3
action_dim = 3
max_action = 1
idx = 0
# instantiate policy
policy = TD3(state_dim, action_dim, max_action)
indices = pd.read_csv('nifty_test.csv')
indices = indices['0']
indices = pd.read_csv('EQUITY_L.csv')
indices = indices['SYMBOL']
# Create the environment for each ticker
# data = pd.read_csv('PAGEIND.csv')
for ticker in indices:
print(ticker)
ohlcv = pd.read_csv(ticker + '.csv')
data = ohlcv.copy()
data = data['Close']
data = np.array(data).reshape(-1, 1)
count = 0
max_timesteps = data.shape[0]
data = torch.DoubleTensor(data)
env = ENV(state_dim, action_dim, data)
replay_buffer = ReplayBuffer()
# init training variables
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
t0 = time.time()
obs = env.reset()
hx = torch.zeros(1, 1, 256)
cx = torch.zeros(1, 1, 256)
# Set rewards and episode timesteps to zero
episode_reward = 0
episode_timesteps = 0
episode_num = 0
# We start the main loop over max_timesteps
while total_timesteps < max_timesteps:
# If the episode is done
if done | (total_timesteps == max_timesteps - 2) \
& (episode_timesteps > 200):
count = count + 1
if (count % 100 == 0) & (count >= 100) \
| (total_timesteps == max_timesteps - 2) \
& (episode_timesteps > 200):
# If we are not at the very beginning, we start the training process of the model
if total_timesteps != 0:
print('Total Timesteps: {} Episode Num: {} Reward: {}'.format(total_timesteps,
episode_num, episode_reward))
policy.train(
replay_buffer,
episode_timesteps,
batch_size,
discount,
tau,
policy_noise,
noise_clip,
policy_freq,
)
if total_timesteps > 0.6 * max_timesteps + 1:
print('model output: Total Timesteps: {} Episode Num: {} Reward: {}'.format(total_timesteps,
episode_num, episode_reward))
f.write('model output: Total Timesteps: '
+ str(total_timesteps)
+ ' episode_num '
+ str(episode_num)
+ ' episode_reward '
+ str(episode_reward))
# When the training step is done, we reset the state of the environment
obs = env.reset()
# Set the Done to False
done = False
# Set rewards and episode timesteps to zero
episode_reward = 0
episode_timesteps = 0
episode_num += 1
hx = torch.zeros(1, 1, 256)
cx = torch.zeros(1, 1, 256)
# Before 1000 timesteps, we play random actions
if total_timesteps < 0.6 * max_timesteps:
# random action
actn = torch.randn(action_dim)
action = torch.zeros(action_dim)
action[torch.argmax(actn)] = 1
else:
# After 1000 timesteps, we switch to the model
# input of shape (seq_len, batch, input_size)
obs1 = torch.reshape(obs, (1, 1, state_dim))
action = policy.select_action(obs1, (hx, cx))
actn = action[0]
hx = action[1]
cx = action[2]
# If the explore_noise parameter is not 0, we add noise to the action and we clip it
if expl_noise != 0:
print ('policy action:', actn)
actn = actn + torch.randn(action_dim)
action = torch.zeros(action_dim)
action[torch.argmax(actn)] = 1
# The agent performs the action in the environment, then reaches the next state and receives the reward
(new_obs, reward, done) = env.step(action)
# We check if the episode is done
done_bool = (0 if episode_timesteps + 1
== env._max_episode_steps else float(done))
# We increase the total reward
episode_reward += reward
# We store the new transition into the Experience Replay memory (ReplayBuffer)
replay_buffer.add((obs, new_obs, action, reward, done_bool))
# We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
obs = new_obs
episode_timesteps += 1
total_timesteps += 1
timesteps_since_eval += 1
f.close()
and below is the output:
20MICRONS
Total Timesteps: 611 Episode Num: 0 Reward: -53044.2697380831
dim state: torch.Size([200, 611, 3])
dim next_state: torch.Size([200, 611, 3])
dim done: torch.Size([200, 611, 1])
dim reward: torch.Size([200, 611, 1])
dim action: torch.Size([200, 611, 3])
Traceback (most recent call last):
File "C:\Users\granthjain\Desktop\try_lstm.py", line 538, in <module>
policy_freq,
File "C:\Users\granthjain\Desktop\try_lstm.py", line 279, in train
next_action = self.actor_target(next_state, (h0, c0))
File "C:\Users\granthjain\anaconda3\lib\site-packages\torch\nn\modules\module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "C:\Users\granthjain\Desktop\try_lstm.py", line 106, in forward
(output, (hx, cx)) = self.lstm(x, (hx, cx))
File "C:\Users\granthjain\anaconda3\lib\site-packages\torch\nn\modules\module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "C:\Users\granthjain\anaconda3\lib\site-packages\torch\nn\modules\rnn.py", line 567, in forward
self.check_forward_args(input, hx, batch_sizes)
File "C:\Users\granthjain\anaconda3\lib\site-packages\torch\nn\modules\rnn.py", line 523, in check_forward_args
'Expected hidden[0] size {}, got {}')
File "C:\Users\granthjain\anaconda3\lib\site-packages\torch\nn\modules\rnn.py", line 187, in check_hidden_size
raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size())))
RuntimeError: Expected hidden[0] size (1, 1, 256), got (1, 611, 256)
Did you make the input dimensions as required in nn.LSTM as well? I saw that you haven't set batch_first = True, and hence the input tensor has to be in the form