I am trying to run the vanilla policy gradient algorithm and render the OpenAI Gym environment "CartPole-v1". The code for the algorithm is given below and runs without any errors. The Jupyter notebook for this code may be found here.
%pylab inline
import time
import tensorflow as tf
import tensorflow.keras.backend as K
import numpy as np
import gym
from tqdm import trange
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import *
env = gym.make("CartPole-v1")
env.observation_space, env.action_space
x = in1 = Input(env.observation_space.shape)
x = Dense(32)(x)
x = Activation('tanh')(x)
x = Dense(env.action_space.n)(x)
x = Lambda(lambda x: tf.nn.log_softmax(x, axis=-1))(x)
m = Model(in1, x)
def loss(y_true, y_pred):
    # y_pred is the log probs of the actions
    # y_true is the action mask weighted by sum of rewards
    return -tf.reduce_sum(y_true*y_pred, axis=-1)
m.compile(Adam(1e-2), loss)
m.summary()
lll = []
# this is like 5x faster than calling m.predict and picking in numpy
pf = K.function(m.layers[0].input, tf.random.categorical(m.layers[-1].output, 1)[0])
tt = trange(40)
for epoch in tt:
    X, Y = [], []
    ll = []
    while len(X) < 8192:
        obs = env.reset()
        acts, rews = [], []
        while True:
            # pick action
            #act_dist = np.exp(m.predict_on_batch(obs[None])[0])
            #act = np.random.choice(range(env.action_space.n), p=act_dist)
            # pick action (fast!)
            act = pf(obs[None])[0]
            # save this state action pair
            X.append(np.copy(obs))
            acts.append(act)
            # take the action
            obs, rew, done, _ = env.step(act)
            rews.append(rew)
            if done:
                for i, act in enumerate(acts):
                    act_mask = np.zeros((env.action_space.n))
                    act_mask[act] = np.sum(rews[i:])
                    Y.append(act_mask)
                ll.append(np.sum(rews))
                break
    loss = m.train_on_batch(np.array(X), np.array(Y))
    lll.append((np.mean(ll), loss))
    tt.set_description("ep_rew:%7.2f loss:%7.2f" % lll[-1])
    tt.refresh()
plot([x[0] for x in lll], label="Mean Episode Reward")
plot([x[1] for x in lll], label="Epoch Loss")
plt.legend()
When I try to render the environment I get an IndexError:
obs = env.reset()
rews = []
while True:
    env.render()
    pred, act = [x[0] for x in pf(obs[None])]
    obs, rew, done, _ = env.step(np.argmax(pred))
    rews.append(rew)
    time.sleep(0.05)
    if done:
        break
print("ran %d steps, got %f reward" % (len(rews), np.sum(rews)))
in <listcomp>(.0)
      3 while True:
      4     env.render()
----> 5     pred, act = [x[0] for x in pf(obs[None])]
      6     obs, rew, done, _ = env.step(np.argmax(pred))
      7     rews.append(rew)

IndexError: invalid index to scalar variable.
I read that this happens when you try to index a numpy scalar such as numpy.int64 or numpy.float64. However, I am not sure where the error is stemming from or how I should go about solving this issue. Any help or suggestions would be appreciated.
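For reference, indexing a bare numpy scalar does give exactly this message (a minimal illustration, not from my notebook):

import numpy as np
x = np.int64(5)
x[0]  # IndexError: invalid index to scalar variable.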
Looks like you might've changed the way pf works (it now returns only the sampled action, not the log probs) but forgotten to update your rendering code. Try this (I haven't tested it):
act, = pf(obs[None]) # same as pf(obs[None])[0] but asserts shape
obs, rew, done, _ = env.step(act)
This will select the action randomly, like at training time. If you want the greedy action you'll need to change a few more things.
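For example (again untested), you could build a second backend function that returns the raw log-probabilities and take the argmax yourself; logp_f is just a name I made up:

# a second K.function that returns the log-probs instead of a sampled action
logp_f = K.function(m.layers[0].input, m.layers[-1].output)

# inside the render loop, replacing the sampling call
log_probs = logp_f(obs[None])[0]   # shape (n_actions,)
act = int(np.argmax(log_probs))    # greedy action
obs, rew, done, _ = env.step(act)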