I'm trying to run the code from here (Github link on this page): https://keras.io/examples/rl/ppo_cartpole/
I'm getting an attribute error in the training section at observation = observation.reshape(1, -1), which says "'tuple' object has no attribute 'reshape'".
It seems that observation is currently the return value of env.reset(), which is a tuple of an array (the initial observation) and an empty dictionary (the info dict).
I've tried applying the reshape to just the array, using observation[0].reshape(1, -1) or env.reset()[0], but that throws a "too many values to unpack (expected 4)" error two lines later. Does anyone know how I can fix this without messing up the rest of the code?
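For reference, here is what the environment actually returns when I inspect it (I believe my installed Gym uses the newer API, where reset() and step() changed their return values):
import gym

env = gym.make("CartPole-v0")
print(env.reset())       # (array([...], dtype=float32), {})  -- a tuple, not an array
print(len(env.step(0)))  # 5 -- which would explain the "too many values to unpack (expected 4)" error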
Minimal reproducible example, as requested:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import scipy.signal
env = gym.make("CartPole-v0")
steps_per_epoch = 4000
epochs = 30
hidden_sizes = (64, 64)
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n
observation_input = keras.Input(shape=(observation_dimensions,), dtype=tf.float32)
logits = mlp(observation_input, list(hidden_sizes) + [num_actions], tf.tanh, None)
actor = keras.Model(inputs=observation_input, outputs=logits)
observation, episode_return, episode_length = env.reset(), 0, 0
for epoch in range(epochs):
    # Initialize the sum of the returns, lengths and number of episodes
    # for each epoch
    sum_return = 0
    sum_length = 0
    num_episodes = 0
    for t in range(steps_per_epoch):
        if render:
            env.render()
        observation = observation.reshape(1, -1)
        logits, action = sample_action(observation)
        observation_new, reward, done, _ = env.step(action[0].numpy())
        episode_return += reward
        episode_length += 1
        # Get the value and log-probability of the action
        value_t = critic(observation)
        logprobability_t = logprobabilities(logits, action)
        # Store obs, act, rew, v_t, logp_pi_t
        buffer.store(observation, action, reward, value_t, logprobability_t)
        # Update the observation
        observation = observation_new
        # Finish trajectory if reached to a terminal state
        terminal = done
        if terminal or (t == steps_per_epoch - 1):
            last_value = 0 if done else critic(observation.reshape(1, -1))
            buffer.finish_trajectory(last_value)
            sum_return += episode_return
            sum_length += episode_length
            num_episodes += 1
            observation, episode_return, episode_length = env.reset(), 0, 0
where
def mlp(x, sizes, activation=tf.tanh, output_activation=None):
    # Build a feedforward neural network
    for size in sizes[:-1]:
        x = layers.Dense(units=size, activation=activation)(x)
    return layers.Dense(units=sizes[-1], activation=output_activation)(x)
and
@tf.function
def sample_action(observation):
    logits = actor(observation)
    action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
    return logits, action
In newer versions of Gym (0.26 and later), env.reset() returns observation and info, where info is an empty dictionary for CartPole. In your case, you can simply do:
observation, info = env.reset()
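A quick way to confirm this (a minimal check, assuming a Gym version with the new reset API):
import gym

env = gym.make("CartPole-v0")
result = env.reset()
print(type(result))       # <class 'tuple'>
observation, info = result
print(observation.shape)  # (4,) -- the CartPole state vector
print(info)               # {} -- empty info dict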
When the environment is reset via env.reset(), it returns only two values, observation and info, but when you step the environment via env.step(...), newer Gym versions return 5 values: observation, reward, terminated, truncated, info. Your typical training loop should look like this:
for episode in range(10):
    observation, info = env.reset()
    done = False
    while not done:
        observation, reward, terminated, truncated, info = env.step(...)
        done = terminated or truncated
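As a self-contained version of that loop, with env.action_space.sample() standing in for the policy (in the PPO example the action would come from the actor network instead):
import gym

env = gym.make("CartPole-v0")

for episode in range(10):
    observation, info = env.reset()
    done = False
    episode_return = 0.0
    while not done:
        # Random action as a placeholder for the policy
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        episode_return += reward
    print(f"episode {episode}: return {episode_return}")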
It turns out that with newer Gym versions the CartPole environment returns 5 values instead of 4 when using env.step(): observation, reward, terminated, truncated, info. Here's the full code with both fixes applied:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import scipy.signal
def mlp(x, sizes, activation=tf.tanh, output_activation=None):
    # Build a feedforward neural network
    for size in sizes[:-1]:
        x = layers.Dense(units=size, activation=activation)(x)
    return layers.Dense(units=sizes[-1], activation=output_activation)(x)

@tf.function
def sample_action(observation):
    logits = actor(observation)
    action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
    return logits, action
env = gym.make("CartPole-v0")
steps_per_epoch = 4000
epochs = 30
hidden_sizes = (64, 64)
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n
observation_input = keras.Input(shape=(observation_dimensions,), dtype=tf.float32)
logits = mlp(observation_input, list(hidden_sizes) + [num_actions], tf.tanh, None)
actor = keras.Model(inputs=observation_input, outputs=logits)
for epoch in range(epochs):
    # Initialize the sum of the returns, lengths and number of episodes
    # for each epoch
    observation, episode_return, episode_length = env.reset()[0], 0, 0  # <-- HERE
    sum_return = 0
    sum_length = 0
    num_episodes = 0
    for t in range(steps_per_epoch):
        if render:
            env.render()
        observation = observation.reshape(1, -1)
        logits, action = sample_action(observation)
        observation_new, reward, done, _, _ = env.step(action[0].numpy())  # <-- EDIT 2 HERE
        episode_return += reward
        episode_length += 1
        # Get the value and log-probability of the action
        value_t = critic(observation)
        logprobability_t = logprobabilities(logits, action)
        # Store obs, act, rew, v_t, logp_pi_t
        buffer.store(observation, action, reward, value_t, logprobability_t)
        # Update the observation
        observation = observation_new
        # Finish trajectory if reached to a terminal state
        terminal = done
        if terminal or (t == steps_per_epoch - 1):
            last_value = 0 if done else critic(observation.reshape(1, -1))
            buffer.finish_trajectory(last_value)
            sum_return += episode_return
            sum_length += episode_length
            num_episodes += 1
            observation, episode_return, episode_length = env.reset()[0], 0, 0  # <-- HERE
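One caveat about the second underscore in env.step(action[0].numpy()): it discards the truncated flag, so an episode that ends only because of the time limit is not treated as terminal. If you want the trajectory to finish on truncation as well, a possible variant (my own tweak, not part of the original example) would be:
observation_new, reward, terminated, truncated, _ = env.step(action[0].numpy())
done = terminated or truncated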