Search code examples
Tags: python, tensorflow, tf.keras, openai-gym, policy-gradient-descent

Attribute error in PPO algorithm for Cartpole gym environment


I'm trying to run the code from here (Github link on this page): https://keras.io/examples/rl/ppo_cartpole/

I'm getting an attribute error in the training section from observation = observation.reshape(1,-1) which says "'tuple' object has no attribute 'reshape'".

It seems that observation is initially the return value of env.reset(), which is a tuple of an array (the initial observation) and an empty dictionary (info). I've tried applying the reshape to just the array, using observation[0].reshape(1,-1) or env.reset()[0], but that throws a "too many values to unpack (expected 4)" error two lines later. Does anyone know how I can fix this without messing up the rest of the code?

Minimal reproducible example as requested

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import scipy.signal

# Training hyperparameters.
epochs = 30
steps_per_epoch = 4000
hidden_sizes = (64, 64)

# Environment and the sizes the actor network is built from.
env = gym.make("CartPole-v0")
num_actions = env.action_space.n
observation_dimensions = env.observation_space.shape[0]

# Actor: an MLP mapping an observation to one logit per action.
observation_input = keras.Input(shape=(observation_dimensions,), dtype=tf.float32)
logits = mlp(
    observation_input,
    [*hidden_sizes, num_actions],
    activation=tf.tanh,
    output_activation=None,
)
actor = keras.Model(inputs=observation_input, outputs=logits)

# Start the first episode; `observation` holds whatever env.reset() returns.
observation = env.reset()
episode_return = 0
episode_length = 0

# Training loop as posted in the question: for each epoch, step the
# environment `steps_per_epoch` times, sampling actions from the actor and
# recording transitions in `buffer`.
for epoch in range(epochs):
    # Initialize the sum of the returns, lengths and number of episodes 
      for each epoch
    # NOTE(review): the line above looks like the wrapped tail of the
    # preceding comment rather than code.
    sum_return = 0
    sum_length = 0
    num_episodes = 0

    for t in range(steps_per_epoch):
            if render:
                env.render()

            # The reported AttributeError is raised here: `observation` is
            # still the tuple returned by env.reset(), which has no reshape().
            observation = observation.reshape(1, -1)
            logits, action = sample_action(observation)
            # Four-value unpacking of env.step(); presumably the source of the
            # "too many values to unpack (expected 4)" error described in the
            # question once the reshape above is worked around.
            observation_new, reward, done, _ = env.step(action[0].numpy())
            episode_return += reward
            episode_length += 1

        # NOTE(review): from here on the indentation does not match the loop
        # body above; presumably these statements belong inside the `for t`
        # loop -- confirm against the Keras example.
        # Get the value and log-probability of the action
        value_t = critic(observation)
        logprobability_t = logprobabilities(logits, action)

        # Store obs, act, rew, v_t, logp_pi_t
        buffer.store(observation, action, reward, value_t, logprobability_t)

        # Update the observation
        observation = observation_new

        # Finish trajectory if reached to a terminal state
        terminal = done
        if terminal or (t == steps_per_epoch - 1):
            # Bootstrap with the critic's estimate unless the episode ended.
            last_value = 0 if done else critic(observation.reshape(1, -1))
            buffer.finish_trajectory(last_value)
            sum_return += episode_return
            sum_length += episode_length
            num_episodes += 1
            # Start a new episode and clear the per-episode counters.
            observation, episode_return, episode_length = env.reset(), 0, 0

where

def mlp(x, sizes, activation=tf.tanh, output_activation=None):
    """Stack Dense layers on top of `x` and return the final layer's output.

    Every entry of `sizes` except the last becomes a Dense layer using
    `activation`; the last entry becomes the output Dense layer using
    `output_activation`.
    """
    hidden_widths, output_width = sizes[:-1], sizes[-1]
    for width in hidden_widths:
        hidden_layer = layers.Dense(units=width, activation=activation)
        x = hidden_layer(x)
    output_layer = layers.Dense(units=output_width, activation=output_activation)
    return output_layer(x)

and

@tf.function
def sample_action(observation):
    """Run the actor on `observation` and sample one action from its logits.

    Returns a `(logits, action)` pair, where `action` is the categorical
    sample with its sample axis (axis 1) squeezed away.
    """
    logits = actor(observation)
    sampled = tf.random.categorical(logits, 1)
    return logits, tf.squeeze(sampled, axis=1)

Solution

  • env.reset() returns observation and info, where info is empty. In our case, you can simply do:

    observation, info = env.reset()
    

    Edit:

    When the environment is reset via env.reset(), it returns only two values: observation and info. When you take a step via env.step(...), it returns the full transition instead. In Gym 0.26 and later that is five values (observation, reward, terminated, truncated, info); older releases returned four (observation, reward, done, info). Your typical training loop should look like this:

    for episode in range(10):
        # reset() returns (observation, info) in Gym >= 0.26.
        observation, info = env.reset()
        done = False
        while not done:
            # With the same API, step() returns five values; the episode is
            # over when it either terminated or was truncated (e.g. by a
            # time limit).
            observation, reward, terminated, truncated, info = env.step(...)
            done = terminated or truncated
    

    Fixed code

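    It turns out that env.step() returns 5 values instead of 4 in recent Gym versions (observation, reward, terminated, truncated, info); this is a change in the Gym API, not something specific to CartPole. Here's the code: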

    import numpy as np
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    import gym
    import scipy.signal
    
    def mlp(x, sizes, activation=tf.tanh, output_activation=None):
        """Stack Dense layers on top of `x` and return the final layer's output.

        Every entry of `sizes` except the last becomes a Dense layer using
        `activation`; the last entry becomes the output Dense layer using
        `output_activation`.
        """
        hidden_widths, output_width = sizes[:-1], sizes[-1]
        for width in hidden_widths:
            hidden_layer = layers.Dense(units=width, activation=activation)
            x = hidden_layer(x)
        output_layer = layers.Dense(units=output_width, activation=output_activation)
        return output_layer(x)
    
    @tf.function
    def sample_action(observation):
        """Run the actor on `observation` and sample one action from its logits.

        Returns a `(logits, action)` pair, where `action` is the categorical
        sample with its sample axis (axis 1) squeezed away.
        """
        logits = actor(observation)
        sampled = tf.random.categorical(logits, 1)
        return logits, tf.squeeze(sampled, axis=1)
    
    # Training hyperparameters.
    epochs = 30
    steps_per_epoch = 4000
    hidden_sizes = (64, 64)

    # Environment and the sizes the actor network is built from.
    env = gym.make("CartPole-v0")
    num_actions = env.action_space.n
    observation_dimensions = env.observation_space.shape[0]

    # Actor: an MLP mapping an observation to one logit per action.
    observation_input = keras.Input(shape=(observation_dimensions,), dtype=tf.float32)
    logits = mlp(
        observation_input,
        [*hidden_sizes, num_actions],
        activation=tf.tanh,
        output_activation=None,
    )
    actor = keras.Model(inputs=observation_input, outputs=logits)
    
    
    
    # Start the first episode once, before training. Every later reset happens
    # inside the step loop when a trajectory is finished (including on the
    # last step of each epoch), so no per-epoch reset is needed.
    # Gym >= 0.26: reset() returns (observation, info).
    observation, _ = env.reset()
    episode_return, episode_length = 0, 0

    for epoch in range(epochs):
        # Initialize the sum of the returns, lengths and number of episodes
        # for each epoch
        sum_return = 0
        sum_length = 0
        num_episodes = 0

        for t in range(steps_per_epoch):
            if render:
                env.render()

            # Sample an action from the actor and take one environment step.
            observation = observation.reshape(1, -1)
            logits, action = sample_action(observation)
            # Gym >= 0.26: step() returns
            # (observation, reward, terminated, truncated, info).
            observation_new, reward, terminated, truncated, _ = env.step(
                action[0].numpy()
            )
            # The episode is over on true termination OR on truncation
            # (e.g. the environment's time limit).
            done = terminated or truncated
            episode_return += reward
            episode_length += 1

            # Everything below must run once per step, i.e. inside the
            # `for t` loop; otherwise only one transition per epoch is stored.

            # Get the value and log-probability of the action
            value_t = critic(observation)
            logprobability_t = logprobabilities(logits, action)

            # Store obs, act, rew, v_t, logp_pi_t
            buffer.store(observation, action, reward, value_t, logprobability_t)

            # Update the observation
            observation = observation_new

            # Finish the trajectory if the episode ended or the epoch is over
            if done or (t == steps_per_epoch - 1):
                # Only a true terminal state has zero future value; a truncated
                # or cut-off trajectory is bootstrapped from the critic.
                last_value = 0 if terminated else critic(observation.reshape(1, -1))
                buffer.finish_trajectory(last_value)
                sum_return += episode_return
                sum_length += episode_length
                num_episodes += 1
                observation, _ = env.reset()
                episode_return, episode_length = 0, 0