I'm trying to run the code from here (Github link on this page): https://keras.io/examples/rl/ppo_cartpole/
I'm getting an attribute error in the training section at observation = observation.reshape(1, -1), which says "'tuple' object has no attribute 'reshape'".
It seems that observation is currently the return value of env.reset(), which is a tuple of an array (the initial observation) and an empty dictionary (the info dict).
I've tried applying the reshape to just the array, using observation[0].reshape(1, -1) or env.reset()[0], but that throws a "too many values to unpack (expected 4)" error two lines later. Does anyone know how I can fix this without messing up the rest of the code?
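For reference, here is what the environment actually returns when I inspect it (I believe my installed Gym uses the newer API, where reset() and step() changed their return values):
import gym

env = gym.make("CartPole-v0")
print(env.reset())       # (array([...], dtype=float32), {})  -- a tuple, not an array
print(len(env.step(0)))  # 5 -- which would explain the "too many values to unpack (expected 4)" error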
Minimal reproducible example, as requested:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import scipy.signal
env = gym.make("CartPole-v0")
steps_per_epoch = 4000
epochs = 30
hidden_sizes = (64, 64)
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n
observation_input = keras.Input(shape=(observation_dimensions,), dtype=tf.float32)
logits = mlp(observation_input, list(hidden_sizes) + [num_actions], tf.tanh, None)
actor = keras.Model(inputs=observation_input, outputs=logits)
observation, episode_return, episode_length = env.reset(), 0, 0
for epoch in range(epochs):
    # Initialize the sum of the returns, lengths and number of episodes
    # for each epoch
    sum_return = 0
    sum_length = 0
    num_episodes = 0
    for t in range(steps_per_epoch):
        if render:
            env.render()
        observation = observation.reshape(1, -1)
        logits, action = sample_action(observation)
        observation_new, reward, done, _ = env.step(action[0].numpy())
        episode_return += reward
        episode_length += 1
        # Get the value and log-probability of the action
        value_t = critic(observation)
        logprobability_t = logprobabilities(logits, action)
        # Store obs, act, rew, v_t, logp_pi_t
        buffer.store(observation, action, reward, value_t, logprobability_t)
        # Update the observation
        observation = observation_new
        # Finish trajectory if reached to a terminal state
        terminal = done
        if terminal or (t == steps_per_epoch - 1):
            last_value = 0 if done else critic(observation.reshape(1, -1))
            buffer.finish_trajectory(last_value)
            sum_return += episode_return
            sum_length += episode_length
            num_episodes += 1
            observation, episode_return, episode_length = env.reset(), 0, 0
where
def mlp(x, sizes, activation=tf.tanh, output_activation=None):
    # Build a feedforward neural network
    for size in sizes[:-1]:
        x = layers.Dense(units=size, activation=activation)(x)
    return layers.Dense(units=sizes[-1], activation=output_activation)(x)
and
@tf.function
def sample_action(observation):
    logits = actor(observation)
    action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
    return logits, action
In newer versions of Gym (0.26 and later), env.reset() returns observation and info, where info is an empty dictionary for CartPole. In your case, you can simply do:
observation, info = env.reset()
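A quick way to confirm this (a minimal check, assuming a Gym version with the new reset API):
import gym

env = gym.make("CartPole-v0")
result = env.reset()
print(type(result))       # <class 'tuple'>
observation, info = result
print(observation.shape)  # (4,) -- the CartPole state vector
print(info)               # {} -- empty info dict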
When the environment is reset via env.reset(), it returns only two values, observation and info, but when you step the environment via env.step(...), newer Gym versions return 5 values: observation, reward, terminated, truncated, info. Your typical training loop should look like this:
for episode in range(10):
    observation, info = env.reset()
    done = False
    while not done:
        observation, reward, terminated, truncated, info = env.step(...)
        done = terminated or truncated
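As a self-contained version of that loop, with env.action_space.sample() standing in for the policy (in the PPO example the action would come from the actor network instead):
import gym

env = gym.make("CartPole-v0")

for episode in range(10):
    observation, info = env.reset()
    done = False
    episode_return = 0.0
    while not done:
        # Random action as a placeholder for the policy
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        episode_return += reward
    print(f"episode {episode}: return {episode_return}")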
It turns out that with newer Gym versions the CartPole environment returns 5 values instead of 4 when using env.step(): observation, reward, terminated, truncated, info. Here's the full code with both fixes applied:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import scipy.signal
def mlp(x, sizes, activation=tf.tanh, output_activation=None):
    # Build a feedforward neural network
    for size in sizes[:-1]:
        x = layers.Dense(units=size, activation=activation)(x)
    return layers.Dense(units=sizes[-1], activation=output_activation)(x)

@tf.function
def sample_action(observation):
    logits = actor(observation)
    action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
    return logits, action
env = gym.make("CartPole-v0")
steps_per_epoch = 4000
epochs = 30
hidden_sizes = (64, 64)
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n
observation_input = keras.Input(shape=(observation_dimensions,), dtype=tf.float32)
logits = mlp(observation_input, list(hidden_sizes) + [num_actions], tf.tanh, None)
actor = keras.Model(inputs=observation_input, outputs=logits)
for epoch in range(epochs):
    # Initialize the sum of the returns, lengths and number of episodes
    # for each epoch
    observation, episode_return, episode_length = env.reset()[0], 0, 0  # <-- HERE
    sum_return = 0
    sum_length = 0
    num_episodes = 0
    for t in range(steps_per_epoch):
        if render:
            env.render()
        observation = observation.reshape(1, -1)
        logits, action = sample_action(observation)
        observation_new, reward, done, _, _ = env.step(action[0].numpy())  # <-- EDIT 2 HERE
        episode_return += reward
        episode_length += 1
        # Get the value and log-probability of the action
        value_t = critic(observation)
        logprobability_t = logprobabilities(logits, action)
        # Store obs, act, rew, v_t, logp_pi_t
        buffer.store(observation, action, reward, value_t, logprobability_t)
        # Update the observation
        observation = observation_new
        # Finish trajectory if reached to a terminal state
        terminal = done
        if terminal or (t == steps_per_epoch - 1):
            last_value = 0 if done else critic(observation.reshape(1, -1))
            buffer.finish_trajectory(last_value)
            sum_return += episode_return
            sum_length += episode_length
            num_episodes += 1
            observation, episode_return, episode_length = env.reset()[0], 0, 0  # <-- HERE
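One caveat about the second underscore in env.step(action[0].numpy()): it discards the truncated flag, so an episode that ends only because of the time limit is not treated as terminal. If you want the trajectory to finish on truncation as well, a possible variant (my own tweak, not part of the original example) would be:
observation_new, reward, terminated, truncated, _ = env.step(action[0].numpy())
done = terminated or truncated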