Tags: python, python-3.x, machine-learning, tensorflow, reinforcement-learning

Deep Q score stuck at 9 for CartPole


So I am using a Deep Q-Network implementation in TensorFlow to solve CartPole-v0; however, the score sometimes (about 40% of all runs) stays stuck at 9. I tried fixing the seed with tf.set_random_seed, but that still doesn't guarantee the output won't get stuck. This is my code:

from collections import deque
import tensorflow as tf
import numpy as np
import random
import gym
import matplotlib.pyplot as plt
import pickle
from time import time
t = int(time())
class DQNAgent:

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen = 2000)  # replay buffer
        self.gamma = 0.95                   # discount factor
        #self.epsilon = 1.0
        #self.epsilon_min = 0.01
        #self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # Q-network: three tanh hidden layers of 24 units, linear Q-value output
        graph = tf.Graph()
        with graph.as_default():
            inp = tf.placeholder(tf.float32, [None, self.state_size])
            out = tf.placeholder(tf.float32, [None, self.action_size])
            w1 = tf.Variable(tf.truncated_normal([self.state_size, 24]))
            b1 = tf.Variable(tf.zeros([24]))

            hidden = tf.nn.tanh(tf.matmul(inp, w1) + b1)

            w2 = tf.Variable(tf.truncated_normal([24, 24]))
            b2 = tf.Variable(tf.zeros([24]))

            hidden1 = tf.nn.tanh(tf.matmul(hidden, w2) + b2)

            w3 = tf.Variable(tf.truncated_normal([24, 24]))
            b3 = tf.Variable(tf.zeros([24]))

            hidden2 = tf.nn.tanh(tf.matmul(hidden1, w3) + b3)

            wo = tf.Variable(tf.truncated_normal([24, self.action_size]))
            bo = tf.Variable(tf.zeros([self.action_size]))

            prediction = tf.matmul(hidden2, wo) + bo

            loss = tf.losses.mean_squared_error(out, prediction)
            train = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)
            init = tf.global_variables_initializer()

        return graph, inp, out, prediction, train, init

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, sess):
        # Always greedy: argmax over the predicted Q-values (no exploration)
        act_values = sess.run(self.model[3], feed_dict = { self.model[1]: state})
        return np.argmax(act_values[0])

    def replay(self, batch_size, sess):
        try:
            minibatch = random.sample(self.memory, batch_size)
        except ValueError:
            # not enough transitions stored yet; train on the whole memory
            minibatch = self.memory
        for state, action, reward, next_state, done in minibatch:
            # Q-learning target: r + gamma * max_a' Q(s', a') for non-terminal states
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(sess.run(self.model[3], feed_dict = { self.model[1]: next_state}))
            target_f = sess.run(self.model[3], feed_dict = { self.model[1]: state})
            target_f[0][action] = target
            #print(target_f)
            sess.run(self.model[4], feed_dict = { self.model[1]: state, self.model[2]: target_f})

if __name__ == "__main__":
    environment = 'CartPole-v0'
    env = gym.make(environment)
    avgs = deque(maxlen = 50)
    rewardLA = []
    agent = DQNAgent(env.observation_space.shape[0], env.action_space.n)
    sess = tf.Session(graph = agent.model[0])
    sess.run(agent.model[5])
    episodes = 10000
    rewardL = []
    for e in range(episodes):
        state = env.reset()
        state = np.reshape(state, [1, 4])
        for time_t in range(500):
            #env.render()
            action = agent.act(state, sess)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, 4])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        avgs.append(time_t)
        rewardLA.append(sum(avgs)/len(avgs))  # running average over the last 50 episodes
        print("episode: ", e, "score: ", time_t)
        rewardL.append(time_t)
        agent.replay(32, sess)
    #pickle.dump(rewardL, open(environment + "_" + str(t) + "_rewardL.pickle", "wb"))
    plt.plot(rewardLA)
    plt.show()

I tried changing the optimizer to gradient descent and RMSProp, but nothing works. However, if I simply restart the script, it works better (reaching 199 within 200 episodes). Why is this happening, and how do I fix it?


Solution

  • Looking at your code, I don't see how the environment is being explored. Don't you need something like epsilon-greedy to ensure that exploration happens? For example, I tried modifying the agent.act() method as follows, and it seems to solve the problem:

    def act(self, state, sess, episode):
        # explore: with probability 2**(-episode / 30), take a random action
        if random.random() < math.pow(2, -episode / 30):
            return env.action_space.sample()

        # exploit: otherwise act greedily w.r.t. the predicted Q-values
        act_values = sess.run(self.model[3], feed_dict = { self.model[1]: state})
        return np.argmax(act_values[0])
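
    Note that this needs `import math` at the top, and the call site in the training loop has to pass the episode index as well, e.g. `action = agent.act(state, sess, e)`.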
    

    Play around with the 30, which, for lack of a better term, I call the "exploration constant." With a value of 30, the probability of a random action starts at 1.0 and halves every 30 episodes (0.5 at episode 30, 0.25 at episode 60, and so on).

    Anyway, it seems to me that without something like epsilon-greedy (or a scheme like the one above that decays over time), you're relying on the neural network's output having enough entropy to cause sufficient exploration. Sometimes that may be the case; other times not.
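
    For reference, the commented-out epsilon fields in your __init__ correspond to the conventional decaying epsilon-greedy schedule. Here is a minimal sketch of that variant, assuming those three lines are uncommented (the decay_epsilon helper is hypothetical; call it once per episode, e.g. at the end of replay()):

    def act(self, state, sess):
        # With probability epsilon, explore with a uniformly random action;
        # otherwise act greedily w.r.t. the current Q-estimates.
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)
        act_values = sess.run(self.model[3], feed_dict = { self.model[1]: state})
        return np.argmax(act_values[0])

    def decay_epsilon(self):
        # Anneal epsilon from 1.0 down to epsilon_min (0.01) by a factor
        # of 0.995 per call, matching the commented-out hyperparameters.
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    Either schedule works; the important thing is that the random-action probability starts high and decays slowly enough for the agent to see a variety of states before it commits to a greedy policy.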