python, tensorflow, machine-learning, reinforcement-learning

How can I save a DDPG model?


I try to save the model using the saver (the save() function in the DDPG class), but when I restore the model, the result is far from the one I saved. I save the model when the episodic reward is roughly zero; the restore call in the code is commented out. My code, with all of the features, is below. I use Python 3.7, gym 0.16.0 and TensorFlow 1.13.1.
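
For reference, before the full code: the plain tf.train.Saver save/restore pattern in TF 1.x, as a minimal self-contained sketch with a stand-in variable (the checkpoint path just mirrors the one used further below):

import tensorflow as tf

# stand-in for the DDPG graph: one variable is enough to show the pattern
w = tf.get_variable("w", shape=[3], initializer=tf.zeros_initializer())
saver = tf.train.Saver()  # the saver must be created after the variables exist

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.assign(w, [1.0, 2.0, 3.0]))  # pretend this is training
    saver.save(sess, "Pendulum/nn.ckpt")

# later, e.g. in a separate script: rebuild the same graph, then restore the
# checkpoint instead of re-initializing the variables
with tf.Session() as sess:
    saver.restore(sess, "Pendulum/nn.ckpt")
    print(sess.run(w))  # -> [1. 2. 3.]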

import tensorflow as tf
import numpy as np
import gym

episode_steps = 500

# learning rate for actor
lr_a = 0.001

# learning rate for critic
lr_c = 0.002

gamma = 0.9
alpha = 0.01
memory = 10000
batch_size = 32
render = True

class DDPG(object):
    def __init__(self, no_of_actions, no_of_states, a_bound):
        # replay buffer: each row stores [state, action, reward, next_state]
        self.memory = np.zeros((memory, no_of_states * 2 + no_of_actions + 1), dtype=np.float32)

        # initialize pointer to point to our experience buffer
        self.pointer = 0

        self.sess = tf.Session()

        # scale (standard deviation) of the Gaussian noise added to actions for
        # exploration; it decays once learning starts
        self.noise_variance = 3.0

        self.no_of_actions, self.no_of_states, self.a_bound = no_of_actions, no_of_states, a_bound

        self.state = tf.placeholder(tf.float32, [None, no_of_states], 's')
        self.next_state = tf.placeholder(tf.float32, [None, no_of_states], 's_')
        self.reward = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self.build_actor_network(self.state, scope='eval', trainable=True)
            a_ = self.build_actor_network(self.next_state, scope='target', trainable=False)

        with tf.variable_scope('Critic'):
            q = self.build_critic_network(self.state, self.a, scope='eval', trainable=True)
            q_ = self.build_critic_network(self.next_state, a_, scope='target', trainable=False)

        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')

        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # soft-update the target network parameters toward the eval network parameters
        self.soft_replace = [
            [tf.assign(at, (1 - alpha) * at + alpha * ae), tf.assign(ct, (1 - alpha) * ct + alpha * ce)]
            for at, ae, ct, ce in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]

        q_target = self.reward + gamma * q_

        # critic loss: mean squared TD error between the target and the predicted Q values
        td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)

        # train the critic network with adam optimizer
        self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, name="adam-ink", var_list=self.ce_params)

        a_loss = - tf.reduce_mean(q)

        # train the actor network with adam optimizer for minimizing the loss
        self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params)

        tf.summary.FileWriter("logs2", self.sess.graph)

        # initialize all variables
        self.sess.run(tf.global_variables_initializer())

        # saver
        self.saver = tf.train.Saver()
        # self.saver.restore(self.sess, "Pendulum/nn.ckpt")  

    def choose_action(self, s):
        # deterministic action from the actor, perturbed with Gaussian exploration
        # noise and clipped to Pendulum's action range [-2, 2]
        a = self.sess.run(self.a, {self.state: s[np.newaxis, :]})[0]
        a = np.clip(np.random.normal(a, self.noise_variance), -2, 2)

        return a

    def learn(self):
        # soft target replacement
        self.sess.run(self.soft_replace)

        # sample a random minibatch of transitions from the replay buffer
        indices = np.random.choice(memory, size=batch_size)
        batch_transition = self.memory[indices, :]
        batch_states = batch_transition[:, :self.no_of_states]
        batch_actions = batch_transition[:, self.no_of_states: self.no_of_states + self.no_of_actions]
        batch_rewards = batch_transition[:, -self.no_of_states - 1: -self.no_of_states]
        batch_next_state = batch_transition[:, -self.no_of_states:]

        self.sess.run(self.atrain, {self.state: batch_states})
        self.sess.run(self.ctrain, {self.state: batch_states, self.a: batch_actions, self.reward: batch_rewards,
                                    self.next_state: batch_next_state})

    def store_transition(self, s, a, r, s_):
        trans = np.hstack((s, a, [r], s_))

        index = self.pointer % memory
        self.memory[index, :] = trans
        self.pointer += 1

        # start learning (and decay the exploration noise) once the buffer is full
        if self.pointer > memory:
            self.noise_variance *= 0.99995
            self.learn()

    def build_actor_network(self, s, scope, trainable):
        # actor network: two dense layers mapping a state to an action, scaled to the action bound
        with tf.variable_scope(scope):
            l1 = tf.layers.dense(s, 30, activation=tf.nn.tanh, name='l1', trainable=trainable)
            a = tf.layers.dense(l1, self.no_of_actions, activation=tf.nn.tanh, name='a', trainable=trainable)
            return tf.multiply(a, self.a_bound, name="scaled_a")

    def build_critic_network(self, s, a, scope, trainable):
        # critic network: estimates Q(s, a) from the state-action pair
        with tf.variable_scope(scope):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', [self.no_of_states, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.no_of_actions, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.tanh(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)

            q = tf.layers.dense(net, 1, trainable=trainable)
            return q

    def save(self):
        self.saver.save(self.sess, "Pendulum/nn.ckpt")

env = gym.make("Pendulum-v0")
env = env.unwrapped
env.seed(1)

no_of_states = env.observation_space.shape[0]
no_of_actions = env.action_space.shape[0]

a_bound = env.action_space.high
ddpg = DDPG(no_of_actions, no_of_states, a_bound)

total_reward = []

# set the number of episodes
no_of_episodes = 300
for i in range(no_of_episodes):
    # initialize the environment
    s = env.reset()
    ep_reward = 0

    for j in range(episode_steps):
        env.render()

        # select an action by adding Gaussian exploration noise to the actor's output
        a = ddpg.choose_action(s)

        # perform the action and observe the reward and the next state s_
        s_, r, done, info = env.step(a)

        # store the transition in the experience buffer; once the buffer is full,
        # store_transition also samples a minibatch and trains the networks
        ddpg.store_transition(s, a, r, s_)

        # update current state as next state
        s = s_

        # add episodic rewards
        ep_reward += r

        # save the checkpoint and stop as soon as the running episode reward is roughly zero after episode 150
        if int(ep_reward) == 0 and i > 150:
            ddpg.save()
            print("save")
            quit()

        if j == episode_steps - 1:
            total_reward.append(ep_reward)
            print('Episode:', i, ' Reward: %i' % int(ep_reward))
            break

Solution

  • I solved this problem completely by rewriting the code and running the learning function in a separate session (a sketch of the separate-session idea follows below).
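
One way to read "a separate session" (an assumption, not necessarily the exact rewritten code): keep training/saving and restoring/evaluation in separate scripts, rebuild the graph in the evaluation script, restore the checkpoint, and run the policy without exploration noise. A minimal sketch, assuming the DDPG class and hyperparameters above are importable and that "Pendulum/nn.ckpt" was written by ddpg.save():

import gym
import numpy as np

env = gym.make("Pendulum-v0")

# rebuild the graph exactly as during training, then overwrite the freshly
# initialized variables with the saved checkpoint
ddpg = DDPG(env.action_space.shape[0], env.observation_space.shape[0], env.action_space.high)
ddpg.saver.restore(ddpg.sess, "Pendulum/nn.ckpt")

s = env.reset()
for _ in range(200):
    env.render()
    # query the actor directly, without the Gaussian exploration noise
    a = ddpg.sess.run(ddpg.a, {ddpg.state: s[np.newaxis, :]})[0]
    s, r, done, info = env.step(a)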