I am currently reading Reinforcement Learning: An Introduction (RL:AI) and try to reproduce the first example with an n-armed bandit and simple reward averaging.
Averaging
new_estimate = current_estimate + 1.0 / step * (reward - current_estimate)
In order reproduce the graph from the PDF, I am generating 2000 bandit-plays and let different agents play 2000 bandits for 1000 steps (as described in the PDF) and then average the reward as well as the percentage of optimal actions.
In the PDF, the result looks like this:
However, I am not able to reproduce this. If I am using simple averaging, all the agents with exploration (epsilon > 0
) actually play worse than an agent without exploration. This is weird because the possibility of exploration should allow agents to leave the local optimum more often and reach out to better actions.
As you can see below, this is not the case for my implementation. Also note that I have added agents which use weighted-averaging. These work but even in that case, raising epsilon
results in a degradation of the agents performance.
Any ideas what's wrong in my code?
from abc import ABC
from typing import List
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from multiprocessing.pool import Pool
class Strategy(ABC):
def update_estimates(self, step: int, estimates: np.ndarray, action: int, reward: float):
raise NotImplementedError()
class Averaging(Strategy):
def __str__(self):
return 'avg'
def update_estimates(self, step: int, estimates: np.ndarray, action: int, reward: float):
current = estimates[action]
return current + 1.0 / step * (reward - current)
class WeightedAveraging(Strategy):
def __init__(self, alpha):
self.alpha = alpha
def __str__(self):
return 'weighted-avg_alpha=%.2f' % self.alpha
def update_estimates(self, step: int, estimates: List[float], action: int, reward: float):
current = estimates[action]
return current + self.alpha * (reward - current)
class Agent:
def __init__(self, nb_actions, epsilon, strategy: Strategy):
self.nb_actions = nb_actions
self.epsilon = epsilon
self.estimates = np.zeros(self.nb_actions)
self.strategy = strategy
def __str__(self):
return ','.join(['eps=%.2f' % self.epsilon, str(self.strategy)])
def get_action(self):
best_known = np.argmax(self.estimates)
if np.random.rand() < self.epsilon and len(self.estimates) > 1:
explore = best_known
while explore == best_known:
explore = np.random.randint(0, len(self.estimates))
return explore
return best_known
def update_estimates(self, step, action, reward):
self.estimates[action] = self.strategy.update_estimates(step, self.estimates, action, reward)
def reset(self):
self.estimates = np.zeros(self.nb_actions)
def play_bandit(agent, nb_arms, nb_steps):
agent.reset()
bandit_rewards = np.random.normal(0, 1, nb_arms)
rewards = list()
optimal_actions = list()
for step in range(1, nb_steps + 1):
action = agent.get_action()
reward = bandit_rewards[action] + np.random.normal(0, 1)
agent.update_estimates(step, action, reward)
rewards.append(reward)
optimal_actions.append(np.argmax(bandit_rewards) == action)
return pd.DataFrame(dict(
optimal_actions=optimal_actions,
rewards=rewards
))
def main():
nb_tasks = 2000
nb_steps = 1000
nb_arms = 10
fig, (ax_rewards, ax_optimal) = plt.subplots(2, 1, sharex='col', figsize=(8, 9))
pool = Pool()
agents = [
Agent(nb_actions=nb_arms, epsilon=0.00, strategy=Averaging()),
Agent(nb_actions=nb_arms, epsilon=0.01, strategy=Averaging()),
Agent(nb_actions=nb_arms, epsilon=0.10, strategy=Averaging()),
Agent(nb_actions=nb_arms, epsilon=0.00, strategy=WeightedAveraging(0.5)),
Agent(nb_actions=nb_arms, epsilon=0.01, strategy=WeightedAveraging(0.5)),
Agent(nb_actions=nb_arms, epsilon=0.10, strategy=WeightedAveraging(0.5)),
]
for agent in agents:
print('Agent: %s' % str(agent))
args = [(agent, nb_arms, nb_steps) for _ in range(nb_tasks)]
results = pool.starmap(play_bandit, args)
df_result = sum(results) / nb_tasks
df_result.rewards.plot(ax=ax_rewards, label=str(agent))
df_result.optimal_actions.plot(ax=ax_optimal)
ax_rewards.set_title('Rewards')
ax_rewards.set_ylabel('Average reward')
ax_rewards.legend()
ax_optimal.set_title('Optimal action')
ax_optimal.set_ylabel('% optimal action')
ax_optimal.set_xlabel('steps')
plt.xlim([0, nb_steps])
plt.show()
if __name__ == '__main__':
main()
In the formula for the update rule
new_estimate = current_estimate + 1.0 / step * (reward - current_estimate)
the parameter step
should be the number of times that the particular action
has been taken, not the overall step number of the simulation. So you need to store that variable alongside the action values in order to use it for the update.
This can also be seen from the pseudo-code box at the end of chapter 2.4 Incremental Implementation:
(source: Richard S. Sutton and Andrew G. Barto: Reinforcement Learning - An Introduction, second edition, 2018, Chapter 2.4 Incremental Implementation)