I built a simulation model in which trucks collect garbage containers based on their fill level. I used OpenAI Gym and TensorFlow/Keras to create my deep reinforcement learning model, but the loss during training is very high. Where did I go wrong? Thanks in advance.
This is the environment:
class Marltf(Env):
    """Grid-world Gym environment in which trucks collect garbage containers.

    Trucks ("camions") take turns: each call to ``step`` moves exactly one
    truck by one cell on the map. When a truck stands on a container it may
    empty it, depending on the container's fill level (``lv``) and on the
    truck's remaining capacity. A truck whose capacity drops to 2 or less is
    sent back to the depot at ``(1, 1)`` and refilled to capacity 10.

    The observation is the scalar ``self.state``, a running score that is
    adjusted on every step.

    NOTE(review): the source this class was recovered from had lost its
    indentation, so the nesting of statements was reconstructed from their
    order. In particular, confirm that (a) ``self.state += -5`` at the top of
    ``step`` runs once per step rather than once per truck, and (b) the
    container check runs after every step, including blocked moves.
    """

    # Action id -> (dx, dy) displacement on the grid.
    _MOVES = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}

    # Fill level -> (minimum truck capacity required, capacity consumed,
    #                step reward, truck reward, state delta).
    # The numbers are reproduced unchanged from the original if/elif chain.
    # NOTE(review): level 2 yields a larger step reward than level 3, and the
    # level-0 entry penalises visiting an empty container only when the truck
    # has capacity >= 4 -- confirm both are intended.
    _COLLECTION_RULES = {
        3: (3, 3, 20, 100, 20),
        2: (2, 2, 50, 50, 10),
        1: (1, 1, 10, 5, 1),
        4: (4, 4, 50, 50, 50),
        0: (4, 0, -20, -20, -20),
    }

    def __init__(self):
        """Create the containers, the trucks, the map and the Gym spaces."""
        self.i = 0  # index of the truck that moves on the next step
        self.containers1 = Container(3, 3)
        self.containers2 = Container(1, 3)
        self.containers3 = Container(3, 1)
        self.containers4 = Container(5, 6)
        self.containers5 = Container(8, 6)
        self.containers6 = Container(10, 10)
        self.containers7 = Container(11, 11)
        self.containers8 = Container(7, 12)
        self.passo = 0  # number of steps taken so far
        self.containers2.lv = 2
        self.containers3.lv = 4
        self.containers5.lv = 4
        self.containers6.lv = 1
        self.containers8.lv = 2
        self.shower_length = 300  # remaining steps before the episode ends
        self.containers = [
            self.containers1,
            self.containers2,
            self.containers3,
            self.containers4,
            self.containers5,
            self.containers6,
            self.containers7,
            self.containers8,
        ]
        self._spawn_trucks()
        self.frames = []
        self.cnt = 0  # index of the next screenshot written by render()
        self.mapp = Map(15, 15, self.camions, self.containers)
        self.state = (15 * 15) / 5
        self.action_space = gym.spaces.Discrete(4)
        self.observation_space = Box(low=np.array([0]), high=np.array([51]))

    def _spawn_trucks(self):
        """Rebuild ``camions``, ``positions`` and ``capacities`` from scratch.

        The number of trucks is the number of times 10 can be subtracted from
        the total fill level of all containers while the remainder is still
        greater than 6. Every truck starts at (1, 1) with capacity 10.
        """
        remaining = sum(cont.lv for cont in self.containers)
        truck_count = 0
        while remaining > 6:
            remaining -= 10
            truck_count += 1
        self.camions = [Camion(1, 1, None, idx) for idx in range(truck_count)]
        self.positions = {cam.name: cam.position for cam in self.camions}
        self.capacities = {cam.name: 10 for cam in self.camions}

    def step(self, action):
        """Move the truck whose turn it is and collect any container it reaches.

        Args:
            action: integer in 0..3 selecting a displacement from ``_MOVES``.

        Returns:
            ``(state, reward, done, info)`` where ``state`` is the scalar
            running score, ``reward`` is the reward earned on this step,
            ``done`` is True once ``shower_length`` has reached 0, and
            ``info`` is an empty dict.
        """
        # Restore every truck from the bookkeeping dicts. The original used a
        # counter that was never incremented, so all trucks were overwritten
        # with the first truck's position and capacity; key by name instead.
        for cam in self.camions:
            stored = self.positions[cam.name]
            cam.position = (stored[0], stored[1])
            cam.capacity = self.capacities[cam.name]
        self.state += -5

        truck = self.camions[self.i]
        dx, dy = self._MOVES[action]
        x = truck.position
        reward = 0
        nuovaposizione = [dx + x[0], dy + x[1]]
        self.shower_length -= 1

        grid = self.mapp.mapp
        if grid[nuovaposizione[0], nuovaposizione[1]] == -1:
            # Target cell is marked -1: the truck stays put and is penalised.
            reward += -5
            self.state += -5
        else:
            grid[x[0], x[1]] = 0
            truck.position = nuovaposizione
            grid[nuovaposizione[0], nuovaposizione[1]] = 9
            self.positions.update({truck.name: nuovaposizione})
            reward += -1
            # NOTE(review): this is an assignment, not `+=`, so it discards
            # the accumulated state -- kept as in the original; confirm.
            self.state = -2

        for contain in self.containers:
            # The original compared the y coordinate of an undefined name
            # (`camion`), which raised NameError; use the moving truck.
            same_cell = (
                truck.position[0] == contain.position[0]
                and truck.position[1] == contain.position[1]
            )
            if not same_cell:
                continue
            rule = self._COLLECTION_RULES.get(contain.lv)
            if rule is None:
                continue
            min_capacity, load, step_reward, truck_reward, state_delta = rule
            if truck.capacity >= min_capacity:
                truck.reward += truck_reward
                truck.capacity += -load
                self.capacities.update({truck.name: truck.capacity})
                reward += step_reward
                self.state += state_delta
                contain.lv = 0

        if truck.capacity <= 2:
            # Send the truck back to the depot and refill it. The original
            # assigned to a misspelled attribute (`positions`), leaving the
            # truck's own position unchanged until the next step.
            truck.position = (1, 1)
            self.positions.update({truck.name: (1, 1)})
            truck.capacity = 10
            self.capacities.update({truck.name: truck.capacity})

        # Hand the turn to the next truck. The original toggled between
        # indices 0 and 1 only, which fails when there is a single truck and
        # never selects a third one.
        self.i = (self.i + 1) % len(self.camions)

        done = self.shower_length <= 0
        self.passo += 1
        info = {}
        return self.state, reward, done, info

    @staticmethod
    def _load_sprite(filename, size=(50, 50)):
        """Load an image file and scale it to ``size`` pixels."""
        return pygame.transform.scale(pygame.image.load(filename), size)

    def render(self, mode="human"):
        """Draw the map with pygame and save the frame as a PNG screenshot.

        Each call writes ``screenshot<cnt>.png`` and increments ``cnt``.
        The ``mode`` argument is accepted but not used.
        """
        global SCREEN, CLOCK
        black = (0, 0, 0)
        white = (200, 200, 200)
        block_size = 50  # pixel size of one grid cell
        grid = self.mapp.mapp
        side = len(grid[0])
        window_height = side * 50
        window_width = side * 50

        # Container sprite per fill level; levels outside 0..4 are not drawn.
        container_sprites = {
            0: self._load_sprite('white.jpg'),
            1: self._load_sprite('green.jpg'),
            2: self._load_sprite('yellow.jpg'),
            3: self._load_sprite('orange.jpg'),
            4: self._load_sprite('red.jpg'),
        }
        green_truck = self._load_sprite('greenCamion.jpg')
        yellow_truck = self._load_sprite('yellowCamion.jpg')
        red_truck = self._load_sprite('redCamion.jpg')

        pygame.init()
        SCREEN = pygame.display.set_mode((window_width, window_height))
        CLOCK = pygame.time.Clock()
        SCREEN.fill(black)
        pygame.draw.rect(SCREEN, white, pygame.Rect(10, 0, 50, 50))

        # Cells marked -1 are drawn as filled blocks.
        for i in range(side):
            for j in range(side):
                if grid[i][j] == -1:
                    pygame.draw.rect(
                        SCREEN, white, pygame.Rect(i * 50, j * 50, 50, 50)
                    )

        # Truck colour reflects remaining capacity.
        for c in self.camions:
            if c.capacity > 6:
                sprite = green_truck
            elif c.capacity > 3:
                sprite = yellow_truck
            else:
                sprite = red_truck
            SCREEN.blit(sprite, (c.position[0] * 50, c.position[1] * 50))

        for contain in self.containers:
            sprite = container_sprites.get(contain.lv)
            if sprite is not None:
                SCREEN.blit(
                    sprite, (contain.position[0] * 50, contain.position[1] * 50)
                )

        # Grid lines on top of everything else.
        for x in range(0, window_width, block_size):
            for y in range(0, window_height, block_size):
                rect = pygame.Rect(x, y, block_size, block_size)
                pygame.draw.rect(SCREEN, white, rect, 1)

        pygame.display.flip()
        pygame.image.save(SCREEN, f"screenshot{self.cnt}.png")
        self.cnt += 1
        pygame.event.get()

    def reset(self):
        """Start a new episode and return the initial observation.

        Container fill levels are restored (``containers4`` is left as it
        is), the trucks are re-created at the depot and the step budget is
        set to 60.

        NOTE(review): the map object is not rebuilt here, so it keeps the
        truck list it was constructed with -- confirm whether it should be
        re-created. Also, the initial state here is (15*15)/4 = 56.25, which
        differs from the value set in ``__init__`` and exceeds the declared
        observation upper bound of 51; and the step budget is 60 here but
        300 in ``__init__``. Values kept as in the original.
        """
        self.state = (15 * 15) / 4
        self.containers1.lv = 3
        self.containers2.lv = 1
        self.containers7.lv = 2
        self.containers3.lv = 4
        self.containers5.lv = 4
        self.containers6.lv = 1
        self.containers8.lv = 2
        self.passo = 0
        self._spawn_trucks()
        self.shower_length = 60
        self.cnt = 0
        self.i = 0
        # Gym's reset() contract is to return the first observation; the
        # original returned None.
        return self.state
# NOTE(review): `containers1` .. `containers4` are not defined at module level
# in the visible code (they only exist as attributes of the environment), and
# `env` is used below without being created here -- presumably both are
# defined elsewhere, e.g. `env = Marltf()`; confirm.
containers = [ containers1, containers2, containers3, containers4]
containers.append( containers1)
# Observation shape and number of discrete actions, passed to build_model.
states = env.observation_space.shape
actions = env.action_space.n
# Draw one random action from the action space.
b = env.action_space.sample()
This is my model:
def build_model(states, actions):
    """Build the fully connected Q-network.

    The network is a stack of Dense layers of width 64, 64, 32, 16 and 8,
    each followed by a LeakyReLU(0.24) activation, and a final linear Dense
    layer with one output per action.

    Args:
        states: input shape passed to the first Dense layer.
        actions: number of output units.

    Returns:
        The uncompiled ``tf.keras.Sequential`` model.
    """
    hidden_widths = (64, 64, 32, 16, 8)
    stack = []
    for depth, width in enumerate(hidden_widths):
        if depth == 0:
            # Only the first layer declares the input shape.
            stack.append(keras.layers.Dense(width, input_shape=states))
        else:
            stack.append(keras.layers.Dense(width))
        stack.append(keras.layers.LeakyReLU(0.24,))
    stack.append(keras.layers.Dense(actions, activation='linear'))
    return tf.keras.Sequential(stack)
# Build the Q-network from the observation shape and the action count.
model = build_model(states, actions)
# NOTE(review): no optimizer is passed here, and the agent wrapping this model
# is compiled again further down with Adadelta -- presumably this call is
# redundant; confirm. 'accuracy' is a classification metric, while the loss
# here is a mean squared error.
model.compile(loss='mse', metrics=['accuracy'])
def build_agent(model, actions):
    """Wrap ``model`` in a keras-rl DQN agent.

    Training uses an epsilon-greedy policy so that the agent explores; the
    original used a purely greedy policy for training, which always picks the
    currently best-valued action and therefore never tries anything else.
    The greedy policy is kept for evaluation.

    Args:
        model: the Q-network whose outputs are the per-action values.
        actions: number of discrete actions.

    Returns:
        The (not yet compiled) ``DQNAgent``.
    """
    # Local import: the file's import block is not visible from here, so the
    # new name is brought into scope inside the function.
    from rl.policy import EpsGreedyQPolicy

    policy = EpsGreedyQPolicy()
    test_policy = GreedyQPolicy()
    memory = SequentialMemory(limit=10000, window_length=1)
    dqn = DQNAgent(
        model=model,
        memory=memory,
        policy=policy,
        test_policy=test_policy,
        nb_actions=actions,
        nb_steps_warmup=10,
        target_model_update=1e-2,
    )
    return dqn
# Create the DQN agent around the Q-network.
dqn = build_agent(model, actions)
# Compile the agent with an Adadelta optimizer.
# NOTE(review): "accuracy" is a classification metric; for Q-value regression
# an error metric such as "mae" is presumably more informative -- confirm.
dqn.compile(tf.keras.optimizers.Adadelta(
learning_rate=0.1, rho=0.95, epsilon=1e-07, name='Adadelta'), metrics= ["accuracy"]
)
# Train for 5000 environment steps, rendering every step (visualize=True).
a =dqn.fit(env, nb_steps=5000, visualize=True, verbose=2,)
The loss starts at about 50 and rises to about 200.
In reinforcement learning you usually care about the rewards rather than the loss. From the class name, this also looks like a multi-agent reinforcement learning problem, and those are usually more difficult to deal with than single-agent problems.
The first thing I would try to change is the number of steps: 5000 is very low. Define an episode, if one is not already defined, then plot the cumulative reward at the end of each episode and check whether it increases as the number of episodes increases.
This is the cleanest way to check if the reward is actually increasing and the agent is learning something.