Reinforcement Learning finale (Part 7)
The final part of the series is here. I hope you have enjoyed it as much as I have enjoyed making it.
We are now going to train our DQN model.
System requirements:
- TensorFlow : 2.4.1
- Dedicated GPU
⚠️ Warning: this program has high computational requirements. Please do not train on a CPU.
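Before starting a long run, it is worth checking that TensorFlow actually sees your GPU. A quick sanity test, nothing more:
import tensorflow as tf
print(tf.__version__)                           # expecting 2.4.1
print(tf.config.list_physical_devices('GPU'))   # should list at least one GPU
With that confirmed, here are the imports and settings for the main script: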
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Activation, Flatten
from keras.callbacks import TensorBoard
from keras.optimizers import Adam
from collections import deque
import time
import numpy as np
from tqdm import tqdm
import random
import os
from PIL import Image as Img
import cv2
REPLAY_MEMORY_SIZE = 50_000
MODEL_NAME = "256x2"
MIN_REPLAY_MEMORY_SIZE = 1_000
MINIBATCH_SIZE = 64 # batch size for training data
DISCOUNT = 0.99
UPDATE_TARGET_EVERY = 5
MIN_REWARD = -200
EPISODES = 20_000
epsilon = 1
EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.001
AGGREGATE_STATS_EVERY = 100 # see stats every 100 episodes
SHOW_PREVIEW = False
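Epsilon starts at 1 (fully random actions) and is multiplied by EPSILON_DECAY once per episode, so most of the exploration happens early in training. A rough feel for the schedule (a side calculation, not part of the training script):
# Where epsilon roughly ends up after n episodes of decay:
for n in (1_000, 5_000, 10_000, 20_000):
    print(n, max(MIN_EPSILON, EPSILON_DECAY ** n))
# ~0.78 after 1,000 episodes, ~0.29 after 5,000, ~0.08 after 10,000, ~0.007 after 20,000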
💢 Blob class
Now we want to bring in our blob class and blob environment.
class Blob:
def __init__(self, size):
self.size = size
self.x = np.random.randint(0, size)
self.y = np.random.randint(0, size)
def __str__(self):
return f"Blob ({self.x}, {self.y})"
def __sub__(self, other):
return (self.x-other.x, self.y-other.y)
def __eq__(self, other):
return self.x == other.x and self.y == other.y
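A quick illustration of what these operator overloads give us (blob positions are random, so the exact numbers below are only an example):
a = Blob(10)   # say it lands on (3, 7)
b = Blob(10)   # say it lands on (5, 2)
print(a - b)   # __sub__ -> relative position, here (-2, 5)
print(a == b)  # __eq__  -> True only when both coordinates match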
We have 9 movement options in total: 0, 1, 2, 3, 4, 5, 6, 7 and 8.
So our agent no longer needs to rely on the boundaries to effectively move straight up/down or left/right:
def action(self, choice):
if choice == 0:
self.move(x=1, y=1)
elif choice == 1:
self.move(x=-1, y=-1)
elif choice == 2:
self.move(x=-1, y=1)
elif choice == 3:
self.move(x=1, y=-1)
elif choice == 4:
self.move(x=1, y=0)
elif choice == 5:
self.move(x=-1, y=0)
elif choice == 6:
self.move(x=0, y=1)
elif choice == 7:
self.move(x=0, y=-1)
elif choice == 8:
self.move(x=0, y=0)
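So, for example, choice 4 moves one step along x only, and choice 8 keeps the blob in place. A tiny demonstration, assuming the full Blob class (the position is fixed here so the boundary clamping below does not interfere):
b = Blob(10)
b.x, b.y = 5, 5   # fixed position, just for the example
b.action(4)       # move(x=1, y=0)
print(b.x, b.y)   # 6 5
b.action(8)       # move(x=0, y=0) -> stays put
print(b.x, b.y)   # 6 5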
The updated ‘move’ method for TF 2.4.1:
def move(self, x = None, y = None):
If no value is passed for x, move randomly:
if x == None:
self.x += np.random.randint(-1, 2)
else:
self.x += x
If no value is passed for y, move randomly:
if y == None:
self.y += np.random.randint(-1, 2)
else:
self.y += y
If we are out of bounds, fix it:
if self.x < 0:
self.x = 0
elif self.x > self.size-1:
self.x = self.size-1
if self.y < 0:
self.y = 0
elif self.y > self.size-1:
self.y = self.size-1
class BlobEnv:
SIZE = 10
RETURN_IMAGES = True
MOVE_PENALTY = 1
ENEMY_PENALTY = 300
FOOD_REWARD = 25
OBSERVATION_SPACE_VALUES = (SIZE, SIZE, 3)  # or 4 values if RETURN_IMAGES is False (the two deltas)
ACTION_SPACE_SIZE = 9
Player key in the dict:
PLAYER_N = 1
Food key in the dict:
FOOD_N = 2
Enemy key in the dict:
ENEMY_N = 3
The colour dict:
d = {1: (255, 175, 0),
2: (0, 255, 0),
3: (0, 0, 255)}
def reset(self):
self.player = Blob(self.SIZE)
self.food = Blob(self.SIZE)
while self.food == self.player:
self.food = Blob(self.SIZE)
self.enemy = Blob(self.SIZE)
while self.enemy == self.player or self.enemy == self.food:
self.enemy = Blob(self.SIZE)
self.episode_step = 0
if self.RETURN_IMAGES:
observation = np.array(self.get_image())
else:
observation = (self.player-self.food) + (self.player-self.enemy)
return observation
def step(self, action):
self.episode_step += 1
self.player.action(action)
💢 Movement options
To make the food and enemy move as well, we could use the following calls (left commented out here):
# self.enemy.move()
# self.food.move()
if self.RETURN_IMAGES:
new_observation = np.array(self.get_image())
else:
new_observation = (self.player-self.food) + (self.player-self.enemy)
if self.player == self.enemy:
reward = -self.ENEMY_PENALTY
elif self.player == self.food:
reward = self.FOOD_REWARD
else:
reward = -self.MOVE_PENALTY
done = False
if reward == self.FOOD_REWARD or reward == -self.ENEMY_PENALTY or self.episode_step >= 200:
done = True
return new_observation, reward, done
def render(self):
img = self.get_image()
💢 Scaling the rendered image
img = img.resize((300, 300))
Show it:
cv2.imshow("image", np.array(img))
cv2.waitKey(1)
💢 For the CNN
def get_image(self):
Starts an RGB array of our grid size:
env = np.zeros((self.SIZE, self.SIZE, 3), dtype=np.uint8)
Sets the food location tile to green:
env[self.food.x][self.food.y] = self.d[self.FOOD_N]
Sets the enemy location to red:
env[self.enemy.x][self.enemy.y] = self.d[self.ENEMY_N]
Sets the player tile to blue:
env[self.player.x][self.player.y] = self.d[self.PLAYER_N]
Reads it as an RGB image, even though the colour definitions are really BGR (which is how cv2 will display them):
img = Img.fromarray(env, 'RGB')
return img
💢 Initializing the environment
env = BlobEnv()
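As an optional sanity check, we can roll out one episode with purely random actions before wiring up the agent (a throwaway sketch, not part of the final script):
obs = env.reset()
done = False
total_reward = 0
while not done:
    obs, reward, done = env.step(np.random.randint(0, env.ACTION_SPACE_SIZE))
    total_reward += reward
print("random-policy episode reward:", total_reward)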
For stats:
ep_rewards = [-200]
For more repeatable results:
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)
Create the models folder:
if not os.path.isdir('models'):
os.makedirs('models')
💢 TensorFlow 2.4.1
The following class has been modified to work with TensorFlow 2.4.1.
Please do not change this class in any way, or the program may not work at all. If you face an issue with this class, please start a discussion.
class ModifiedTensorBoard(TensorBoard):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.step = 1
self.writer = tf.summary.create_file_writer(self.log_dir)
self._log_write_dir = self.log_dir
def set_model(self, model):
self.model = model
self._train_dir = os.path.join(self._log_write_dir, 'train')
self._train_step = self.model._train_counter
self._val_dir = os.path.join(self._log_write_dir, 'validation')
self._val_step = self.model._test_counter
self._should_write_train_graph = False
def on_epoch_end(self, epoch, logs=None):
self.update_stats(**logs)
def on_batch_end(self, batch, logs=None):
pass
def on_train_end(self, _):
pass
def update_stats(self, **stats):
with self.writer.as_default():
for key, value in stats.items():
tf.summary.scalar(key, value, step = self.step)
self.writer.flush()
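In short, instead of Keras creating a fresh log writer on every .fit() call, we keep one writer for the whole run and push our own scalars whenever we like. A minimal sketch of how it can be used on its own (the numbers are made up):
tb = ModifiedTensorBoard(log_dir=f"logs/demo-{int(time.time())}")
tb.step = 1
tb.update_stats(reward_avg=-12.5, reward_min=-200, reward_max=25, epsilon=0.9)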
The following class has already been explained in the previous parts. Please go back through them if you face any issues.
class DQNAgent:
def __init__(self):
self.model = self.create_model()
self.target_model = self.create_model()
self.target_model.set_weights(self.model.get_weights())
self.replay_memory = deque(maxlen = REPLAY_MEMORY_SIZE)
self.tensorboard = ModifiedTensorBoard(log_dir = f"logs/{MODEL_NAME}-{int(time.time())}")
self.target_update_counter = 0
def create_model(self):
model = Sequential()
model.add(Conv2D(256, (3, 3), input_shape = env.OBSERVATION_SPACE_VALUES))
model.add(Activation("relu"))
model.add(MaxPooling2D(2, 2))
model.add(Dropout(0.2))
model.add(Conv2D(256, (3, 3)))
model.add(Activation("relu"))
model.add(MaxPooling2D(2, 2))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(64))
model.add(Dense(env.ACTION_SPACE_SIZE, activation = "linear"))
model.compile(loss="mse", optimizer = Adam(lr=0.001), metrics=['accuracy'])
return model
def update_replay_memory(self, transition):
self.replay_memory.append(transition)
def get_qs(self, state):
return self.model.predict(np.array(state).reshape(-1, *state.shape) / 255)[0]
def train(self, terminal_state, step):
if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
return
minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)
💢 Teaching CNNs
By dividing by 255 in the following statement, we scale the pixel values to between 0 and 1, which is a much friendlier input range for a convolutional neural network than raw 0–255 values:
current_states = np.array([transition[0] for transition in minibatch]) / 255
current_qs_list = self.model.predict(current_states)
The states we end up in after the actions are taken:
new_current_states = np.array([transition[3] for transition in minibatch]) / 255
future_qs_list = self.target_model.predict(new_current_states)
The following list will hold the images (states) from the game:
X = []
The following list will hold the corresponding Q-value targets that the model is fitted against:
y = []
💢 Calculate learned value
With the following loop, we calculate the last bit (the learned value) of the Q-value formula. For example, if a transition carries reward = -1 (a move penalty) and the target model's best Q-value for the next state is 20, the new target becomes -1 + 0.99 × 20 = 18.8:
for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
if not done:
max_future_q = np.max(future_qs_list[index])
new_q = reward + DISCOUNT * max_future_q
else:
new_q = reward
current_qs = current_qs_list[index]
current_qs[action] = new_q
X.append(current_state)
y.append(current_qs)
self.model.fit(np.array(X) / 255, np.array(y), batch_size = MINIBATCH_SIZE, verbose = 0, shuffle = False,
We pass the TensorBoard callback only when we are on the terminal state, so stats are logged once per episode rather than on every single fit:
callbacks = [self.tensorboard] if terminal_state else None)
Now decide whether we want to update target_model yet. The counter only increases when an episode ends, so the target network copies the main network's weights every UPDATE_TARGET_EVERY episodes:
if terminal_state:
self.target_update_counter += 1
if self.target_update_counter > UPDATE_TARGET_EVERY:
self.target_model.set_weights(self.model.get_weights())
self.target_update_counter = 0
💢 Agent creation
Create the agent:
agent = DQNAgent()
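# Optional: agent.model.summary() prints the architecture. For a (10, 10, 3)
# observation the shapes flow roughly like this (a sketch, not the exact printout):
# (10, 10, 3) -> Conv2D 3x3 -> (8, 8, 256) -> MaxPool 2x2 -> (4, 4, 256)
# -> Conv2D 3x3 -> (2, 2, 256) -> MaxPool 2x2 -> (1, 1, 256)
# -> Flatten -> 256 -> Dense 64 -> Dense 9 (one Q-value per action)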
# Now we are ready to iterate over everything.
for episode in tqdm(range(1, EPISODES + 1), ascii = True, unit = "episode"):
agent.tensorboard.step = episode
episode_reward = 0
step = 1
current_state = env.reset()
done = False
while not done:
if np.random.random() > epsilon:
action = np.argmax(agent.get_qs(current_state))
else:
action = np.random.randint(0, env.ACTION_SPACE_SIZE)
new_state, reward, done = env.step(action)
episode_reward += reward
if SHOW_PREVIEW and not episode % AGGREGATE_STATS_EVERY:
env.render()
agent.update_replay_memory((current_state, action, reward, new_state, done))
agent.train(done, step)
current_state = new_state
step += 1
💢 Episode rewards
Now we are going to append the episode reward and grab various aggregate stats, which we log to TensorBoard; we can also chart those values with matplotlib afterwards (see the sketch after the epsilon decay step below).
Append the episode reward to a list and log stats (every given number of episodes):
ep_rewards.append(episode_reward)
if not episode % AGGREGATE_STATS_EVERY or episode == 1:
average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:])/len(ep_rewards[-AGGREGATE_STATS_EVERY:])
min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)
Save the model, but only when the min reward is greater than or equal to a set value:
if min_reward >= MIN_REWARD:
agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')
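Once training has finished, any of the saved models can be loaded back and played greedily. A minimal sketch (the path is a placeholder for whatever file your run actually produced):
loaded_model = tf.keras.models.load_model('models/your_saved_model.model')  # placeholder path
state = env.reset()
qs = loaded_model.predict(np.array(state).reshape(-1, *state.shape) / 255)[0]
action = np.argmax(qs)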
Decay epsilon:
if epsilon > MIN_EPSILON:
epsilon *= EPSILON_DECAY
epsilon = max(MIN_EPSILON, epsilon)
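Once the loop has finished, ep_rewards holds one value per episode, so we can chart the learning curve with matplotlib. A minimal sketch (matplotlib is not imported in the main script, and this assumes training has completed):
import matplotlib.pyplot as plt
# Smooth the per-episode rewards with the same window used for the aggregate stats.
moving_avg = np.convolve(ep_rewards, np.ones(AGGREGATE_STATS_EVERY) / AGGREGATE_STATS_EVERY, mode="valid")
plt.plot(moving_avg)
plt.xlabel("episode")
plt.ylabel(f"reward ({AGGREGATE_STATS_EVERY}-episode moving average)")
plt.show()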
💢 Entire code
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Activation, Flatten
from keras.callbacks import TensorBoard
from keras.optimizers import Adam
from collections import deque
import time
import numpy as np
from tqdm import tqdm
import random
import os
from PIL import Image as Img
import cv2
REPLAY_MEMORY_SIZE = 50_000
MODEL_NAME = "256x2"
MIN_REPLAY_MEMORY_SIZE = 1_000
MINIBATCH_SIZE = 64 # batch size for training data
DISCOUNT = 0.99
UPDATE_TARGET_EVERY = 5
MIN_REWARD = -200
EPISODES = 20_000
epsilon = 1
EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.001
AGGREGATE_STATS_EVERY = 100 # number of episodes to see stats = 100
SHOW_PREVIEW = False
class Blob:
def __init__(self, size):
self.size = size
self.x = np.random.randint(0, size)
self.y = np.random.randint(0, size)
def __str__(self):
return f"Blob ({self.x}, {self.y})"
def __sub__(self, other):
return (self.x-other.x, self.y-other.y)
def __eq__(self, other):
return self.x == other.x and self.y == other.y
def action(self, choice):
if choice == 0:
self.move(x=1, y=1)
elif choice == 1:
self.move(x=-1, y=-1)
elif choice == 2:
self.move(x=-1, y=1)
elif choice == 3:
self.move(x=1, y=-1)
elif choice == 4:
self.move(x=1, y=0)
elif choice == 5:
self.move(x=-1, y=0)
elif choice == 6:
self.move(x=0, y=1)
elif choice == 7:
self.move(x=0, y=-1)
elif choice == 8:
self.move(x=0, y=0)
def move(self, x = None, y = None):
if x == None:
self.x += np.random.randint(-1, 2)
else:
self.x += x
if y == None:
self.y += np.random.randint(-1, 2)
else:
self.y += y
if self.x < 0:
self.x = 0
elif self.x > self.size-1:
self.x = self.size-1
if self.y < 0:
self.y = 0
elif self.y > self.size-1:
self.y = self.size-1
class BlobEnv:
SIZE = 10
RETURN_IMAGES = True
MOVE_PENALTY = 1
ENEMY_PENALTY = 300
FOOD_REWARD = 25
OBSERVATION_SPACE_VALUES = (SIZE, SIZE, 3)  # or 4 values if RETURN_IMAGES is False (the two deltas)
ACTION_SPACE_SIZE = 9
PLAYER_N = 1
FOOD_N = 2
ENEMY_N = 3
d = {1: (255, 175, 0),
2: (0, 255, 0),
3: (0, 0, 255)}
def reset(self):
self.player = Blob(self.SIZE)
self.food = Blob(self.SIZE)
while self.food == self.player:
self.food = Blob(self.SIZE)
self.enemy = Blob(self.SIZE)
while self.enemy == self.player or self.enemy == self.food:
self.enemy = Blob(self.SIZE)
self.episode_step = 0
if self.RETURN_IMAGES:
observation = np.array(self.get_image())
else:
observation = (self.player-self.food) + (self.player-self.enemy)
return observation
def step(self, action):
self.episode_step += 1
self.player.action(action)
if self.RETURN_IMAGES:
new_observation = np.array(self.get_image())
else:
new_observation = (self.player-self.food) + (self.player-self.enemy)
if self.player == self.enemy:
reward = -self.ENEMY_PENALTY
elif self.player == self.food:
reward = self.FOOD_REWARD
else:
reward = -self.MOVE_PENALTY
done = False
if reward == self.FOOD_REWARD or reward == -self.ENEMY_PENALTY or self.episode_step >= 200:
done = True
return new_observation, reward, done
def render(self):
img = self.get_image()
img = img.resize((300, 300))
cv2.imshow("image", np.array(img))
cv2.waitKey(1)
def get_image(self):
env = np.zeros((self.SIZE, self.SIZE, 3), dtype=np.uint8)
env[self.food.x][self.food.y] = self.d[self.FOOD_N]
env[self.enemy.x][self.enemy.y] = self.d[self.ENEMY_N]
env[self.player.x][self.player.y] = self.d[self.PLAYER_N]
img = Img.fromarray(env, 'RGB')
return img
env = BlobEnv()
ep_rewards = [-200]
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)
# Create models folder
if not os.path.isdir('models'):
os.makedirs('models')
class ModifiedTensorBoard(TensorBoard):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.step = 1
self.writer = tf.summary.create_file_writer(self.log_dir)
self._log_write_dir = self.log_dir
def set_model(self, model):
self.model = model
self._train_dir = os.path.join(self._log_write_dir, 'train')
self._train_step = self.model._train_counter
self._val_dir = os.path.join(self._log_write_dir, 'validation')
self._val_step = self.model._test_counter
self._should_write_train_graph = False
def on_epoch_end(self, epoch, logs=None):
self.update_stats(**logs)
def on_batch_end(self, batch, logs=None):
pass
def on_train_end(self, _):
pass
def update_stats(self, **stats):
with self.writer.as_default():
for key, value in stats.items():
tf.summary.scalar(key, value, step = self.step)
self.writer.flush()
class DQNAgent:
def __init__(self):
self.model = self.create_model()
self.target_model = self.create_model()
self.target_model.set_weights(self.model.get_weights())
self.replay_memory = deque(maxlen = REPLAY_MEMORY_SIZE)
self.tensorboard = ModifiedTensorBoard(log_dir = f"logs/{MODEL_NAME}-{int(time.time())}")
self.target_update_counter = 0
def create_model(self):
model = Sequential()
model.add(Conv2D(256, (3, 3), input_shape = env.OBSERVATION_SPACE_VALUES))
model.add(Activation("relu"))
model.add(MaxPooling2D(2, 2))
model.add(Dropout(0.2))
model.add(Conv2D(256, (3, 3)))
model.add(Activation("relu"))
model.add(MaxPooling2D(2, 2))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(64))
model.add(Dense(env.ACTION_SPACE_SIZE, activation = "linear"))
model.compile(loss="mse", optimizer = Adam(lr=0.001), metrics=['accuracy'])
return model
def update_replay_memory(self, transition):
self.replay_memory.append(transition)
def get_qs(self, state):
return self.model.predict(np.array(state).reshape(-1, *state.shape) / 255)[0]
def train(self, terminal_state, step):
if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
return
minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)
current_states = np.array([transition[0] for transition in minibatch]) / 255
current_qs_list = self.model.predict(current_states)
new_current_states = np.array([transition[3] for transition in minibatch]) / 255
future_qs_list = self.target_model.predict(new_current_states)
X = []
y = []
for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
if not done:
max_future_q = np.max(future_qs_list[index])
new_q = reward + DISCOUNT * max_future_q
else:
new_q = reward
current_qs = current_qs_list[index]
current_qs[action] = new_q
X.append(current_state)
y.append(current_qs)
self.model.fit(np.array(X) / 255, np.array(y), batch_size = MINIBATCH_SIZE, verbose = 0, shuffle = False,
callbacks = [self.tensorboard] if terminal_state else None)
if terminal_state:
self.target_update_counter += 1
if self.target_update_counter > UPDATE_TARGET_EVERY:
self.target_model.set_weights(self.model.get_weights())
self.target_update_counter = 0
agent = DQNAgent()
for episode in tqdm(range(1, EPISODES + 1), ascii = True, unit = "episode"):
agent.tensorboard.step = episode
episode_reward = 0
step = 1
current_state = env.reset()
done = False
while not done:
if np.random.random() > epsilon:
action = np.argmax(agent.get_qs(current_state))
else:
action = np.random.randint(0, env.ACTION_SPACE_SIZE)
new_state, reward, done = env.step(action)
episode_reward += reward
if SHOW_PREVIEW and not episode % AGGREGATE_STATS_EVERY:
env.render()
agent.update_replay_memory((current_state, action, reward, new_state, done))
agent.train(done, step)
current_state = new_state
step += 1
ep_rewards.append(episode_reward)
if not episode % AGGREGATE_STATS_EVERY or episode == 1:
average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:])/len(ep_rewards[-AGGREGATE_STATS_EVERY:])
min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)
if min_reward >= MIN_REWARD:
agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')
if epsilon > MIN_EPSILON:
epsilon *= EPSILON_DECAY
epsilon = max(MIN_EPSILON, epsilon)