Q-learning formula explained: code example
Example: Q-learning algorithm
# Build the Q-table: a freshly randomised one when no start file is given,
# otherwise the table previously pickled at the path in start_q_table.
if start_q_table is None:
    # Keys are pairs of 2-tuples, matching the (player - food, player - enemy)
    # observations used to index the table during training. Every component
    # ranges over -SIZE+1 .. SIZE-1.
    offsets = range(-SIZE + 1, SIZE)
    q_table = {}
    for food_dx in offsets:
        for food_dy in offsets:
            for enemy_dx in offsets:
                for enemy_dy in offsets:
                    # Four initial Q-values (one per action index 0..3),
                    # each drawn uniformly from [-5, 0).
                    q_table[((food_dx, food_dy), (enemy_dx, enemy_dy))] = [
                        np.random.uniform(-5, 0) for _ in range(4)
                    ]
else:
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only load Q-tables that come from a trusted source.
    with open(start_q_table, "rb") as f:
        q_table = pickle.load(f)
# Train with tabular Q-learning: one episode per outer iteration, at most
# 200 steps per episode.
episode_rewards = []
for episode in range(HM_EPISODES):
    # Fresh player, food and enemy for every episode.
    player = Blob()
    food = Blob()
    enemy = Blob()

    # Every SHOW_EVERY-th episode: print progress and render the episode.
    show = episode % SHOW_EVERY == 0
    if show:
        recent_rewards = episode_rewards[-SHOW_EVERY:]
        # np.mean([]) emits a RuntimeWarning and returns nan; report nan
        # directly while there is no reward history yet.
        mean_reward = np.mean(recent_rewards) if recent_rewards else float("nan")
        print(f"on #{episode}, epsilon is {epsilon}")
        print(f"{SHOW_EVERY} ep mean: {mean_reward}")

    episode_reward = 0
    for _ in range(200):
        obs = (player - food, player - enemy)

        # Epsilon-greedy: exploit the best known action with probability
        # 1 - epsilon, otherwise pick one of the 4 actions at random.
        if np.random.random() > epsilon:
            action = np.argmax(q_table[obs])
        else:
            action = np.random.randint(0, 4)
        player.action(action)

        # Reward for the position reached; the enemy check takes precedence
        # over the food check.
        if player.x == enemy.x and player.y == enemy.y:
            reward = -ENEMY_PENALTY
        elif player.x == food.x and player.y == food.y:
            reward = FOOD_REWARD
        else:
            reward = -MOVE_PENALTY
        episode_over = reward == FOOD_REWARD or reward == -ENEMY_PENALTY

        new_obs = (player - food, player - enemy)
        max_future_q = np.max(q_table[new_obs])
        current_q = q_table[obs][action]

        if reward == FOOD_REWARD:
            # Reaching the food pins the Q-value to the reward itself
            # instead of bootstrapping from the next state.
            new_q = FOOD_REWARD
        else:
            # Q-learning update:
            #   Q(s, a) <- (1 - lr) * Q(s, a) + lr * (r + discount * max_a' Q(s', a'))
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (
                reward + DISCOUNT * max_future_q
            )
        q_table[obs][action] = new_q

        if show:
            # Paint each entity's cell with its entry from d (presumably a
            # colour), then upscale the SIZE x SIZE grid for display.
            env = np.zeros((SIZE, SIZE, 3), dtype=np.uint8)
            env[food.x][food.y] = d[FOOD_N]
            env[player.x][player.y] = d[PLAYER_N]
            env[enemy.x][enemy.y] = d[ENEMY_N]
            img = Image.fromarray(env, 'RGB')
            img = img.resize((300, 300))
            cv2.imshow("image", np.array(img))
            # Hold the final frame for 500 ms so the outcome is visible;
            # pressing 'q' abandons the rest of this episode.
            delay_ms = 500 if episode_over else 1
            if cv2.waitKey(delay_ms) & 0xFF == ord('q'):
                break

        episode_reward += reward
        if episode_over:
            break

    # NOTE(review): within this section episode_reward is never appended to
    # episode_rewards and epsilon is never changed. Confirm both happen in the
    # code that follows; otherwise the printed mean stays nan and exploration
    # never decreases.