Я пытаюсь решить проблему mountaincar-v0 и не могу добиться успеха с ней, и я хочу понять, почему. У меня есть два .py файла: game.py, где я инкапсулирую взаимодействие со средой, и main.py, где я пытаюсь учиться. Что я пробовал:
- Различные значения learning_rate и discount_factor
- Различные функции quantize_state
- Различные функции epsilon
- Функция вознаграждения "Взломать"
И я не могу достичь цели. Содержит ли мой код какие-либо серьезные проблемы или мое понимание q-learning неверно? Спасибо за вашу помощь
game.py
import gym
env = gym.make("MountainCar-v0")
class Feedback:
def __init__(self, elements):
self.elements = elements
def max_position(self):
return max(self.elements, key=lambda x: x.finish_state[0]).finish_state[0]
class FeedbackElement:
def __init__(self, start_state, action, reward, finish_state):
self.start_state = start_state
self.action = action
self.reward = reward
self.finish_state = finish_state
def play_games(strategy, count):
result = []
for i in range(count):
result.append(play_game(strategy, False))
return result
def update_reward(reward, new_state):
return reward
def play_game(strategy, render):
feedback_elements = []
state = quantize_state(env.reset())
current_move = 0
while True:
if render:
env.render()
action = strategy.action(state)
new_state, reward, done, info = env.step(action)
current_move += 1
new_state = quantize_state(new_state)
reward = update_reward(reward, new_state)
feedback_elements.append(FeedbackElement(state, action, reward, new_state))
state = new_state
if done:
return Feedback(feedback_elements)
def quantize_state(state):
return round(state[0], 4), round(state[1], 2)
main.py
import game
import random
import time
import numpy as np
import functools
import matplotlib.pyplot as plt
class Strategy:
def __init__(self):
self.epochs = 100
self.current_epoch = 0
self.q = dict()
self.is_learning = False
def play(self):
while True:
feedback = game.play_game(self, True)
print(feedback.max_position())
def learning(self):
self.is_learning = True
self.current_epoch = 0
self.q = dict()
average_maxes = []
for i in range(self.epochs):
start = time.time()
feedback = game.play_games(self, 100)
max_sum = functools.reduce(lambda a, b: a + b.max_position(), feedback, feedback[0].max_position())
average_maxes.append(max_sum / len(feedback))
self.update_q([element for f in feedback for element in f.elements])
self.current_epoch += 1
print(self.current_epoch)
end = time.time()
print(end - start)
plt.plot(average_maxes)
plt.show()
self.is_learning = False
def update_q(self, feedback):
random.shuffle(feedback)
for f in feedback:
learning_rate = 0.9
discount_factor = 0.99
start_values = self.values_for_state(f.start_state)
end_values = self.values_for_state(f.finish_state)
self.q[f.start_state][f.action] = start_values[f.action] + learning_rate * (f.reward + discount_factor * max(end_values) - start_values[f.action])
def action(self, state):
if self.is_learning:
return self.epsilon_greedy_action(state)
else:
return self.greedy_action(state)
def epsilon_greedy_action(self, state):
epsilon = self.epsilon_value()
rnd = random.uniform(0, 1)
if epsilon > rnd:
return self.greedy_action(state)
else:
return random.randint(0, 2)
def epsilon_value(self):
return 0.9
def greedy_action(self, state):
values = self.values_for_state(state)
return np.argmax(values)
def values_for_state(self, state):
values = self.q.get(state)
if values:
return values
else:
self.q[state] = [0 for i in range(3)]
return self.q[state]
s = Strategy()
s.learning()
s.play()