Stuck on the MountainCar-v0 OpenAI Gym problem (Q-learning)
0 votes
/ April 27, 2020

I am trying to solve the MountainCar-v0 problem and cannot crack it, and I want to understand why. I have two .py files: game.py, where I encapsulate the interaction with the environment, and main.py, where the learning happens. What I have tried:

  • Different values of learning_rate and discount_factor
  • Different quantize_state functions
  • Different epsilon functions
  • "Hacking" the reward function (via the update_reward hook in game.py)

And I still cannot reach the goal. Does my code contain any serious problems, or is my understanding of Q-learning wrong? Thanks for your help.
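
For reference, the update rule I am trying to implement is the standard tabular Q-learning update, with learning rate α and discount factor γ:

Q(s, a) ← Q(s, a) + α · (r + γ · max_a′ Q(s′, a′) − Q(s, a))

This is what the assignment in update_q below is meant to compute.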

game.py

import gym


env = gym.make("MountainCar-v0")


# All transitions collected over one episode.
class Feedback:
    def __init__(self, elements):
        self.elements = elements

    def max_position(self):
        # Furthest position the car reached during the episode.
        return max(self.elements, key=lambda x: x.finish_state[0]).finish_state[0]


# A single (state, action, reward, next_state) transition.
class FeedbackElement:
    def __init__(self, start_state, action, reward, finish_state):
        self.start_state = start_state
        self.action = action
        self.reward = reward
        self.finish_state = finish_state


def play_games(strategy, count):
    result = []

    for i in range(count):
        result.append(play_game(strategy, False))

    return result


def update_reward(reward, new_state):
    # Hook for the reward "hack" experiments mentioned above; currently
    # returns the environment reward unchanged.
    return reward
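

# Sketch of the kind of reward "hack" mentioned in the list above
# (illustrative shaping, not the stock MountainCar reward): add a bonus
# for building up speed, since the constant -1 per step on its own gives
# the agent no gradient toward the flag.
def update_reward_with_speed_bonus(reward, new_state):
    return reward + 10 * abs(new_state[1])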


def play_game(strategy, render):
    feedback_elements = []
    state = quantize_state(env.reset())
    current_move = 0
    while True:
        if render:
            env.render()

        action = strategy.action(state)
        new_state, reward, done, info = env.step(action)
        current_move += 1
        new_state = quantize_state(new_state)
        reward = update_reward(reward, new_state)
        feedback_elements.append(FeedbackElement(state, action, reward, new_state))
        state = new_state

        if done:
            return Feedback(feedback_elements)


def quantize_state(state):
    # Discretize the continuous observation: position rounded to 4 decimal
    # places (~18,000 buckets over [-1.2, 0.6]), velocity to 2 (15 buckets).
    return round(state[0], 4), round(state[1], 2)
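

For completeness, here is a much coarser quantize_state variant along the lines of the alternatives I mentioned trying (the bin counts are illustrative, not tuned); fixed-width bins keep the Q-table small enough to actually fill with a few thousand games:

import numpy as np

# MountainCar-v0 observation bounds: position in [-1.2, 0.6],
# velocity in [-0.07, 0.07].
POSITION_BINS = np.linspace(-1.2, 0.6, 20)
VELOCITY_BINS = np.linspace(-0.07, 0.07, 20)

def quantize_state_binned(state):
    # Map the continuous observation to a pair of bin indices.
    return (int(np.digitize(state[0], POSITION_BINS)),
            int(np.digitize(state[1], VELOCITY_BINS)))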

main.py

import game
import random
import time
import numpy as np
import matplotlib.pyplot as plt


class Strategy:
    def __init__(self):
        self.epochs = 100
        self.current_epoch = 0
        self.q = dict()  # state -> list of action values for the 3 actions
        self.is_learning = False

    def play(self):
        while True:
            feedback = game.play_game(self, True)
            print(feedback.max_position())

    def learning(self):
        self.is_learning = True
        self.current_epoch = 0
        self.q = dict()

        average_maxes = []

        for i in range(self.epochs):
            start = time.time()
            # Play a batch of games with the current epsilon-greedy policy.
            feedback = game.play_games(self, 100)
            # Track the average of the best positions reached in the batch.
            max_sum = sum(f.max_position() for f in feedback)
            average_maxes.append(max_sum / len(feedback))
            # Batch-update Q from every transition collected this epoch.
            self.update_q([element for f in feedback for element in f.elements])
            self.current_epoch += 1
            print(self.current_epoch)
            end = time.time()
            print(end - start)

        plt.plot(average_maxes)
        plt.show()

        self.is_learning = False

    def update_q(self, feedback):
        learning_rate = 0.9
        discount_factor = 0.99
        # Replay the collected transitions in random order and apply the
        # tabular Q-learning update to each one.
        random.shuffle(feedback)
        for f in feedback:
            start_values = self.values_for_state(f.start_state)
            end_values = self.values_for_state(f.finish_state)
            td_target = f.reward + discount_factor * max(end_values)
            start_values[f.action] += learning_rate * (td_target - start_values[f.action])

    def action(self, state):
        if self.is_learning:
            return self.epsilon_greedy_action(state)
        else:
            return self.greedy_action(state)

    def epsilon_greedy_action(self, state):
        # Note the convention here: epsilon is the probability of acting
        # greedily, so epsilon_value() == 0.9 leaves only 10% exploration.
        epsilon = self.epsilon_value()
        if random.uniform(0, 1) < epsilon:
            return self.greedy_action(state)
        else:
            return random.randint(0, 2)

    def epsilon_value(self):
        # Constant schedule; a decaying variant is sketched after the listing.
        return 0.9

    def greedy_action(self, state):
        values = self.values_for_state(state)
        return np.argmax(values)

    def values_for_state(self, state):
        # Lazily create a zero-initialized action-value row for unseen states.
        if state not in self.q:
            self.q[state] = [0.0] * 3
        return self.q[state]


s = Strategy()
s.learning()
s.play()
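
And for reference, one of the alternative epsilon functions I experimented with was a decaying schedule along these lines (the constants are illustrative), as a drop-in replacement for Strategy.epsilon_value; with my convention epsilon is the probability of acting greedily, so it grows toward 1 as training progresses:

    def epsilon_value(self):
        # Decaying-exploration variant: start mostly random, end mostly
        # greedy by the final epoch.
        fraction = self.current_epoch / max(1, self.epochs)
        return 0.1 + 0.8 * fraction  # probability of the greedy action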