Обучение с подкреплением: агент, оставляющий за собой след - PullRequest
0 голосов
/ 15 ноября 2018

Я знакомлюсь с искусственным интеллектом (в основном по YouTube и другим онлайн-источникам). Я смотрел видео об обучении с подкреплением (задача gridworld).

Приведённый код написан на Python, а я не слишком хорошо знаком с Python. Я решил изменить этот код, чтобы создать собственные сценарии использования.

Это код:

Learner.py:

import World
import threading
import time

# Discount factor applied to the best next-state value in the Q update.
discount = 0.3
actions = World.actions

# Every grid coordinate, including wall cells.
states = [(col, row) for col in range(World.x) for row in range(World.y)]

# Q[state][action] -> estimated value; every entry starts at 0.1 and the
# matching triangle on the board is coloured accordingly.
Q = {}
for state in states:
    Q[state] = {}
    for action in actions:
        Q[state][action] = 0.1
        World.set_cell_score(state, action, 0.1)

# Special cells get their own reward as the initial value of every action.
for (col, row, _colour, reward) in World.specials:
    cell = (col, row)
    for action in actions:
        Q[cell][action] = reward
        World.set_cell_score(cell, action, reward)


def do_action(action):
    """Apply ``action`` in the world and report the resulting transition.

    Returns a tuple ``(s, action, r, s2)`` where ``s`` is the player position
    before the move, ``s2`` the position after it, and ``r`` the change in
    ``World.score`` caused by the move. Returns ``None`` when ``action`` is
    not one of the four known actions.
    """
    state_before = World.player
    score_before = World.score

    # Deltas are listed in the same order as World.actions: up, down, left, right.
    deltas = ((0, -1), (0, 1), (-1, 0), (1, 0))
    for known_action, (dx, dy) in zip(actions, deltas):
        if action == known_action:
            World.try_move(dx, dy)
            break
    else:
        return None

    state_after = World.player
    reward = World.score - score_before
    return state_before, action, reward, state_after


def max_Q(s):
    """Return ``(action, value)`` for the highest-valued action in state ``s``.

    Ties go to the action that appears first in ``Q[s]``. If ``Q[s]`` has no
    actions at all, ``(None, None)`` is returned.
    """
    return max(Q[s].items(), key=lambda entry: entry[1], default=(None, None))


def inc_Q(s, a, alpha, inc):
    """Blend ``inc`` into ``Q[s][a]`` with learning rate ``alpha``.

    The new value is ``(1 - alpha) * old + alpha * inc``; the board triangle
    for ``(s, a)`` is recoloured to match.
    """
    action_values = Q[s]
    action_values[a] = action_values[a] * (1 - alpha) + alpha * inc
    World.set_cell_score(s, a, action_values[a])


def run():
    """Learning loop: act greedily forever and update Q after every move.

    Each iteration picks the highest-valued action for the current state,
    performs it, and moves ``Q[s][a]`` towards ``r + discount * max Q[s2]``.
    The learning rate is ``step ** -0.1``; the step counter is reset to 1
    whenever the world reports that an episode has ended. Never returns.
    """
    time.sleep(1)
    learning_rate = 1
    step = 1
    while True:
        # Choose and perform the currently best-looking action.
        greedy_action, _ = max_Q(World.player)
        state, taken, reward, next_state = do_action(greedy_action)

        # Move Q towards the reward plus the discounted best next value.
        _, best_next_value = max_Q(next_state)
        inc_Q(state, taken, learning_rate, reward + discount * best_next_value)

        # Start a fresh episode if the world flagged a restart.
        step += 1.0
        if World.has_restarted():
            World.restart_game()
            time.sleep(0.01)
            step = 1.0

        # Decay the learning rate with the step count.
        learning_rate = step ** -0.1

        # Increase this delay if the game runs too fast to follow.
        time.sleep(0.05)


# The learner runs in a daemon thread; the main thread is handed to the
# Tk main loop started by World.start_game().
t = threading.Thread(target=run, daemon=True)
t.start()
World.start_game()

World.py:

from tkinter import *
# Root Tk window; key bindings and the main loop are attached to it.
master = Tk()

# Size of each action triangle, as a fraction of a cell's side.
triangle_size = 0.1
# Value range mapped onto the red-to-green colour scale in set_cell_score;
# values outside this range are clamped.
cell_score_min = -0.2
cell_score_max = 0.2
# Side length of one grid cell in pixels.
Width = 50
# Grid dimensions in cells (columns, rows).
(x, y) = (16, 16)
# Order matters: create_triangle and Learner.do_action index into this list.
actions = ["up", "down", "left", "right"]

board = Canvas(master, width=x*Width, height=y*Width)
# Current player position as (column, row).
player = (13, 0)
# Running score; the learner uses its change per move as the reward.
score = 100
# Set by try_move when a terminal cell is reached; cleared by restart_game.
restart = False
# Added to the score on every attempted move (see try_move).
walk_reward = .01

# Cells the player cannot enter; drawn in black. Some coordinates are listed
# more than once.
walls = [(0, 0), (0, 1), (0, 2), (0, 3),(0, 4), (0, 5), (0, 6), (0, 7),(0, 8), (0, 9), (0, 10), (0, 11),(0, 12), (0, 13), (0, 14), (0, 15),
         (0, 0), (1, 0), (2, 0), (3, 0), (4, 0),(5, 0), (6, 0), (7, 0), (8, 0),(9, 0), (10, 0), (11, 0), (12, 0), (15, 0),
         (15, 0), (15, 1), (15, 2), (15, 3),(15, 4), (15, 5), (15, 6), (15, 7),(15, 8), (15, 9), (15, 10), (15, 11),(15, 12), (15, 13), (15, 14), (15, 15),
         (0, 15), (1, 15), (2, 15), (3, 15), (4, 15),(5, 15), (6, 15), (7, 15), (8, 15),(9, 15), (10, 15), (11, 15), (12, 15),(13,15),(14,15), (15, 15),
         (2, 3), (2, 4), (2, 5), (3, 3), (3, 4),(3, 5), (4, 3), (4, 4), (4, 5),
         (7, 3), (7, 4), (7, 5), (8, 3), (8, 4),(8, 5), (9, 3), (9, 4), (9, 5),
         (1, 14), (1, 13), (1, 12), (1, 11), (1, 10),(1, 9), (1, 8), 
         (2, 14), (2, 13), (2, 12), (2, 11), (2, 10),(2, 9), (2, 8), 
         (3, 10),(3, 9), (3, 8), 
         (4, 10),(4, 9), (4, 8), 
         (5, 10),(5, 9), (5, 8), 
         (6, 10),(6, 9), (6, 8), 
         (7, 10),(7, 9), (7, 8), 
         (8, 10),(8, 9), (8, 8), 
         (9, 10),(9, 9), (9, 8), 
         (10, 10),(10, 9), (10, 8), 
         (6, 13), (7, 13), (8, 13),
         (6, 12), (7, 12), (8, 12),
         ]
# Terminal cells as (column, row, colour, reward); reaching one ends the episode.
specials = [(14, 0, "green", -2)]
# Trail ("crumb") cells in the same (column, row, colour, reward) layout.
# NOTE(review): only the starting cell is ever listed here; nothing in this
# module appends to the list as the player moves.
crum = [(player[0],player[1],"blue",-.05)]
# Maps (column, row) -> {action: canvas item id of its triangle}; filled by render_grid.
cell_scores = {}


def create_triangle(i, j, action):
    """Draw the small white triangle for ``action`` inside cell ``(i, j)``.

    The triangle sits against the cell edge that corresponds to the action
    and points towards that edge. Returns the canvas item id, or ``None``
    when ``action`` is not one of the four known actions.
    """
    mid_x = i + 0.5
    mid_y = j + 0.5
    right = i + 1
    bottom = j + 1
    outline_width = 1

    if action == actions[0]:
        points = ((mid_x - triangle_size) * Width, (j + triangle_size) * Width,
                  (mid_x + triangle_size) * Width, (j + triangle_size) * Width,
                  mid_x * Width, j * Width)
    elif action == actions[1]:
        points = ((mid_x - triangle_size) * Width, (bottom - triangle_size) * Width,
                  (mid_x + triangle_size) * Width, (bottom - triangle_size) * Width,
                  mid_x * Width, bottom * Width)
        # The "down" triangle is created with a different width option
        # than the other three.
        outline_width = 5
    elif action == actions[2]:
        points = ((i + triangle_size) * Width, (mid_y - triangle_size) * Width,
                  (i + triangle_size) * Width, (mid_y + triangle_size) * Width,
                  i * Width, mid_y * Width)
    elif action == actions[3]:
        points = ((right - triangle_size) * Width, (mid_y - triangle_size) * Width,
                  (right - triangle_size) * Width, (mid_y + triangle_size) * Width,
                  right * Width, mid_y * Width)
    else:
        return None

    return board.create_polygon(*points, fill="white", width=outline_width)


def render_grid():
    """Draw the whole board and register every action triangle.

    Each cell gets a white square plus one triangle per action; the triangle
    ids are stored in ``cell_scores[(i, j)][action]``. Special cells, walls
    and crumb cells are then drawn, in that order, as filled squares.
    """
    def paint_cell(col, row, colour):
        # One filled square covering the whole cell.
        board.create_rectangle(col * Width, row * Width,
                               (col + 1) * Width, (row + 1) * Width,
                               fill=colour, width=1)

    for col in range(x):
        for row in range(y):
            paint_cell(col, row, "white")
            cell_scores[(col, row)] = {
                action: create_triangle(col, row, action) for action in actions
            }

    for (col, row, colour, _reward) in specials:
        paint_cell(col, row, colour)
    for (col, row) in walls:
        paint_cell(col, row, "black")
    for (col, row, colour, _reward) in crum:
        paint_cell(col, row, colour)

# Draw the board once at import time; this also fills cell_scores, which
# set_cell_score reads.
render_grid()


def set_cell_score(state, action, val):
    """Recolour the triangle of ``(state, action)`` according to ``val``.

    ``val`` is mapped linearly from ``[cell_score_min, cell_score_max]`` onto
    a red-to-green scale (red at the minimum, green at the maximum); values
    outside the range are clamped.

    Args:
        state: ``(column, row)`` key into ``cell_scores``.
        action: action name whose triangle should be recoloured.
        val: the value to visualise.

    Raises:
        KeyError: if ``state`` or ``action`` has no registered triangle.
    """
    triangle = cell_scores[state][action]
    green_dec = int(min(255, max(0, (val - cell_score_min) * 255.0 / (cell_score_max - cell_score_min))))
    red_dec = 255 - green_dec
    # Zero-pad each component on the LEFT: 5 must become "05", not "50".
    color = "#{:02x}{:02x}00".format(red_dec, green_dec)
    board.itemconfigure(triangle, fill=color)


def try_move(dx, dy):
    """Attempt to move the player by ``(dx, dy)`` cells and update the score.

    A pending restart is carried out first. ``walk_reward`` is added to the
    score for every attempt. The player only moves if the target cell lies
    inside the grid and is not a wall. If the target coordinates match a
    special or crumb cell, ``walk_reward`` is taken back, that cell's reward
    is added, the outcome is printed and the restart flag is set.
    """
    global player, score, restart
    if restart:
        restart_game()

    new_x = player[0] + dx
    new_y = player[1] + dy
    score += walk_reward

    in_bounds = 0 <= new_x < x and 0 <= new_y < y
    if in_bounds and (new_x, new_y) not in walls:
        board.coords(me,
                     new_x*Width+Width*2/10, new_y*Width+Width*2/10,
                     new_x*Width+Width*8/10, new_y*Width+Width*8/10)
        player = (new_x, new_y)

    # Specials are checked before crumbs; the first match ends the episode.
    for (cell_x, cell_y, _colour, reward) in specials + crum:
        if new_x == cell_x and new_y == cell_y:
            score -= walk_reward
            score += reward
            if score > 0:
                print("Success! score: ", score)
            else:
                print("Fail! score: ", score)
            restart = True
            return


def call_up(event):
    """Key handler bound to <Up>: try to move by (0, -1); ``event`` is unused."""
    try_move(0, -1)


def call_down(event):
    """Key handler bound to <Down>: try to move by (0, 1); ``event`` is unused."""
    try_move(0, 1)


def call_left(event):
    """Key handler bound to <Left>: try to move by (-1, 0); ``event`` is unused."""
    try_move(-1, 0)


def call_right(event):
    """Key handler bound to <Right>: try to move by (1, 0); ``event`` is unused."""
    try_move(1, 0)


def restart_game():
    """Start a new episode.

    Puts the player back on cell (13, 0), sets the score to 1, clears the
    restart flag and moves the player's rectangle to the start cell.
    """
    global player, score, restart
    player = (13, 0)
    score = 1
    restart = False

    col, row = player
    board.coords(me,
                 col*Width+Width*2/10, row*Width+Width*2/10,
                 col*Width+Width*8/10, row*Width+Width*8/10)

def has_restarted():
    """Return the restart flag: True once try_move has hit a terminal cell."""
    return restart

# Arrow keys move the player manually through the same try_move path.
master.bind("<Up>", call_up)
master.bind("<Down>", call_down)
master.bind("<Right>", call_right)
master.bind("<Left>", call_left)

# The player marker: an orange rectangle inset by 2/10 of a cell on each side.
me = board.create_rectangle(player[0]*Width+Width*2/10, player[1]*Width+Width*2/10,
                            player[0]*Width+Width*8/10, player[1]*Width+Width*8/10, fill="orange", width=1, tag="me")

board.grid(row=0, column=0)


def start_game():
    """Enter the Tk main loop of the root window."""
    master.mainloop()

Я пытаюсь заставить агента оставлять след: перекрашивать белые клетки (состояния) в синий цвет и назначать им очень низкую награду, чтобы отбить у него желание посещать те же самые состояния. В коде, который я модифицировал, в синий перекрашивается только первая клетка (та, с которой агент стартует), но ни одна из последующих. Я хотел бы, чтобы клетка становилась синей после того, как агент её покинет.

Если кто-нибудь сможет показать, что я делаю неправильно в своём подходе, и дать совет или помочь с задачей оставления следа, я буду очень признателен. crum — это переменная, которую я использую для следа.

...