Deep Q-Learning Algorithm Same Output For Every Input - PullRequest
0 votes
/ 11 March 2020

I am trying a simple reinforcement learning task. I have a 10 x 10 grid and a box that occupies 4 squares of the grid. I want the algorithm to move the box to the center of the grid. I give a reward of 30 if the algorithm moves the box closer to the center in both x and y, 10 if it moves the box closer in only x or y, -10 if it moves the box farther away in x and y, and 100 if it moves the box onto the target. I took the algorithm from this link: https://towardsdatascience.com/deep-reinforcement-learning-build-a-deep-q-network-dqn-to-play-cartpole-with-tensorflow-2-and-gym-8e105744b998

The algorithm produces the same output for every input. However, during training there is a positive linear trend in the reward. If anyone could provide any insight, I would really appreciate it!
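To show what I mean by "the same output", this is roughly the kind of check I am doing (here model is just a placeholder for the DQN built in the linked tutorial, and the predict call stands in for however the network returns Q-values):

import numpy as np

# 'model' is a placeholder for the DQN from the tutorial above;
# swap .predict for whatever your network uses to produce Q-values.
for state in [1.0, 45.0, 90.0]:        # three very different box positions
    q_values = model.predict(np.atleast_2d(state))
    print(state, q_values)             # the printed rows all come out identical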

Here is the code for my environment:

import gym
from gym import error, spaces, utils
from gym.utils import seeding

import sys
from contextlib import closing
from six import StringIO
from gym import utils
from gym.envs.toy_text import discrete
import numpy as np
from random import randint
from gym.utils import seeding

MAP = [
    "_ _ _ _ _ _ _ _ _ _",
    "| | | | | | | | | | |",
    "| | | | | | | | | | |",
    "| | | | | | | | | | |",
    "| | | | | | | | | | |",
    "| | | | | | | | | | |",
    "| | | | | | | | | | |",
    "| | | | | | | | | | |",
    "| | | | | | | | | | |",
    "| | | | | | | | | | |",
    "| | | | | | | | | | |",
]

class BoxV2DEnv(discrete.DiscreteEnv):
    """Packing Problem V1

Description: The goal is for the agent to move the box to the center of the design domain.
            The box starts off in a random position. The box occupies 4 "squares" of the discretized domain.
            The design domain is 10 x 10.
            There are 81 box positions which means there are 81 discrete states.
            There is one optimal state (Center of design domain)
            The episode ends when the box reaches the middle position.
Destination: Center of design domain (10,10)
Box: Position defined by the top left corner, built from there
Actions: 8 discrete deterministic actions
        0: move down
        1: move up
        2: Move right
        3: Move left
        4: Move diagonal up, left
        5: Move diagonal up, right
        6: Move diagonal, down, left
        7: Move diagonal, down, right
Rewards: 
        There is a reward of +10 for getting closer to the center in one direction (x or y)
        There is a reward of +30 for getting closer to the center in both directions (x and y)
        There is a reward of -10 for getting farther from the center
        There is a reward of -20 for trying to move the box outside of the design domain
        There is an additional reward of 100 for getting the box in the middle
Rendering:
        The squares occupied by the box are shown with an X
"""

def __init__(self):
    # Sets the initial state of the RL problem
    # Initialize all "self" variables
    self.num_rows = 10 # Must be an even number
    self.num_cols = 10 # Must be an even number
    self.num_states = self.num_rows * self.num_cols

    # Define the action and observation space
    # they must be gym.spaces objects
    self.action_space = spaces.Discrete(8)
    self.num_actions= self.action_space.n
    self.observation_space = spaces.Discrete(100)
    # Utilize the staterep variable to store the information needed for rendering
    self.staterep = []
    for i in range(self.num_rows):
        self.staterep += [[]]
        for j in range(self.num_cols):
            self.staterep[i] += ["|_"]
    for k in range(self.num_rows):
        self.staterep[k][self.num_cols-1] = "|_|"
    # Random starting value from which to build the square
    init_coord = [randint(0,self.num_rows-2), randint(0,self.num_cols-2)] # [row, col]
    Square = [[init_coord[0], init_coord[1]],
              [init_coord[0], init_coord[1]+1],
              [init_coord[0]+1, init_coord[1]],
              [init_coord[0]+1, init_coord[1]+1]] 
    for l in range(0,4):
        self.staterep[Square[l][0]][Square[l][1]] =  "|X"

    # Define the state (for Q evaluation)
    self.coord = init_coord
    self.state = [self.encode(self.coord[0], self.coord[1], self.num_rows)]

    self.done = False
    self.reward = 0
    #self.seed()

def encode(self, coord_row, coord_col, num_rows):
    # Convert the position of the square to a state number
    i = (coord_row * num_rows) + (coord_col + 1)
    i = float(i)
    return i
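    # Example values (a sanity check I added, not from the tutorial): with
    # num_rows = 10, encode(4, 4, 10) returns 45.0, which is the state number
    # of the target position computed in step() below.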

def step(self, action):
    num_rows = self.num_rows
    num_cols = self.num_cols
    # Step the environment by one time step. Returns observations, reward, done, info
    target = [(num_rows-2)/2,(num_cols-2)/2] # Center of the design domain
    self.target = np.array(self.encode(target[0], target[1], self.num_rows))
    flag = 0
    # Move the square based on the input action
    if action == 0:
        new_coord = [self.coord[0] + 1, self.coord[1]]
    elif action == 1:
        new_coord = [self.coord[0] - 1, self.coord[1]]
    elif action == 2:
        new_coord = [self.coord[0], self.coord[1] + 1]
    elif action == 3:
        new_coord = [self.coord[0], self.coord[1] - 1]
    elif action == 4:
        new_coord = [self.coord[0] - 1, self.coord[1] - 1]
    elif action == 5:
        new_coord = [self.coord[0] - 1, self.coord[1] + 1]
    elif action == 6:
        new_coord = [self.coord[0] + 1, self.coord[1] - 1]
    elif action == 7:
        new_coord = [self.coord[0] + 1, self.coord[1] + 1]

    # Update the position of the square
    new_Square = [[new_coord[0], new_coord[1]],
                  [new_coord[0], new_coord[1]+1],
                  [new_coord[0]+1, new_coord[1]],
                  [new_coord[0]+1, new_coord[1]+1]]
    # Update the state
    new_state = [self.encode(new_coord[0], new_coord[1], num_rows)]
    if new_Square[0][0] < 0 or new_Square[0][1] < 0 or new_Square[3][0] > num_rows-2 or new_Square[3][1] > num_cols-2:
        #print("Invalid Step") #If the box is moved beyond the boundaries
        # Move it in the opposite direction
        if action == 0:
            coord = [self.coord[0] - 1, self.coord[1]]
        if action == 1:
            coord = [self.coord[0] + 1, self.coord[1]]
        if action == 2:
            coord = [self.coord[0], self.coord[1] - 1]
        if action == 3:
            coord = [self.coord[0], self.coord[1] + 1]
        if action == 4:
            coord = [self.coord[0] + 1, self.coord[1] + 1]
        if action == 5:
            coord = [self.coord[0] + 1, self.coord[1] - 1]
        if action == 6:
            coord = [self.coord[0] - 1, self.coord[1] + 1]
        if action == 7:
            coord = [self.coord[0] - 1, self.coord[1] - 1]

        new_coord = coord
        new_Square = [[new_coord[0], new_coord[1]],
                      [new_coord[0], new_coord[1]+1],
                      [new_coord[0]+1, new_coord[1]],
                      [new_coord[0]+1, new_coord[1]+1]]

        flag = 1
        new_state = [self.encode(new_coord[0], new_coord[1], num_rows)]# CHECK IF THIS MAKES SENSE
    #    self.reward = -20 # Discourage from going into wall
    if new_coord[0] == target[0] and new_coord[1] == target[1]:
        self.reward = 100
        self.done = True
        #print('Success') 
    elif abs(target[0] - new_coord[0]) < abs(target[0] - self.coord[0]) and abs(target[1] - new_coord[1]) < abs(target[1] - self.coord[1]):
        self.reward = 30 # Reward if closer to target in the x and y
    elif abs(target[0] - new_coord[0]) < abs(target[0] - self.coord[0]) or abs(target[1] - new_coord[1]) < abs(target[1] - self.coord[1]):
        self.reward = 10 # Reward if closer to target in x or y
    else:
        self.reward = -10
    #else: 
    #    self.reward = -1
    lr = 0.1
    gamma = 0.8
    # Prepare for the next iteration
    self.coord = new_coord
    self.states = [self.state, new_state]
    self.state = new_state

    return  np.array(self.states), self.reward, self.done#, self.Q
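# Rough usage sketch (my assumption about how the training loop drives this
# environment, not from the tutorial). Note that reset() below returns a
# single-element state while step() returns the pair [previous_state, new_state]:
#
#   env = BoxV2DEnv()
#   obs = env.reset()
#   next_obs, reward, done = env.step(env.action_space.sample())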

def reset(self):
    for i in range(self.num_rows):
        for j in range(self.num_cols):
            self.staterep[i][j] = "|_"
    for k in range(self.num_rows):
        self.staterep[k][self.num_cols-1] = "|_|"
    num_states = self.num_cols*self.num_rows
    num_actions = self.num_actions
    init_coord= [randint(0,self.num_rows-2), randint(0,self.num_cols-2)]
    self.init_coord = init_coord
    self.coord = init_coord
    Square = [[init_coord[0], init_coord[1]],
              [init_coord[0], init_coord[1]+1],
              [init_coord[0]+1, init_coord[1]],
              [init_coord[0]+1, init_coord[1]+1]] 
    for l in range(0,4):
        self.staterep[Square[l][0]][Square[l][1]] =  "|X"
    self.state = [self.encode(self.coord[0], self.coord[1],self.num_rows)]
    self.done = False
    self.reward = 0
    return np.array(self.state)

def render(self, mode='human', close=False):
    # Renders one frame of the environment
    for i in range(self.num_rows):
        for j in range(self.num_cols):
            print(self.staterep[i][j], end = " ")
        print("")
...