q-learning: ValueError: 'a' cannot be empty unless no samples are taken
0 votes / 29 October 2019

I am trying to implement a Q-learning algorithm for reinforcement learning; this is my code:

import numpy as np
R = np.matrix ([[-1, 0, -1, -1, 0, -1, -1, -1, -1], 
            [-1, -1, 100, 0, -1, -1, -1, -1, -1], 
            [-1, -1, 100, -1, -1, -1, -1, -1, -1],
            [-1, -1, -1, -1, -1, -1, -1, -1, -1], 
            [-1, -1, -1, -1, -1, 100, 0, -1, -1], 
            [-1, -1, -1, -1, -1, 100, -1, -1, -1], 
            [-1, -1, -1, -1, -1, -1, -1, 100, 0], 
            [-1, -1, -1, -1, -1, -1, -1, 100, -1],
            [-1, -1, -1, -1, -1, -1, -1, -1, -1]])
# Q matrix
Q = np.matrix(np.zeros([9,9]))

# Gamma (learning parameter)
gamma = 0.4

# Initial state. (Usually to be chosen at random)
initial_state = 1

# This function returns all available actions in the state given as an argument
def available_actions(state):
    current_state_row = R[state,]
    av_act = np.where(current_state_row >= 0) [1]
    return av_act

# Get available actions in the current state
available_act = available_actions(initial_state)

# This function chooses at random which action to be performed within the range of all the available actions.
def sample_next_action(available_actions_range):
    next_action = int(np.random.choice(available_act, 1))
    return next_action

#sample next action to be performed
action = sample_next_action(available_act)

# This function updates the Q matrix according to the path selected and the Q learning algorithm
def update(current_state, action, gamma):
    max_index = np.where(Q[action,] == np.max(Q[action,]))[1]

    if max_index.shape[0] > 1:
        max_index = int(np.random.choice(max_index, size=1))
    else:
        max_index = int(max_index)
    max_value = Q[action, max_index]

    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value

# Update Q matrix
update(initial_state, action, gamma)

# Training
# Train over 10000 iterations. (Re-iterate the process above)
for i in range(10000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)

# Normalize the trained Q matrix
print ("Trained Q matrix:")
print (Q / np.max(Q) * 100)

# Testing 

# Goal state = 2

current_state = 1
steps = [current_state]

while current_state != 2:
    next_step_index = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]

    if next_step_index.shape[0] > 1:
        next_step_index = int(np.random.choice(next_step_index, size=1))
    else:
        next_step_index = int(next_step_index)

    steps.append(next_step_index)
    current_state = next_step_index

# Print selected sequence of steps
print("Selected path:")
print(steps)

But I always get this error, which I don't understand:

ValueError                                Traceback (most recent call last)
in
     46 current_state = np.random.randint(0, int(Q.shape[0]))
     47 available_act = available_actions(current_state)
---> 48 action = sample_next_action(available_act)
     49 update(current_state, action, gamma)
     50

in sample_next_action(available_actions_range)
     19 # This function chooses at random which action to be performed within the range of all the available actions.
     20 def sample_next_action(available_actions_range):
---> 21     next_action = int(np.random.choice(available_act, 1))
     22     return next_action
     23

mtrand.pyx in mtrand.RandomState.choice()

ValueError: 'a' cannot be empty unless no samples are taken

Any help would be appreciated!

1 Answer

0 votes / 29 October 2019

There are a few issues with the code:

  1. Change the data structures of R and Q from np.matrix to plain NumPy arrays:

    R = np.array(...)
    Q = np.zeros([9, 9])

  2. Change the R matrix so that state 3 and state 8 each have at least one available action: simply add at least one non-negative value to those rows. (The sketch just before the full corrected code below shows why rows with no non-negative entries cause the error.)

  3. Change the definition of available_actions to index the np.where result with [0] (the sketch right after this list shows why):

    def available_actions(state):
        current_state_row = R[state, :]
        av_act = np.where(current_state_row >= 0)[0]
        return av_act
  4. Change line 39 to use the correct indexing:

    max_index = np.where(Q[action, :] == np.max(Q[action, :]))[0]

  5. Change line 73 to use the correct indexing:

    next_step_index = np.where(Q[current_state, :] == np.max(Q[current_state, :]))[0]
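
The reason points 3–5 switch from [1] to [0]: with np.matrix a row slice stays 2-D, so np.where returns (row indices, column indices) and the useful part is element [1]; with a plain np.array the row is 1-D and np.where returns a single index array at element [0]. A minimal standalone sketch (separate from the code in the question) illustrating the difference:

    import numpy as np

    row_as_matrix = np.matrix([[-1, 0, -1]])[0, :]   # still 2-D, shape (1, 3)
    row_as_array  = np.array([-1, 0, -1])            # 1-D, shape (3,)

    print(np.where(row_as_matrix >= 0))  # (array([0]), array([1])) -> take [1] for column indices
    print(np.where(row_as_array >= 0))   # (array([1]),)            -> take [0] for indices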

With these changes you should be able to get a result.
The final output will be:

Selected path: [1, 2]
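
For context, here is a minimal standalone sketch (not part of the corrected program) of why the original code raised the error: in the original R, state 3 and state 8 contain no entry >= 0, so available_actions returns an empty array and np.random.choice refuses to draw a sample from it:

    import numpy as np

    row_3 = np.array([-1, -1, -1, -1, -1, -1, -1, -1, -1])  # state 3 in the original R
    available = np.where(row_3 >= 0)[0]
    print(available)                 # [] -- no action is available from this state
    np.random.choice(available, 1)   # ValueError: 'a' cannot be empty unless no samples are taken

The full corrected code: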

import numpy as np
R = np.array([[-1, 0, -1, -1, 0, -1, -1, -1, -1],
            [-1, -1, 100, 0, -1, -1, -1, -1, -1],
            [-1, -1, 100, -1, -1, -1, -1, -1, -1],
            [-1, -1, -1, -1, -1, 0, -1, -1, -1],
            [-1, -1, -1, -1, -1, 100, 0, -1, -1],
            [-1, -1, -1, -1, -1, 100, -1, -1, -1],
            [-1, -1, -1, -1, -1, -1, -1, 100, 0],
            [-1, -1, -1, -1, -1, -1, -1, 100, -1],
            [-1, -1, -1, -1, 0, -1, -1, -1, -1]])
# Q matrix
Q = np.zeros([9,9])

# Gamma (learning parameter)
gamma = 0.4

# Initial state. (Usually to be chosen at random)
initial_state = 1

# This function returns all available actions in the state given as an argument
def available_actions(state):
    current_state_row = R[state, :]
    av_act = np.where(current_state_row >= 0)[0]
    return av_act

# Get available actions in the current state
available_act = available_actions(initial_state)

# This function chooses at random which action to be performed within the range of all the available actions.
def sample_next_action(available_actions_range):
    next_action = int(np.random.choice(available_actions_range, 1))  # sample from the passed-in range, not the global
    return next_action

#sample next action to be performed
action = sample_next_action(available_act)

# This function updates the Q matrix according to the path selected and the Q learning algorithm
def update(current_state, action, gamma):
    max_index = np.where(Q[action, :] == np.max(Q[action, :]))[0]

    if max_index.shape[0] > 1:
        max_index = int(np.random.choice(max_index, size=1))
    else:
        max_index = int(max_index)
    max_value = Q[action, max_index]

    # Q learning formula
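    # (In this toy example the chosen action's index doubles as the next state,
    # so max_value above is the best Q-value reachable from that next state.)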
    Q[current_state, action] = R[current_state, action] + gamma * max_value

# Update Q matrix
update(initial_state, action, gamma)

# Training
# Train over 10000 iterations. (Re-iterate the process above)
for i in range(10000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)

# Normalize the trained Q matrix
print ("Trained Q matrix:")
print (Q / np.max(Q) * 100)

# Testing

# Goal state = 2

current_state = 1
steps = [current_state]

while current_state != 2:
    next_step_index = np.where(Q[current_state,:] == np.max(Q[current_state,:]))[0]

    if next_step_index.shape[0] > 1:
        next_step_index = int(np.random.choice(next_step_index, size=1))
    else:
        next_step_index = int(next_step_index)

    steps.append(next_step_index)
    current_state = next_step_index

# Print selected sequence of steps
print("Selected path:")
print(steps)