Я пытаюсь разработать алгоритм q-обучения для обучения с подкреплением, это мой код:
import numpy as np
R = np.matrix ([[-1, 0, -1, -1, 0, -1, -1, -1, -1],
[-1, -1, 100, 0, -1, -1, -1, -1, -1],
[-1, -1, 100, -1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, 100, 0, -1, -1],
[-1, -1, -1, -1, -1, 100, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, -1, 100, 0],
[-1, -1, -1, -1, -1, -1, -1, 100, -1],
[-1, -1, -1, -1, -1, -1, -1, -1, -1]])
# Q matrix
Q = np.matrix(np.zeros([9,9]))
# Gamma (learning parameter)
gamma = 0.4
# Initial state. (Usually to be chosen at random)
initial_state = 1
# This function returns all available actions in the state given as an argument
def available_actions(state):
current_state_row = R[state,]
av_act = np.where(current_state_row >= 0) [1]
return av_act
# Get available actions in the current state
available_act = available_actions(initial_state)
# This function chooses at random which action to be performed within the range of all the available actions.
def sample_next_action(available_actions_range):
next_action = int(np.random.choice(available_act, 1))
return next_action
#sample next action to be performed
action = sample_next_action(available_act)
# This function updates the Q matrix according to the path selected and the Q learning algorithm
def update(current_state, action, gamma):
max_index = np.where(Q[action,] == np.max(Q[action,]))[1]
if max_index.shape[0] > 1:
max_index = int(np.random.choice(max_index, size=1))
else:
max_index = int(max_index)
max_value = Q[action, max_index]
# Q learning formula
Q[current_state, action] = R[current_state, action] + gamma * max_value
# Update Q matrix
update(initial_state, action, gamma)
# Training
# Train over 10000 iterations. (Re-iterate the process above)
for i in range(10000):
current_state = np.random.randint(0, int(Q.shape[0]))
available_act = available_actions(current_state)
action = sample_next_action(available_act)
update(current_state, action, gamma)
# Normalize the trained Q matrix
print ("Trained Q matrix:")
print (Q / np.max(Q) * 100)
# Testing
# Goal state = 2
current_state = 1
steps = [current_state]
while current_state != 2:
next_step_index = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]
if next_step_index.shape[0] > 1:
next_step_index = int(np.random.choice(next_step_index, size=1))
else:
next_step_index = int(next_step_index)
steps.append(next_step_index)
current_state = next_step_index
# Print selected sequence of steps
print("Selected path:")
print(steps)
, но у меня всегда есть эта ошибка, которую я не понимал:
ValueError Traceback (последний вызов был последним) в 46 current_state = np.random.randint (0, int (Q.shape [0])) 47 available_act = available_actions (current_state) ---> 48 action = sample_next_action (available_act) 49 update(current_state, action, gamma) 50
в sample_next_action (available_actions_range) 19 # Эта функция случайным образом выбирает, какое действие следует выполнить в диапазоне всех доступных действий. 20 def sample_next_action (available_actions_range): ---> 21 next_action = int (np.random.choice (available_act, 1)) 22 return next_action 23
mtrand.pyx в mtrand.RandomState.choice ()
ValueError: 'a' не может быть пустым, если не взяты образцы
любая помощь, пожалуйста!