I tried this Q-learning code in a Jupyter notebook and it ran fine with no errors. I then tried the same code on AWS, again in a Jupyter notebook, loading the data from an S3 bucket. There I keep getting an IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices. I would really appreciate any help resolving this.
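For context, location_to_state and rewards are defined in earlier cells (the rewards matrix is what I load from S3). A hypothetical stand-in along these lines is enough to run the snippet below; the actual values differ:

import numpy as np

# Hypothetical stand-in for the data loaded from S3:
# 11 locations L1..L11 mapped to states 0..10
location_to_state = {'L' + str(i + 1): i for i in range(11)}

# Placeholder 11x11 rewards matrix: 1 where two locations are directly connected, 0 otherwise
rewards = np.zeros((11, 11), dtype=int)
for i in range(10):  # simple chain L1-L2-...-L11 as a placeholder layout
    rewards[i, i + 1] = 1
    rewards[i + 1, i] = 1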
# Maps indices to locations
state_to_location = dict((state, location) for location, state in location_to_state.items())

# Initialize parameters
gamma = 0.9  # Discount factor
alpha = 0.3  # Learning rate

def get_optimal_route(start_location, end_location):
    # Copy the rewards matrix to a new matrix
    rewards_new = np.copy(rewards)
    # Get the ending state corresponding to the ending location as given
    ending_state = location_to_state[end_location]
    # With the above information automatically set the priority of
    # the given ending state to the highest one
    rewards_new["ending_state","ending_state"] = 999

    # -----------Q-Learning algorithm-----------
    # Initializing Q-Values
    Q = np.array(np.zeros([11,11]))
    print("Initialized Q matrix")
    print(Q)

    for i in range(10000):
        # Pick up a state randomly
        current_state = np.random.randint(0,11)  # Python excludes the upper bound
        # For traversing through the neighbor locations in the maze
        playable_actions = []
        # Iterate through the new rewards matrix and get the actions > 0
        for j in range(11):
            if rewards_new[current_state,j] > 0:
                playable_actions.append(j)
        # Pick an action randomly from the list of playable actions
        # leading us to the next state
        next_state = np.random.choice(playable_actions)
        # Compute the temporal difference
        # The action here exactly refers to going to the next state
        TD = rewards_new[current_state,next_state] + gamma * Q[next_state, np.argmax(Q[next_state,])] - Q[current_state,next_state]
        # Update the Q-Value using the Bellman equation
        Q[current_state,next_state] += alpha * TD

    # Initialize the optimal route with the starting location
    route = [start_location]
    # We do not know the next location yet, so initialize it with the starting location
    next_location = start_location

    # We don't know the exact number of iterations needed to reach the final
    # location, hence a while loop is a good choice for iterating
    while(next_location != end_location):
        # Fetch the starting state
        starting_state = location_to_state[start_location]
        # Fetch the highest Q-value pertaining to the starting state
        next_state = np.argmax(Q[starting_state,])
        # We got the index of the next state. But we need the corresponding letter.
        next_location = state_to_location[next_state]
        route.append(next_location)
        # Update the starting location for the next iteration
        start_location = next_location

    print("Trained Q matrix")
    #matrix = Q / np.max(Q) * 100
    #print(matrix)
    #print("Optimal route")

    return route

print(get_optimal_route('L1','L11'))
IndexError                                Traceback (most recent call last)
<ipython-input-236-677dc76cecf2> in <module>()
----> 1 print(get_optimal_route('L1','L11'))

<ipython-input-235-59580ef2070d> in get_optimal_route(start_location, end_location)
     10     # With the above information automatically set the priority of
     11     # the given ending state to the highest one
---> 12     rewards_new["ending_state","ending_state"] = 999
     13
     14     # -----------Q-Learning algorithm-----------

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
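The error does not seem to depend on my S3 data: I can reproduce the same message with a plain numpy array, while integer indices are accepted:

import numpy as np

rewards_new = np.zeros((11, 11))

ending_state = 10
rewards_new[ending_state, ending_state] = 999        # integer indices: works

rewards_new["ending_state", "ending_state"] = 999    # string indices: raises the same IndexError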