I want to do parameter estimation using reinforcement learning. Basically I have an ordinary differential equation (ODE) `y' = a*cos(w*t + d)*w`. The actual function is `y = a*sin(w*t + d)`, so I want to estimate `a`, `w` and `d`. I use `scipy.integrate.odeint` to solve this ODE.

In my program, `a`, `w` and `d` are initially generated at random within some range. After that the agent (the algorithm) takes an action; an action simply changes each of `a`, `w`, `d` by a fixed fraction (keep, increase, or decrease), so the agent can take one of 27 different actions. After the environment solves the ODE with `odeint()`, I compute the mean squared error as the loss and use the negative loss as the reward. If the loss is zero, the episode is done. The state tuple is `[a, w, d]`.

Please tell me whether I am heading in the right direction and what I should do next. I have one more doubt: in machine learning, after training we have a model and we have to test it somehow. How can we do that here, if it is possible at all?

I have two files, one for the environment and one for the agent.
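For reference, the 27 actions are just the 3 × 3 × 3 combinations of keep / increase by 10% / decrease by 10% applied to `(a, w, d)`. A small standalone sketch of that encoding (the helper `decode_action` is only illustrative and is not used in the files below; `step1` in the environment applies the same base-3 convention):

```python
def decode_action(action):
    """Map an action index 0..26 to the (a, w, d) step fractions.

    The index is read as base-3 digits in the order (a, w, d):
    0 -> keep fixed, 1 -> increase by 10%, 2 -> decrease by 10%.
    """
    deltas = (0.0, 0.1, -0.1)
    return (deltas[(action // 9) % 3],
            deltas[(action // 3) % 3],
            deltas[action % 3])

# e.g. action 13 has digits (1, 1, 1), so all three parameters grow by 10%
print(decode_action(13))  # (0.1, 0.1, 0.1)
```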
**environment**

```python
import math
import random

import numpy as np
from scipy.integrate import odeint


class solver:
    def __init__(self):
        self.reward = 0
        self.done = False
        self.a = random.uniform(1.5, 2.5)
        self.w = random.uniform(0.25, 0.75)
        self.d = random.uniform(0.5, 0.8)
        self.loss = 0.0
        self.start = 0
        self.stop = 24
        self.step = 0.2
        # evenly spaced time grid on [start, stop) with spacing 0.2
        self.time = np.arange(self.start, self.stop, self.step)
        self.y_true = np.zeros((len(self.time),), dtype=float)
        self.get_experiment_data()

    def get_experiment_data(self):
        # reference data from y = a*sin(w*t + d) with a = 2, w = 2*pi/12, d = pi/4
        for count, t in enumerate(self.time):
            self.y_true[count] = self.data_generator_sin(2, 2 * math.pi / 12, math.pi / 4, t)

    def data_generator_sin(self, a, w, d, t):
        return a * math.sin(w * t + d)

    def reset(self):
        self.a = random.uniform(1.5, 2.5)
        self.w = random.uniform(0.25, 0.75)
        self.d = random.uniform(0.5, 0.8)
        self.loss = 0.0
        self.done = False
        return [self.a, self.w, self.d]

    def step1(self, action):
        self.reward = 0
        self.done = 0
        # Each parameter is kept fixed (0), increased by 10% (1) or decreased
        # by 10% (2). The action index 0..26 encodes the three choices as
        # base-3 digits in the order (a, w, d): action = 9*digit_a + 3*digit_w + digit_d.
        deltas = (0.0, 0.1, -0.1)
        self.a += self.a * deltas[(action // 9) % 3]
        self.w += self.w * deltas[(action // 3) % 3]
        self.d += self.d * deltas[action % 3]
        self.run_frame()
        state = [self.a, self.w, self.d]
        return self.reward, state, self.done

    def dydx(self, y, t, a, w, d):
        # right-hand side of the ODE: y' = a*w*cos(w*t + d)
        return a * math.cos(w * t + d) * w

    def run_frame(self):
        parameters = (self.a, self.w, self.d)
        # Initial condition y(0) = a*sin(d), so the exact solution of the ODE is
        # y(t) = a*sin(w*t + d). With y0 = 0 the integrated curve is shifted by
        # -a*sin(d) and the loss could never reach zero, even at the true parameters.
        y0 = self.a * math.sin(self.d)
        odeint_solver_data = odeint(self.dydx, y0, self.time, parameters).flatten()
        self.loss = np.square(np.subtract(odeint_solver_data, self.y_true)).mean()
        self.reward = -self.loss
        # terminate the episode once the fit is close enough
        if self.loss < 0.5:
            self.done = 1
```
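A quick standalone sanity check of the environment (not part of either file, just a sketch that assumes the class above is saved as `environment.py`, as the agent's import expects): with the true parameters used in `get_experiment_data()` — `a = 2`, `w = 2*pi/12`, `d = pi/4` — the integrated curve should reproduce `y_true` and the loss should be close to zero.

```python
import math
from environment import solver

env = solver()
env.reset()

# Force the parameters that generated y_true and recompute the loss.
env.a, env.w, env.d = 2.0, 2 * math.pi / 12, math.pi / 4
env.run_frame()
print("MSE at the true parameters:", env.loss)  # expected to be ~0
```

If this MSE is not near zero, the problem is in the ODE setup (initial condition, time grid), not in the agent.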
**agent**
```python
import random
from collections import deque

import numpy as np
import matplotlib.pyplot as plt
from keras import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from environment import solver

env = solver()
np.random.seed(0)


class DQN:
    """Implementation of the deep Q-learning algorithm."""

    def __init__(self, action_space, state_space):
        self.action_space = action_space
        self.state_space = state_space
        self.epsilon = 1
        self.gamma = .95
        self.batch_size = 64
        self.epsilon_min = .01
        self.epsilon_decay = .995
        self.learning_rate = 0.01
        self.memory = deque(maxlen=100000)
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Dense(64, input_shape=(self.state_space,), activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(self.action_space, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # epsilon-greedy action selection
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self):
        if len(self.memory) < self.batch_size:
            return
        minibatch = random.sample(self.memory, self.batch_size)
        states = np.array([i[0] for i in minibatch])
        actions = np.array([i[1] for i in minibatch])
        rewards = np.array([i[2] for i in minibatch])
        next_states = np.array([i[3] for i in minibatch])
        dones = np.array([i[4] for i in minibatch])
        states = np.squeeze(states)
        next_states = np.squeeze(next_states)
        # Q-learning target: r + gamma * max_a' Q(s', a') for non-terminal transitions
        targets = rewards + self.gamma * np.amax(self.model.predict_on_batch(next_states), axis=1) * (1 - dones)
        targets_full = self.model.predict_on_batch(states)
        ind = np.arange(self.batch_size)
        targets_full[[ind], [actions]] = targets
        self.model.fit(states, targets_full, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


def train_dqn(episode):
    scores = []
    agent = DQN(27, 3)
    for e in range(episode):
        state = np.reshape(env.reset(), (1, 3))
        score = 0
        max_steps = 1000
        for i in range(max_steps):
            action = agent.act(state)
            reward, next_state, done = env.step1(action)
            score += reward
            next_state = np.reshape(next_state, (1, 3))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            agent.replay()
            if done:
                print("episode: {}/{}, score: {}".format(e, episode, score))
                break
        print("episode: {}/{}, state: {}, reward: {}".format(e, episode, state, reward))
        scores.append(score)
    return scores


if __name__ == '__main__':
    ep = 100
    scores = train_dqn(ep)
    plt.plot(range(ep), scores)
    plt.xlabel('episodes')
    plt.ylabel('reward')
    plt.show()
```
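On the testing question: the trained "model" here is the agent's Q-network, so one way to evaluate it is to switch off exploration (`epsilon = 0`) and let the trained agent run greedily from fresh random starts, then compare the final `(a, w, d)` against the true `(2, 2*pi/12, pi/4)` and look at the final MSE. A rough sketch, assuming `train_dqn` is changed to also return the agent (or the network is saved with `agent.model.save()` and reloaded); `test_dqn` and its arguments are only illustrative:

```python
import math
import numpy as np

def test_dqn(agent, env, episodes=10, max_steps=1000):
    """Run the trained agent greedily and report the parameters it ends up with."""
    agent.epsilon = 0.0  # greedy policy, no exploration
    for e in range(episodes):
        state = np.reshape(env.reset(), (1, 3))
        for _ in range(max_steps):
            action = agent.act(state)
            reward, next_state, done = env.step1(action)
            state = np.reshape(next_state, (1, 3))
            if done:
                break
        a, w, d = state[0]
        print("episode {}: a={:.3f}, w={:.3f}, d={:.3f}, final MSE={:.4f}"
              .format(e, a, w, d, env.loss))
    print("true parameters: a=2.000, w={:.3f}, d={:.3f}".format(2 * math.pi / 12, math.pi / 4))
```

In other words, instead of the usual train/test data split, the test is whether the greedy policy can drive a fresh random `(a, w, d)` close to the true parameters within the step budget.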