# NOTE: I applied the same principle to my own data set, but it did not work:
# it ran without errors, yet produced no results.
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime as dt
from numpy import random
import pandas as pd
import random
from sklearn.model_selection import train_test_split
# Load the price series (Walmart 'WMRT' and Dow Jones 'DJ' columns) from Excel.
df1 = pd.read_excel('new.xlsx')
# NOTE(review): read_excel already returns a DataFrame, so this wrap is a no-op.
df1 = pd.DataFrame(df1)
# NOTE(review): the next two expressions discard their results — they only
# display something in an interactive session, not in a script.
df1.head(8)
df1['WMRT'][0]
# Discretization of the Walmart series: +1 on a day-over-day price increase,
# -1 otherwise.  wal[0] stays 0 — there is no previous day to compare against.
# (Indentation of the loop body was broken in the original paste; repaired.)
wal = np.zeros(df1.shape[0])
for i in range(1, df1.shape[0]):
    if df1['WMRT'][i] > df1['WMRT'][i - 1]:
        wal[i] = 1
    else:
        wal[i] = -1
df1['wal'] = wal
print(df1['wal'])
# Discretization of the Dow Jones series: +1 on a day-over-day increase,
# -1 otherwise.  djd[0] stays 0 — there is no previous day to compare against.
# (Indentation of the loop body was broken in the original paste; repaired.)
djd = np.zeros(df1.shape[0])
for i in range(1, df1.shape[0]):
    if df1['DJ'][i] > df1['DJ'][i - 1]:
        djd[i] = 1
    else:
        djd[i] = -1
df1['DDJ'] = djd
print(df1['DDJ'])
# Encode the joint market state from the Dow Jones and Walmart moves:
#   1: (DJ down, WMT down)   2: (DJ up, WMT down)
#   3: (DJ down, WMT up)     4: (DJ up, WMT up)
# state[0] stays 0 — no move is defined for the first day.
# (Indentation of the loop body was broken in the original paste; repaired.)
state = np.zeros(df1.shape[0])
for i in range(1, df1.shape[0]):
    if df1['DDJ'][i] == -1 and df1['wal'][i] == -1:
        state[i] = 1
    elif df1['DDJ'][i] == 1 and df1['wal'][i] == -1:
        state[i] = 2
    elif df1['DDJ'][i] == -1 and df1['wal'][i] == 1:
        state[i] = 3
    else:
        state[i] = 4
df1['state'] = state
print(df1['state'])
# Training phase (the full sample holds i = 4616 observations).
# Seed the RNG so the same action sample is generated on every run.
# NOTE(review): `import random` (stdlib) shadows the earlier
# `from numpy import random`, so this seeds Python's RNG, not NumPy's.
random.seed(771)
# (The random initialization of the first 100 actions happens in main().)
def number_update(action, p0, n, states=None):
    """Cumulative visit counts per (state, action) bin.

    Bin index k = 2*(state-1) + action, giving 8 bins for the 4 states x 2
    actions.  Row i carries the counts accumulated over steps p0..i.

    Args:
        action: sequence of actions (0 or 1), indexable by step.
        p0: first step to count (rows before p0 stay zero).
        n: number of rows in the result (steps 0..n-1).
        states: sequence of states in 1..4; defaults to the module-level
            ``state`` array, preserving the original behavior.

    Returns:
        (n, 8) array of cumulative counts.
    """
    if states is None:
        states = state  # module-level default, as in the original script
    N = np.zeros((n, 8))
    for i in range(p0, n):
        # Carry the previous cumulative row forward, then bump this step's bin.
        N[i, :] = N[i - 1, :]
        k = int(2 * (states[i] - 1) + action[i])
        N[i, k] += 1
    return N
def Rhoo(action, p0, n, states=None, data=None):
    """Cumulative reward per (state, action) bin.

    Even bins (action 0) earn the Walmart daily return, odd bins (action 1)
    the Dow Jones daily return, where the return is price[i]/price[i-1] - 1.

    Args:
        action: sequence of actions (0 or 1), indexable by step.
        p0: first step to accumulate (rows before p0 stay zero).
        n: number of rows in the result.
        states: sequence of states in 1..4; defaults to the module-level
            ``state`` array (original behavior).
        data: frame with 'WMRT' and 'DJ' columns; defaults to the module-level
            ``df1`` (original behavior).

    Returns:
        (n, 8) array of cumulative rewards.
    """
    if states is None:
        states = state
    if data is None:
        data = df1
    RHO = np.zeros((n, 8))
    r = np.zeros(n)
    for i in range(p0, n):
        RHO[i, :] = RHO[i - 1, :]
        k = int(2 * (states[i] - 1) + action[i])
        # Even k <=> action 0 <=> Walmart; odd k <=> action 1 <=> Dow Jones.
        if k % 2 == 0:
            r[i] = data['WMRT'][i] / data['WMRT'][i - 1] - 1
        else:
            r[i] = data['DJ'][i] / data['DJ'][i - 1] - 1
        RHO[i, k] += r[i]
    return RHO
# Average reward for each action 0 and 1.
def reward(N, RHO, p0, n):
    """Per-step average reward of each action, summed over the 4 states.

    R[i, 0] sums RHO/N over the even bins (action 0), R[i, 1] over the odd
    bins (action 1), using only bins visited at least once by step i.

    BUG FIXED: the original wrote ``R[i, a] = 0`` whenever a bin had a zero
    count, which *reset* the running sum and silently discarded the
    contributions of earlier (visited) bins.  Unvisited bins are now simply
    skipped; R starts at zero, so no explicit reset is needed.

    Args:
        N: (n, 8) cumulative visit counts (see number_update).
        RHO: (n, 8) cumulative rewards (see Rhoo).
        p0: first row to fill (rows before p0 stay zero).
        n: number of rows.

    Returns:
        (n, 2) array of average rewards per action.
    """
    R = np.zeros((n, 2))
    for i in range(p0, n):
        for j in range(0, 8, 2):  # j even -> action 0; j+1 odd -> action 1
            if N[i, j] != 0:
                R[i, 0] += RHO[i, j] / N[i, j]
            if N[i, j + 1] != 0:
                R[i, 1] += RHO[i, j + 1] / N[i, j + 1]
    return R
def reward_max(action, p0, n, states=None, data=None):
    """Per-step realized reward of the chosen actions.

    Same reward rule as Rhoo, but returns the raw per-step return instead of
    accumulating it into bins: even bins (action 0) earn the Walmart daily
    return, odd bins (action 1) the Dow Jones daily return.

    Args:
        action: sequence of actions (0 or 1), indexable by step.
        p0: first step to evaluate (entries before p0 stay zero).
        n: length of the result.
        states: sequence of states in 1..4; defaults to the module-level
            ``state`` array (original behavior).
        data: frame with 'WMRT' and 'DJ' columns; defaults to the module-level
            ``df1`` (original behavior).

    Returns:
        (n,) array of per-step returns.
    """
    if states is None:
        states = state
    if data is None:
        data = df1
    r = np.zeros(n)
    for i in range(p0, n):
        k = int(2 * (states[i] - 1) + action[i])
        if k % 2 == 0:
            r[i] = data['WMRT'][i] / data['WMRT'][i - 1] - 1
        else:
            r[i] = data['DJ'][i] / data['DJ'][i - 1] - 1
    return r
def main():
    """Greedy policy over (state, action) bins on the price data.

    Phase 1 (steps 0..99): take random actions to seed the statistics.
    Phase 2 (steps 100..2264): pick each day's action from the running
    average rewards, then fold that day's outcome back into the statistics
    so the next decision sees them.

    BUG FIXED: the original computed ``R = reward(N, RHO, 1, 100)`` (only
    100 rows) and then indexed ``R[i-1]`` for i up to 2264 — an IndexError
    on the second iteration.  The loop now maintains running per-bin counts
    and reward sums so the decision statistics always cover step i-1.
    """
    n_total = 2265
    action = np.zeros(df1.shape[0])
    # Hold-out split (created but not otherwise used yet).
    train, test = train_test_split(df1, test_size=0.3)
    # Reproducible random actions (stdlib RNG).
    random.seed(771)
    # Phase 1: 100 random actions.
    for i in range(100):
        action[i] = random.randint(0, 1)
    print(action)
    print(type(action))
    # N: visit counts per bin; RHO: cumulative rewards; R: average rewards.
    print("N")
    N = number_update(action, 1, 100)
    print(N)
    print("RHO")
    RHO = Rhoo(action, 1, 100)
    print(RHO)
    print("R")
    R = reward(N, RHO, 1, 100)
    print(R)
    # Phase 2: greedy decisions with incremental statistics.
    counts = N[99].copy()  # visits per bin after the exploration phase
    sums = RHO[99].copy()  # cumulative reward per bin after exploration
    for i in range(100, n_total):
        # Average reward per bin (unvisited bins contribute 0, matching
        # reward()), summed per action over the 4 states.
        avg = np.where(counts > 0, sums / np.maximum(counts, 1), 0.0)
        r0 = avg[0::2].sum()  # action 0 (even bins)
        r1 = avg[1::2].sum()  # action 1 (odd bins)
        # NOTE(review): choosing action 1 when action 0's average reward is
        # larger looks inverted, but it reproduces the original condition —
        # confirm the intended mapping of R's columns to actions.
        if r0 > r1:
            action[i] = 1
        else:
            action[i] = 0
        # Fold this step's outcome into the running statistics.
        k = int(2 * (state[i] - 1) + action[i])
        if k % 2 == 0:
            step_reward = df1['WMRT'][i] / df1['WMRT'][i - 1] - 1
        else:
            step_reward = df1['DJ'][i] / df1['DJ'][i - 1] - 1
        counts[k] += 1
        sums[k] += step_reward
    # Final summary tables over the decision window, as in the original.
    N = number_update(action, 100, n_total)
    RHO = Rhoo(action, 100, n_total)
    R = reward(N, RHO, 100, n_total)
    print(action[100:n_total])
    print(R.shape)


if __name__ == "__main__":
    main()