Machine learning 如何正确实现DQN算法
我试图实现DeepMind在本文中介绍的深度Q学习算法: 我用它来做一个学习打乒乓球的经纪人,但它似乎不起作用(即使经过2小时的训练,我也看不到任何改善)。这是代码Machine learning 如何正确实现DQN算法,machine-learning,deep-learning,artificial-intelligence,reinforcement-learning,openai-gym,Machine Learning,Deep Learning,Artificial Intelligence,Reinforcement Learning,Openai Gym,我试图实现DeepMind在本文中介绍的深度Q学习算法: 我用它来做一个学习打乒乓球的经纪人,但它似乎不起作用(即使经过2小时的训练,我也看不到任何改善)。这是代码 import gym import universe import numpy as np import keras from keras.models import Sequential from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Activati
import gym
import universe
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Activation
from keras.models import load_model
import random
env = gym.make('gym-core.Pong-v0')
env.configure(remotes=1)
def num2str(number, obs):
number = np.argmax(number)
if number == 0:
action = [[('KeyEvent', 'ArrowRight', False), ('KeyEvent', 'ArrowLeft', True)] for ob in obs]
elif number == 1:
action = [[('KeyEvent', 'ArrowLeft', False), ('KeyEvent', 'ArrowRight', True)] for ob in obs]
return action
def preprocess(original_obs):
obs = original_obs
obs = np.array(obs)[0]['vision']
obs = np.delete(obs, np.s_[195:769], axis=0)
obs = np.delete(obs, np.s_[0:35], axis=0)
obs = np.delete(obs, np.s_[160:1025], axis=1)
obs = np.mean(obs, axis=2)
obs = obs[::2,::2]
obs = np.reshape(obs, (80, 80, 1))
return obs
model = Sequential()
model.add(Conv2D(32, kernel_size = (8, 8), strides = (4, 4), border_mode='same', activation='relu', init='uniform', input_shape = (80, 80, 4)))
model.add(MaxPooling2D(pool_size = (2, 2)))
model.add(Conv2D(64, kernel_size = (2, 2), strides = (2, 2)))
model.add(Conv2D(64, kernel_size = (3, 3), strides = (1, 1)))
model.add(Flatten())
model.add(Dense(256, init='uniform', activation='relu'))
model.add(Dense(2, init='uniform', activation='linear'))
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
init_observe_time = 500
D = []
e = 1.0
e_threshold = 0.05
e_decay = 0.01
gamma = 0.99
batch_size = 15
frequency = 10
Q_values = np.array([0, 0])
obs = env.reset()
while True:
obs = env.step(num2str(np.array([random.randint(0, 1) for i in range(0, 2)]), obs))[0]
if obs != [None]:
break
x_t1 = preprocess(obs)
s_t1 = np.stack((x_t1, x_t1, x_t1, x_t1), axis = 2)
s_t1 = np.reshape(s_t1, (80, 80, 4))
t = 0
while True:
print("Time since last start: ", t)
a_t = np.zeros(2)
if random.random() < e:
a_index = random.randint(0, 1)
a_t[a_index] = 1
else:
Q_values = model.predict(np.array([s_t1]))[0]
a_index = np.argmax(Q_values)
a_t[a_index] = 1
print("Q Values: ", Q_values)
print("action taken: ", np.argmax(a_t))
print("epsilon: ", e)
if e > e_threshold:
e -= e_decay
obs, r_t, done, info = env.step(num2str(a_t, obs))
if obs == [None]:
continue
x_t2 = preprocess(obs)
print(x_t2.shape, s_t1[:,:,0:3].shape)
s_t2 = np.append(x_t2, s_t1[:,:,0:3], axis = 2)
D.append((s_t1, a_t, r_t, s_t2, done))
if t > init_observe_time and t%frequency == 0:
minibatch = random.sample(D, batch_size)
s1_batch = [i[0] for i in minibatch]
a_batch = [i[1] for i in minibatch]
r_batch = [i[2] for i in minibatch]
s2_batch = [i[3] for i in minibatch]
q_batch = model.predict(np.array(s2_batch))
y_batch = np.zeros((batch_size, 2))
y_batch = model.predict(np.array(s1_batch))
print("Q batch: ", q_batch)
print("y batch: ", y_batch)
for i in range(0, batch_size):
if (minibatch[i][4]):
y_batch[i][np.argmax(a_batch[i])] = r_batch[i][0]
else:
y_batch[i][np.argmax(a_batch[i])] = r_batch[i][0] + gamma * np.max(q_batch[i])
model.train_on_batch(np.array(s1_batch), y_batch)
s_t1 = s_t2
t += 1
env.render()
导入健身房
导入宇宙
将numpy作为np导入
进口干酪
从keras.models导入顺序
从keras.layers导入稠密、Conv2D、MaxPoolig2D、展平、激活
从keras.models导入负载_模型
随机输入
env=gym.make('gym-core.Pong-v0'))
环境配置(远程=1)
def num2str(编号,obs):
编号=np.argmax(编号)
如果数字==0:
obs中ob的操作=[('KeyEvent','ArrowRight',False),('KeyEvent','ArrowLeft',True)]]
elif编号==1:
obs中ob的操作=[('KeyEvent','ArrowLeft',False),('KeyEvent','ArrowRight',True)]]
返回动作
def预处理(原始obs):
obs=原始的_obs
obs=np.array(obs)[0]['vision']
obs=np.delete(obs,np.s_[195:769],轴=0)
obs=np.delete(obs,np.s[0:35],轴=0)
obs=np.delete(obs,np.s.[160:1025],轴=1)
obs=np.平均值(obs,轴=2)
obs=obs[::2,::2]
obs=np.整形(obs,(80,80,1))
返回obs
模型=顺序()
add(Conv2D(32,内核大小=(8,8),步幅=(4,4),边框模式=(相同),激活=(relu),初始化=(统一),输入形状=(80,80,4)))
add(MaxPooling2D(池大小=(2,2)))
add(Conv2D(64,内核大小=(2,2),步幅=(2,2)))
add(Conv2D(64,内核大小=(3,3),步幅=(1,1)))
model.add(展平())
添加(密集(256,init='uniform',activation='relu'))
添加(密集(2,init='uniform',activation='linear'))
compile(loss='mse',optimizer='adam',metrics=['accurity'])
初始观察时间=500
D=[]
e=1.0
e_阈值=0.05
e_衰减=0.01
伽马=0.99
批次大小=15
频率=10
Q_values=np.array([0,0])
obs=env.reset()
尽管如此:
obs=env.step(num2str(np.array([random.randint(0,1))表示范围(0,2)中的i),obs))[0]
如果obs!=[无]:
打破
x_t1=预处理(obs)
s_t1=np.stack((x_t1,x_t1,x_t1,x_t1),轴=2)
s_t1=np.重塑(s_t1,(80,80,4))
t=0
尽管如此:
打印(“自上次启动以来的时间:”,t)
a_t=np.零(2)
如果是random.random()e_阈值:
e-=e_衰变
obs,r_t,done,info=env.step(num2str(a_t,obs))
如果obs==[无]:
持续
x_t2=预处理(obs)
打印(x_t2.shape,s_t1[:,:,0:3]。shape)
附加(x_t2,s_t1[:,:,0:3],轴=2)
D.附加((s_t1,a_t,r_t,s_t2,完成))
如果t>初始观察时间和t%频率==0:
小批量=随机样本(D,批量大小)
s1_批次=[i[0]表示迷你批次中的i]
a_batch=[i[1]表示minibatch中的i]
r_batch=[i[2]表示小批量中的i]
s2_批次=[i[3]表示小批次中的i]
q_batch=model.predict(np.array(s2_batch))
y_批次=np.零((批次大小,2))
y_批次=模型预测(np.数组(s1_批次))
打印(“Q批:”,Q_批)
打印(“y批:,y_批)
对于范围内的i(0,批次大小):
如果(小批量[i][4]):
y_批次[i][np.argmax(a_批次[i])]=r_批次[i][0]
其他:
y_批次[i][np.argmax(a_批次[i])]=r_批次[i][0]+gamma*np.max(q_批次[i])
批量(np.数组(s1批量)、y批量)上的模型训练
s_t1=s_t2
t+=1
env.render()
有人对如何使其正常工作有任何建议吗?- 您的第二个和第三个
层似乎缺少其Conv2D
激活relu
- 你的
(或epsilon
)衰减太快了。仅经过95个时间步后,它将降至e
。我无法很快找到他们在2013年的论文中做了什么,但在2015年的论文中,他们将其从0.05
衰减到1
超过100万帧0.1
- 您的第二个和第三个
层似乎缺少其Conv2D
激活relu
- 你的
(或epsilon
)衰减太快了。仅经过95个时间步后,它将降至e
。我无法很快找到他们在2013年的论文中做了什么,但在2015年的论文中,他们将其从0.05
衰减到1
超过100万帧0.1