Python DQN for Atari with TensorFlow: training seems to get stuck
I am trying to train a deep Q-learning network to play Atari Breakout in TensorFlow. The code runs without problems, but always after about 1000-1200 episodes the time to execute a single step suddenly spikes to more than 100 seconds. This is my DQN:
class DQNetwork():
    def __init__(self, scope, state_size=(84, 84, 4), num_outputs=4, gamma=0.9, learning_rate=0.001):
        self.scope = scope
        with tf.variable_scope(self.scope):
            # ---------------------
            # Basic Deep Q-Network
            # ---------------------
            # Input is 84x84x4
            self.x = tf.placeholder(tf.float32, shape=[None, *state_size], name="inputs")
            self.conv1 = tf.layers.conv2d(inputs=self.x,
                                          filters=32,
                                          kernel_size=[8, 8],
                                          strides=[4, 4],
                                          padding="VALID",
                                          name="conv1",
                                          activation=tf.nn.relu)
            self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                          filters=64,
                                          kernel_size=[4, 4],
                                          strides=[2, 2],
                                          padding="VALID",
                                          name="conv2",
                                          activation=tf.nn.relu)
            self.conv3 = tf.layers.conv2d(inputs=self.conv2,
                                          filters=64,
                                          kernel_size=[3, 3],
                                          strides=[1, 1],
                                          padding="VALID",
                                          name="conv3",
                                          activation=tf.nn.relu)
            self.flatten = tf.layers.flatten(self.conv3)
            self.fc = tf.layers.dense(inputs=self.flatten,
                                      units=512,
                                      activation=tf.nn.relu,
                                      name="fc1")
            self.logits = tf.layers.dense(inputs=self.fc,
                                          units=num_outputs,
                                          activation=None)
            self.best_action = tf.argmax(self.logits, name="best_action", axis=1)
            self.max_q = tf.reduce_max(self.logits, name="max_q", axis=1)

            if scope == 'Target':
                self.rewards = tf.placeholder(tf.float32, shape=None, name="rewards")
                self.gamma = tf.constant(gamma, name="Gamma")
                self.done = tf.placeholder(tf.int32, shape=None, name="done_values")
                # TD target: r + gamma * max_a' Q(s', a'), with the bootstrap term masked out when done
                self.td_target = self.rewards + (self.gamma * self.max_q) * tf.cast(tf.abs(self.done - 1), tf.float32)

            if scope == 'Q':
                self.target_placeholder = tf.placeholder(tf.float32, shape=None, name="target_placeholder_q")
                self.actions = tf.placeholder(tf.uint8, shape=None, name="AllActions")
                self.actions_onehot = tf.one_hot(self.actions, depth=num_outputs, name="One_Hot")
                self.Q = tf.reduce_sum(tf.multiply(self.actions_onehot, self.logits))
                self.huber_loss = huber_loss(self.target_placeholder - self.Q)
                self.loss = tf.reduce_mean(self.huber_loss)
                self.optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate, epsilon=0.01)
                self.train = self.optimizer.minimize(self.loss, name="minimize")
Huber loss function:
def huber_loss(x, delta=1.0):
    """Reference: https://en.wikipedia.org/wiki/Huber_loss"""
    return tf.where(
        tf.abs(x) < delta,
        tf.square(x) * 0.5,
        delta * (tf.abs(x) - 0.5 * delta)
    )
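For reference, this function implements the standard piecewise Huber loss with threshold delta:

L_\delta(x) = \begin{cases} \tfrac{1}{2}x^2 & \text{if } |x| < \delta \\ \delta\,(|x| - \tfrac{1}{2}\delta) & \text{otherwise} \end{cases}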
Replay buffer:
class ReplayBuffer():
    def __init__(self, buffer_size):
        self.buffer = deque([], maxlen=buffer_size)

    def add(self, new_state):
        if len(new_state) != 5:
            raise Exception("States must have: state, action, reward, next_state, done")
        self.buffer.append(new_state)

    def sample(self, batch_size):
        # r is the random module (import random as r)
        return r.sample(self.buffer, batch_size)
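Not part of the original post, but as a quick sanity check, here is a minimal usage sketch of this buffer. It assumes the imports the class relies on (collections.deque and random imported as r) and uses dummy transitions, just to show what sample() returns:

    from collections import deque
    import random as r
    import numpy as np

    demo_buffer = ReplayBuffer(buffer_size=1000)
    dummy_frame = np.zeros((84, 84, 4), dtype=np.uint8)

    # store a few fake transitions: [state, action, reward, next_state, done]
    for _ in range(100):
        demo_buffer.add([dummy_frame, 0, 0.0, dummy_frame, False])

    batch = demo_buffer.sample(32)                                   # list of 32 transitions
    states, actions, rewards, next_states, dones = map(list, zip(*batch))
    print(np.array(states).shape)                                    # (32, 84, 84, 4)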
Update target network:
def get_update_target_ops(Q_network, Target_network):
    # 1. get the trainable variables per network
    Q_trainable = tf.trainable_variables(scope=Q_network.scope)
    Target_trainable = tf.trainable_variables(scope=Target_network.scope)
    # 2. sort them by name (attrgetter comes from: from operator import attrgetter)
    Q_trainable = sorted(Q_trainable, key=attrgetter("name"))
    Target_trainable = sorted(Target_trainable, key=attrgetter("name"))
    # 3. create a list with all assign ops (copy Q weights into the target network)
    update_target_expr = []
    for q_var, t_var in zip(Q_trainable, Target_trainable):
        update_target_expr.append(t_var.assign(q_var))
    return update_target_expr
Epsilon-greedy action selection:
def choose_egreedy_action(session, epsilon, network, state):
    # n (number of actions) and obs_space (observation shape) are globals
    state = np.float32(state / 255.0)
    if np.random.rand() <= epsilon:
        return np.random.randint(0, n)
    else:
        state_reshaped = np.reshape(state, (1, *obs_space))
        best_action = session.run(network.best_action, feed_dict={network.x: state_reshaped})[0]
        return best_action
Frame preprocessing and hyperparameters:
def preprocess_frame(obs):
    processed_observe = np.uint8(
        resize(rgb2gray(obs), (84, 84), mode='constant') * 255)
    return processed_observe

EPISODES = 50000
epsilon = 1.
epsilon_start, epsilon_end = 1.0, 0.1
exploration_steps = 1000000.
epsilon_decay_step = (epsilon_start - epsilon_end) / exploration_steps
batch_size = 32
train_start = 50000
update_target_rate = 10000
gamma = 0.99
buffer_size = 400000
no_op_steps = 30
global_steps = 10000000
Initialize networks:
tf.reset_default_graph()
Q_network = DQNetwork(scope="Q", state_size=(84, 84, 4), num_outputs=4, gamma=gamma, learning_rate=0.00025)
T_network = DQNetwork(scope="Target",state_size=(84, 84, 4), num_outputs=4, gamma=gamma, learning_rate=0.00025)
update_target_network = get_update_target_ops(Q_network,T_network)
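The training loop below also uses a LinearSchedule helper whose definition is not included in the post. Purely as an assumption about what it might look like, given how it is called (LinearSchedule(epsilon_start, epsilon_end, train_start, epsilon_decay_step) and .value(global_step)), here is a minimal sketch of a linear epsilon decay that starts after train_start steps and stops at epsilon_end:

    # NOTE: hypothetical reconstruction -- the original LinearSchedule class is not shown in the post.
    class LinearSchedule:
        def __init__(self, start, end, delay_steps, decay_per_step):
            self.start = start                # initial epsilon (e.g. 1.0)
            self.end = end                    # final epsilon (e.g. 0.1)
            self.delay_steps = delay_steps    # keep epsilon at start until training begins
            self.decay_per_step = decay_per_step

        def value(self, t):
            if t < self.delay_steps:
                return self.start
            # decay linearly, never going below the final value
            return max(self.end, self.start - (t - self.delay_steps) * self.decay_per_step)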
Training loop:
def train(sess, Q, Target, buffer, batch_size):
    # 1. sample a mini-batch from the replay buffer
    mini_batches = buffer.sample(batch_size)
    observations, actions, rewards, next_observations, done = map(list, zip(*mini_batches))
    observations = np.array(observations) / 255.
    next_observations = np.array(next_observations) / 255.
    # 2. compute the TD targets with the target network
    td_targets = sess.run(Target.td_target,
                          feed_dict={Target.x: next_observations, Target.rewards: rewards, Target.done: done})
    # 3. one gradient step on the Q network
    max_q, loss, _ = sess.run([Q.max_q, Q.loss, Q.train],
                              feed_dict={Q.x: observations, Q.target_placeholder: td_targets, Q.actions: actions})
    return loss
from tqdm import trange
import random

game = gym.make('BreakoutDeterministic-v4')
scores, episodes = [], []

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    buffer = ReplayBuffer(buffer_size)
    epsilon_schedule = LinearSchedule(epsilon_start, epsilon_end, train_start, epsilon_decay_step)

    done = False
    dead = False
    # 1 episode = 5 lives
    step, score, start_life = 0, 0, 5
    observe = game.reset()
    eps = 0

    # this is one of DeepMind's ideas:
    # just do nothing at the start of the episode to avoid sub-optimal starts
    for _ in range(random.randint(1, no_op_steps)):
        observe, _, _, _ = game.step(1)

    # at the start of an episode there is no preceding frame,
    # so just copy the initial state to build the history
    state = preprocess_frame(observe)
    history = np.stack((state, state, state, state), axis=2)
    history = np.reshape([history], (84, 84, 4))

    loss = 0
    for global_step in trange(global_steps):
        step += 1
        # get action for the current history and go one step in the environment
        action = choose_egreedy_action(sess, epsilon, Q_network, history)
        observe, reward, done, info = game.step(action)

        # pre-process the observation --> history
        next_state = preprocess_frame(observe)
        next_state = np.reshape([next_state], (84, 84, 1))
        next_history = np.append(next_state, history[:, :, :3], axis=2)

        # if the agent missed the ball, the agent is dead --> but the episode is not over
        if start_life > info['ale.lives']:
            dead = True
            start_life = info['ale.lives']
            # if dead:
            #     reward = -1

        reward = np.clip(reward, -1., 1.)

        # save the sample <s, a, r, s'> to the replay memory
        buffer.add([history, action, reward, next_history, dead])

        epsilon = epsilon_schedule.value(global_step)

        if global_step > train_start:
            train(sess, Q_network, T_network, buffer, batch_size)

        # update the target model with the model
        if global_step % update_target_rate == 0:
            sess.run(update_target_network)

        score += reward

        # if the agent is dead, then reset the history
        if dead:
            dead = False
        else:
            history = next_history

        # if done, log the score over episodes
        if done:
            if eps % 100 == 0:
                print("episode:", eps, " score:", score, " global_step: ", global_step,
                      " epsilon: ", epsilon)
            scores.append(score)
            episodes.append(step)

            done = False
            dead = False
            # 1 episode = 5 lives
            step, score, start_life = 0, 0, 5
            observe = game.reset()
            eps += 1

            # do nothing at the start of the episode (DeepMind's no-op trick)
            for _ in range(random.randint(1, no_op_steps)):
                observe, _, _, _ = game.step(1)

            # copy the initial state to build the history again
            state = preprocess_frame(observe)
            history = np.stack((state, state, state, state), axis=2)
            history = np.reshape([history], (84, 84, 4))

        if global_step % 5000 == 0:
            saver.save(sess, 'models/breakout/model_breakout.ckpt')
As you can see, the time per iteration increases drastically, so training appears to be stuck.
I don't know whether this is a problem with my GPU or with my code.
Do you have any idea what could cause this? Any ideas would be helpful! :)

I had the same experience when training an RL algorithm on the Atari Breakout environment in OpenAI gym. It happened after my exploration rate had decayed to a very low value. I found the solution in this post:

I knew my problem was similar to the one described there because, when I rendered the game frames for each episode, I saw that after losing a life in Atari Breakout the ball disappears (the game is paused). The reason it is not paused at the start of training is that the exploration rate is still high: while the game is paused the agent takes random actions, which sooner or later makes it pick the action that restarts the game.

=================

I think the following might solve your problem, but I am not sure (and it is generally not recommended in RL). See this post:

The question in that post mentions that the algorithm sets the reward to -1 when the agent loses a life. It detects the lost life through the system info returned by the call to .step(). My suggestion is that, when you detect a lost life in Atari Breakout, you hard-code that the agent must choose the action that restarts the game on the next step.

Again, this approach is not recommended in RL, because the agent should only use the observations when deciding which action to take.
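To make the suggestion concrete (this sketch is not from the original answer): one way to hard-code the restart is to force the FIRE action (action 1 in Breakout) on the step right after a life is lost, instead of asking the epsilon-greedy policy. Roughly, the inner loop above could be changed like this; treat it as an illustration under the assumption that action 1 relaunches the ball, not a verified fix:

    # Sketch only: force FIRE (action 1) right after a life is lost, so the ball
    # is relaunched even when epsilon is small and the policy never picks FIRE.
    fire_next_step = False

    for global_step in trange(global_steps):
        step += 1

        if fire_next_step:
            action = 1                      # FIRE restarts the ball in Breakout
            fire_next_step = False
        else:
            action = choose_egreedy_action(sess, epsilon, Q_network, history)

        observe, reward, done, info = game.step(action)

        # detect the lost life exactly as in the original loop
        if start_life > info['ale.lives']:
            dead = True
            start_life = info['ale.lives']
            fire_next_step = True           # relaunch the ball on the next step

        # ... rest of the loop unchanged ...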