
Python DQN Atari with TensorFlow: training seems to get stuck

Tags: python, tensorflow, deep-learning, reinforcement-learning

I am trying to train a deep Q-learning network to play Atari Breakout in TensorFlow. The code runs without problems, but always after 1000-1200 episodes the time needed to execute a single step spikes to more than 100 seconds. Here is my DQN:

import tensorflow as tf

class DQNetwork():
    def __init__(self, scope, state_size=(84, 84, 4), num_outputs=4, gamma=0.9, learning_rate=0.001):
        
        self.scope = scope
        

        with tf.variable_scope(self.scope):

            # ---------------------
            # Basic Deep Q-Network
            # ---------------------
            self.x = tf.placeholder(tf.float32, shape=[None, *state_size], name="inputs")
            
            # Input is 84x84x4
            self.conv1 = tf.layers.conv2d(inputs=self.x,
                                          filters=32,
                                          kernel_size=[8, 8],
                                          strides=[4, 4],
                                          padding="VALID",
                                          name="conv1",
                                          activation=tf.nn.relu)

            self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                          filters=64,
                                          kernel_size=[4, 4],
                                          strides=[2, 2],
                                          padding="VALID",
                                          name="conv2",
                                          activation=tf.nn.relu)

            self.conv3 = tf.layers.conv2d(inputs=self.conv2,
                                          filters=64,
                                          kernel_size=[3, 3],
                                          strides=[1, 1],
                                          padding="VALID",
                                          name="conv3",
                                          activation=tf.nn.relu)
            
            self.flatten = tf.layers.flatten(self.conv3)
            
            self.fc = tf.layers.dense(inputs = self.flatten,
                                  units = 512,
                                  activation = tf.nn.relu,
                                name="fc1")
            
            self.logits = tf.layers.dense(inputs = self.fc,
                                          units = num_outputs, 
                                        activation=None)
            

            self.best_action = tf.argmax(self.logits, name="best_action", axis=1)
            self.max_q = tf.reduce_max(self.logits, name="max_q", axis=1)
               
            if scope == 'Target':
                self.rewards = tf.placeholder(tf.float32, shape=None, name="rewards")    
                self.gamma = tf.constant(gamma, name="Gamma")
                self.done = tf.placeholder(tf.int32, shape=None, name="done_values")

               
                self.td_target = self.rewards + (self.gamma*self.max_q) * tf.cast( tf.abs(self.done -1 ), tf.float32)
              
                
            if scope == 'Q':
                self.target_placeholder = tf.placeholder(tf.float32, shape=None, name="target_placeholder_q")  
                
                self.actions = tf.placeholder(tf.uint8, shape=None, name="AllActions")
                self.actions_onehot = tf.one_hot(self.actions, depth=num_outputs, name="One_Hot")  
                # Q value of the chosen action per sample (reduce over the action axis)
                self.Q = tf.reduce_sum(tf.multiply(self.actions_onehot, self.logits), axis=1)
                
                self.huber_loss = huber_loss(self.target_placeholder-self.Q)
                self.loss = tf.reduce_mean(self.huber_loss)
                
                self.optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate, epsilon=0.01)
                self.train = self.optimizer.minimize(self.loss, name="minimize")
Huber loss function:

def huber_loss(x, delta=1.0):
    """Reference: https://en.wikipedia.org/wiki/Huber_loss"""
    return tf.where(
        tf.abs(x) < delta,
        tf.square(x) * 0.5,
        delta * (tf.abs(x) - 0.5 * delta)
    )
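For reference, huber_loss implements the standard Huber loss from the Wikipedia page cited in its docstring, with threshold delta:

$$
L_\delta(x) =
\begin{cases}
\tfrac{1}{2}x^{2} & \text{if } |x| < \delta,\\
\delta\left(|x| - \tfrac{1}{2}\delta\right) & \text{otherwise.}
\end{cases}
$$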
Replay buffer:

from collections import deque
import random as r

class ReplayBuffer():
    def __init__(self, buffer_size):
        self.buffer = deque([], maxlen=buffer_size)

    def add(self, new_state):
        if len(new_state) != 5:
            raise Exception("States must have: state, action, reward, next_state, done")
        self.buffer.append(new_state)

    def sample(self, batch_size):
        return r.sample(self.buffer, batch_size)
Updating the target network:

from operator import attrgetter

def get_update_target_ops(Q_network, Target_network):

    # 1. get the trainable variables per network
    Q_trainable = tf.trainable_variables(scope=Q_network.scope)
    Target_trainable = tf.trainable_variables(scope=Target_network.scope)
    
    # 2. sort them with sorted(list, key=attrgetter())
    Q_trainable  = sorted(Q_trainable, key=attrgetter("name"))
    Target_trainable  = sorted(Target_trainable, key=attrgetter("name"))

    
    # 3.create a new list with all assign ops
    update_target_expr = []
    for q_var, t_var in zip(Q_trainable, Target_trainable):
        update_target_expr.append(t_var.assign(q_var))

    return update_target_expr
Epsilon-greedy action selection:

import numpy as np

def choose_egreedy_action(session, epsilon, network, state):
    # n (the action-space size) and obs_space (the state shape, e.g. (84, 84, 4))
    # are assumed to be defined globally; they are not shown in the snippets.
    state = np.float32(state / 255.0)
    
    if np.random.rand() <= epsilon:
        return np.random.randint(0, n)
    else:
        state_reshaped = np.reshape(state, (1, *obs_space))
        best_action = session.run(network.best_action, feed_dict={network.x:state_reshaped})[0]
        return best_action
Frame preprocessing and hyperparameters:

from skimage.color import rgb2gray
from skimage.transform import resize

def preprocess_frame(obs):
    processed_observe = np.uint8(
        resize(rgb2gray(obs), (84, 84), mode='constant') * 255)
    return processed_observe
EPISODES = 50000
epsilon = 1.
epsilon_start, epsilon_end = 1.0, 0.1
exploration_steps = 1000000.
epsilon_decay_step = (epsilon_start - epsilon_end) / exploration_steps

batch_size = 32
train_start = 50000
update_target_rate = 10000
gamma = 0.99
buffer_size = 400000
no_op_steps = 30

global_steps = 10000000
Initialize the networks:

tf.reset_default_graph()

Q_network = DQNetwork(scope="Q", state_size=(84, 84, 4), num_outputs=4, gamma=gamma, learning_rate=0.00025)
T_network = DQNetwork(scope="Target", state_size=(84, 84, 4), num_outputs=4, gamma=gamma, learning_rate=0.00025)
update_target_network = get_update_target_ops(Q_network, T_network)
Training loop:

def train(sess, Q, Target, buffer, batch_size):

    # 1. Sample a mini-batch from the replay buffer
    mini_batches = buffer.sample(batch_size)

    observations, actions, rewards, next_observations, done = map(list, zip(*mini_batches))

    observations = np.array(observations) / 255.
    next_observations = np.array(next_observations) / 255.
    td_targets = sess.run( Target.td_target, feed_dict={Target.x : next_observations, Target.rewards:rewards, Target.done:done})

    max_q, loss, _ = sess.run([Q.max_q, Q.loss, Q.train], feed_dict={Q.x : observations, Q.target_placeholder:td_targets, Q.actions:actions})
    
    return loss
import gym
import random
from tqdm import trange

game = gym.make('BreakoutDeterministic-v4')
scores, episodes = [], []

with tf.Session() as sess:
    
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        buffer = ReplayBuffer(buffer_size)

        epsilon_schedule = LinearSchedule(epsilon_start, epsilon_end, train_start, epsilon_decay_step)
        
        done = False
        dead = False
        # 1 episode = 5 lives
        step, score, start_life = 0, 0, 5
        observe = game.reset()
        eps = 0
        # This is one of DeepMind's ideas:
        # do nothing for a random number of steps at the start of an episode to avoid sub-optimal starting states
        for _ in range(random.randint(1, no_op_steps)):
            observe, _, _, _ = game.step(1)
            
        # At start of episode, there is no preceding frame
        # So just copy initial states to make history
        state = preprocess_frame(observe)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (84, 84, 4))    
        
        loss = 0
        
        for global_step in trange(global_steps):
            #global_step += 1
            step += 1

            # get action for the current history and go one step in environment
            action = choose_egreedy_action(sess, epsilon, Q_network, history)

            observe, reward, done, info = game.step(action)


            # pre-process the observation --> history
            next_state = preprocess_frame(observe)
            next_state = np.reshape([next_state], (84, 84, 1))
            #print(next_state.shape)
            next_history = np.append(next_state, history[ :, :, :3], axis=2)


            # if the agent missed the ball, the agent is dead --> but the episode is not over yet
            if start_life > info['ale.lives']:
                dead = True
                start_life = info['ale.lives']

            #if dead:
                #reward = -1
                
            #score += reward
            
            reward = np.clip(reward, -1., 1.)

            # save the sample <s, a, r, s'> to the replay memory
            buffer.add([history, action, reward, next_history, dead])


            epsilon = epsilon_schedule.value(global_step)


            if global_step > train_start:
                train(sess, Q_network, T_network, buffer, batch_size)

            # update the target model with model
            if global_step % update_target_rate == 0:
                #print("update networks")
                sess.run(update_target_network) 

            score += reward

            # if agent is dead, then reset the history
            if dead:
                dead = False
            else:
                history = next_history

            # if done, plot the score over episodes
            if done:
                if eps%100 == 0:  
                    print("episode:", eps, "  score:", score, " global_step: ", global_step,
                         " epsilon: ", epsilon)
                scores.append(score)
                episodes.append(step)
                
                done = False
                dead = False
                # 1 episode = 5 lives
                step, score, start_life = 0, 0, 5
                observe = game.reset()
                eps +=  1
                # This is one of DeepMind's ideas:
                # do nothing for a random number of steps at the start of an episode to avoid sub-optimal starting states
                for _ in range(random.randint(1, no_op_steps)):
                    observe, _, _, _ = game.step(1)

                # At start of episode, there is no preceding frame
                # So just copy initial states to make history
                state = preprocess_frame(observe)
                history = np.stack((state, state, state, state), axis=2)
                history = np.reshape([history], (84, 84, 4))   
                
                
                

            if global_step % 5000 == 0:
                saver.save(sess, f'models/breakout/model_breakout.ckpt') 
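Note that the loop above relies on a LinearSchedule helper for the epsilon decay that is not defined in any of the snippets. A minimal sketch that is consistent with how it is constructed and queried here is shown below; the attribute names and the choice to hold epsilon at its initial value until train_start steps have passed are assumptions, not code from the original post.

class LinearSchedule:
    """Hypothetical epsilon schedule, reconstructed from how it is used above."""

    def __init__(self, start, end, delay, decay_step):
        self.start = start            # initial epsilon (epsilon_start)
        self.end = end                # final epsilon (epsilon_end)
        self.delay = delay            # steps before decay begins (train_start)
        self.decay_step = decay_step  # epsilon decrease per step

    def value(self, t):
        # Hold epsilon constant until training starts, then decay linearly down to `end`.
        if t <= self.delay:
            return self.start
        return max(self.end, self.start - (t - self.delay) * self.decay_step)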
As you can see, the duration of each iteration increases drastically, so training seems to be stuck. I don't know whether this is a problem with my GPU or with the code.


Do you have any idea what could cause this?

I had the same experience when training RL algorithms on the Atari Breakout environment in OpenAI Gym. It happened after my exploration rate had dropped to a very low value. I found the solution in this post:

I knew my problem was similar to the one in that post because, when I rendered the game screen for each episode, I saw that after losing a life in Atari Breakout the ball disappeared (the game was paused).

The reason this was not a problem at the start of training is that the exploration rate was high: while the game was paused, the agent took random actions, which eventually made it pick the action that restarts the game.

=================

I think this might solve your problem, but I am not sure it will (it is generally not recommended in RL).

See this post:

The question in the post above mentions that the algorithm sets the reward to -1 when the agent loses a life. It detects when a life is lost by using the info returned by the call to .step(). My idea is that, when you detect in Atari Breakout that a life has been lost, you have to hard-code the agent to choose the action that restarts the game on the next step, roughly as in the sketch below.
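A rough sketch of that idea, placed inside the step loop of the question's code, could look like this. It reuses the loop's own variables (info, start_life, dead, history, preprocess_frame), and it assumes that action 1 is FIRE, the action that launches the ball in Breakout, which is consistent with the game.step(1) calls already used during the no-op start.

            # after game.step(action) and reading `info`:
            if start_life > info['ale.lives']:      # a life was just lost
                dead = True
                start_life = info['ale.lives']

            if dead:
                # Hard-code the FIRE action (assumed to be index 1) so the ball is
                # relaunched immediately instead of waiting for an epsilon-greedy
                # choice to happen to pick it.
                observe, _, done, info = game.step(1)
                next_state = preprocess_frame(observe)
                next_state = np.reshape([next_state], (84, 84, 1))
                next_history = np.append(next_state, history[:, :, :3], axis=2)
                dead = False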


Again, this approach is not recommended in RL, because the agent should only use its observations when deciding which action to take.

Sorry, my fault. I changed it ;) Any ideas are helpful! :)