Python: My DDPG model in TF2 gets terrible results


Hi, my DDPG model, which I have implemented in TF2, gets terrible results in every OpenAI Gym environment, and I need help finding the problem. I am running this on my GPU. In the Pendulum environment I get a reward of around -1200/-1000 per episode. The code comes from a course I took on Udemy, but that was written in TF1.x; I rewrote it in TF2, and his TF1.x implementation gets better results. Here is the code:

import tensorflow as tf
import numpy as np
import os
import gym
import random
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Input, Dense, concatenate
from tensorflow.keras.models import Model

class ReplayBuffer():
    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros([size, obs_dim, ], dtype=np.float32)
        self.obs2_buf = np.zeros([size, obs_dim, ], dtype=np.float32)
        self.act_buf = np.zeros([size, act_dim], dtype=np.float32)
        self.reward_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.current = 0
        self.count = 0 
        self.size = size

    def add_experience(self, state, action, reward, next_state, done):
        self.obs1_buf[self.current] = state
        self.act_buf[self.current] = action
        self.reward_buf[self.current] = reward
        self.obs2_buf[self.current] = next_state
        self.done_buf[self.current] = done
        self.current = (self.current + 1) % self.size
        self.count = min(self.count+1, self.size)

    def sample_batch(self, batch_size=32):
        idx = np.random.randint(0, self.count, size=batch_size)
        return dict(s=self.obs1_buf[idx],
                    s2=self.obs2_buf[idx],
                    a=self.act_buf[idx],
                    r=self.reward_buf[idx],
                    d=self.done_buf[idx])

Thanks in advance.

The first thing that comes to mind is the learning rate: 0.01 is too high, even for Pendulum. Try a lower learning rate (e.g. 1e-3 for the actor and 5e-3 for the critic).
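
For reference, a minimal sketch of how the two optimizer lines in DDPG.__init__ could look with those rates (1e-3 for the actor, 5e-3 for the critic; these are starting points suggested above, not tuned values):

    # Sketch only: lower learning rates for the actor (mu) and critic (q) optimizers.
    self.mu_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    self.q_optimizer = tf.keras.optimizers.Adam(learning_rate=5e-3)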

A few other things in the code also stand out:

  • The actor has no target network. Why is that? IIRC, DDPG uses target networks for both the actor and the critic (see the sketch right after this list).
  • Usually it is best to initialize the main and target networks with the same parameters. You can do that with target_model.set_weights(model.get_weights()).
  • In the function play_one, the training steps are done after playing a whole episode. That is probably fine, but it is not necessary: since Pendulum is not real time, you don't need your code to be fast, so you can train while playing (a sketch of that change appears after the full code listing at the end of this post).
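
To make the first two bullets concrete, here is a minimal sketch that reuses the names from the posted code (mu_model, q_target_model, decay = 0.995). It is only an illustration of the idea, not a drop-in patch:

    # In DDPG.__init__, next to the existing q_target_model (sketch):
    self.mu_target_model = mu_model([300, self.num_actions])   # target actor, same architecture as the main actor

    # Start both target networks from the same parameters as the main networks.
    self.mu_target_model.set_weights(self.mu_model.get_weights())
    self.q_target_model.set_weights(self.q_model.get_weights())

    # Soft (Polyak) update of both targets, called once per training step; the target
    # keeps most of its old weights and mixes in a small fraction of the main network.
    def update_target_net(self):
        for target, main in ((self.mu_target_model, self.mu_model),
                             (self.q_target_model, self.q_model)):
            new_weights = [self.decay * tw + (1.0 - self.decay) * mw
                           for tw, mw in zip(target.get_weights(), main.get_weights())]
            target.set_weights(new_weights)

The target actor would then typically be the network used when computing the critic's TD target for the next state.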

If you want to take a look, I implemented DDPG in TensorFlow 2 a while ago; it solves Pendulum in 80 episodes.


class DDPG():
    def __init__(self, env, num_states, num_actions, action_max):

        self.env = env
        self.num_states = num_states
        self.num_actions = num_actions
        self.action_max = action_max
        self.gamma = 0.99
        self.decay = 0.995
    
        self.mu_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
        self.q_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

        def mu_model(hidden_layers):
            inp = Input(shape=(self.num_states, ))
            x = inp

            for layers in hidden_layers[:-1]:
                x = Dense(layers, activation='relu')(x)
            x = Dense(hidden_layers[-1], activation='tanh')(x)

            mu_model = Model(inp, x)
            
            return mu_model
        
        self.mu_model = mu_model([300, self.num_actions])

        def q_model(inp_state, inp_act, hidden_layers):
            inp_state = Input(shape=(inp_state, ))
            inp_mu = Input(shape=(inp_act, ))
            inp = concatenate([inp_state, inp_mu])
            x = inp

            for layers in hidden_layers[:-1]:
                x = Dense(layers, activation='relu')(x)
            x = Dense(hidden_layers[-1], activation='linear')(x)

            q_model = Model([inp_state, inp_mu], x)
            return q_model

        self.q_model = q_model(self.num_states, self.num_actions, hidden_layers=[300, 1])
        
        self.q_target_model = q_model(self.num_states, self.num_actions, hidden_layers=[300, 1])

        #self.mu_do_minimize = tf.function(self.mu_minimize, input_signature=[
                                                            #tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='state')])
        
        self.q_do_minimize = tf.function(self.q_minimize, input_signature=[
                                                            tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='state'),
                                                            tf.TensorSpec(shape=(None, self.num_actions), dtype=tf.float32, name='action'),
                                                            tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='next_state'),
                                                            tf.TensorSpec(shape=(None, ), dtype=tf.float32, name='reward'),
                                                            tf.TensorSpec(shape=(None, ), dtype=tf.float32, name='done_flags')])

    @tf.function
    def train_mu(self, state):
        with tf.GradientTape() as tape:
            actions = self.mu_model(state, training=True)
            critic_value = self.q_model([state, actions], training=True)
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, self.mu_model.trainable_variables)
        self.mu_optimizer.apply_gradients(
            zip(actor_grad, self.mu_model.trainable_variables)
        )
        
    

    def q_minimize(self, state, action, next_state, reward, done):
        def calc_loss():
            q_targ = reward + self.gamma * (1 - done) * self.q_target_model([next_state, action])
            q = self.q_model([state, action])
            cost = tf.reduce_mean((q - q_targ)**2)
            return cost
        self.q_optimizer.minimize(calc_loss, self.q_model.trainable_variables)


    def train(self, state, action, reward, done, next_state):
        state = np.atleast_2d(state)
        next_state = np.atleast_2d(next_state)
        action = np.atleast_2d(action)
        reward = np.atleast_1d(reward)
        done = np.atleast_1d(done)

        self.update_target_net()
        self.train_mu(state)
        self.q_do_minimize(state, action, next_state, reward, done)


    def update_target_net(self):

        mu_weights = np.array(self.mu_model.get_weights())
        q_weights = np.array(self.q_model.get_weights())
        #print(mu_weights.shape)
        #print(q_weights.shape)
        
        mu_target_weights = np.array(self.mu_target_model.get_weights())
        q_target_weights = np.array(self.q_target_model.get_weights())
        
        
        
        self.q_target_model.set_weights(self.decay * q_weights + (1 - self.decay) * q_target_weights)


    def get_action(self, states, noise=None):
        if noise is None: noise = self.ACT_NOISE_SCALE
        if len(states.shape) == 1: states = states.reshape(1,-1)
        action = self.mu_model.predict_on_batch(states)[0]
        if noise != 0:
            action += noise * np.random.randn(self.num_actions)
            action = np.clip(action, -self.action_max, self.action_max)
        return action


def play_one(env, agent, replay_buffer, gamma=0.99, noise=0.1, max_episode_len=1000, start_steps=10000, num_train_ep=100, batch_size=100, test_ep_agent=25):
    returns = []
    num_steps = 0

    for ep in range(num_train_ep):
        s, ep_return, ep_len, d = env.reset(), 0, 0, False

        while not (d or ep_len == max_episode_len):
            env.render()
            if num_steps > start_steps:
                a = agent.get_action(s, noise)
            else:
                a = env.action_space.sample()

            num_steps+=1
            if num_steps == start_steps:
                print("USING AGENT ACTIONS NOW")

            s2, r, d, _ = env.step(a)
            ep_return+=r
            ep_len+=1
            #print(s.shape)
            d = False if ep_len == max_episode_len else d

            replay_buffer.add_experience(s, a, r, s2, d)

            s = s2
        for _ in range(ep_len):
            batch = replay_buffer.sample_batch()
            state, next_state, action, reward, done = batch['s'], batch['s2'], batch['a'], batch['r'], batch['d']

            loss = agent.train(state, action, reward, done, next_state)

        returns.append(ep_return)
        print('Iter:', ep, 'Rewards:', ep_return)


    
    return returns
    
if __name__ == '__main__':
    
    env = gym.make('Pendulum-v0')
    obs_dim1 = env.observation_space.shape[0]
    
    act_dim1 = env.action_space.shape[0]
    
    action_max1 = env.action_space.high[0]
    actor = DDPG(env, obs_dim1, act_dim1, action_max1)
    replay_buffer = ReplayBuffer(obs_dim1, act_dim1, size=100000)

    returns = play_one(env, actor, replay_buffer)
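
On the last point in the answer above, here is a minimal sketch of how the inner loop of play_one could train once per environment step instead of after the episode ends. It reuses the interfaces from the posted code (agent.train, replay_buffer.add_experience, sample_batch) and assumes training only starts once the buffer holds at least batch_size transitions:

    # Sketch: train-as-you-play variant of the inner loop in play_one (illustration only).
    while not (d or ep_len == max_episode_len):
        a = agent.get_action(s, noise) if num_steps > start_steps else env.action_space.sample()
        num_steps += 1

        s2, r, d, _ = env.step(a)
        ep_return += r
        ep_len += 1
        d = False if ep_len == max_episode_len else d

        replay_buffer.add_experience(s, a, r, s2, d)
        s = s2

        # One gradient update per environment step, once enough samples are stored.
        if replay_buffer.count >= batch_size:
            batch = replay_buffer.sample_batch(batch_size)
            agent.train(batch['s'], batch['a'], batch['r'], batch['d'], batch['s2'])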