Python:我用 TensorFlow 2 实现的 DDPG 模型训练结果非常糟糕


import tensorflow as tf
import numpy as np
import os
import gym
import random
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Input, Dense, concatenate
from tensorflow.keras.models import Model

class ReplayBuffer():
    """Fixed-capacity ring buffer of (state, action, reward, next_state, done) transitions.

    All fields are stored in pre-allocated float32 NumPy arrays. Once ``size``
    transitions have been written, new ones overwrite the oldest entries.
    """

    def __init__(self, obs_dim, act_dim, size):
        """Allocate storage.

        Args:
            obs_dim: length of one observation vector.
            act_dim: length of one action vector.
            size: maximum number of transitions kept.
        """
        self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.act_buf = np.zeros([size, act_dim], dtype=np.float32)
        self.reward_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.current = 0  # index of the next slot to write
        self.count = 0    # number of valid transitions stored so far
        self.size = size

    def add_experience(self, state, action, reward, next_state, done):
        """Write one transition at the current slot and advance the write index."""
        self.obs1_buf[self.current] = state
        self.act_buf[self.current] = action
        self.reward_buf[self.current] = reward
        self.obs2_buf[self.current] = next_state
        self.done_buf[self.current] = done
        # Wrap around so the oldest transition is overwritten once full.
        self.current = (self.current + 1) % self.size
        self.count = min(self.count + 1, self.size)

    def sample_batch(self, batch_size=32):
        """Return ``batch_size`` transitions drawn uniformly with replacement.

        Returns:
            dict with keys ``'s'`` (states), ``'s2'`` (next states), ``'a'``
            (actions), ``'r'`` (rewards) and ``'d'`` (done flags); every value
            is a float32 array whose first axis has length ``batch_size``.

        Raises:
            ValueError: if no transition has been stored yet.
        """
        if self.count == 0:
            raise ValueError("cannot sample from an empty replay buffer")
        # Only indices below `count` hold real data; the rest are still zeros.
        idx = np.random.randint(0, self.count, size=batch_size)
        return dict(s=self.obs1_buf[idx],
                    s2=self.obs2_buf[idx],
                    a=self.act_buf[idx],
                    r=self.reward_buf[idx],
                    d=self.done_buf[idx])




  • Actor(演员)没有目标网络。为什么?据我所知(IIRC),DDPG 的 actor(演员)和 critic(评论家)都各自拥有目标网络。
  • 通常,最好使用相同的参数初始化主网络和目标网络。您可以使用
    target_model.set_weights(model.get_weights())
  • 在功能

如果你想参考一下,我不久前用 TensorFlow 2 实现了 DDPG。它大约在 80 个回合(episode)内解决了 Pendulum(钟摆)问题。

class DDPG():
    """Deep Deterministic Policy Gradient agent built on tf.keras.

    Holds an actor ``mu_model`` (state -> action) and a critic ``q_model``
    ((state, action) -> Q-value), each paired with a slowly-tracking target
    copy that is used to compute the bootstrapped critic target.
    """

    # Std-dev of the Gaussian exploration noise used by get_action() when the
    # caller does not pass an explicit ``noise`` value.
    ACT_NOISE_SCALE = 0.1

    def __init__(self, env, num_states, num_actions, action_max):
        """Build the networks, their target copies, optimizers and graph functions.

        Args:
            env: environment handle; only stored on the instance.
            num_states: length of an observation vector.
            num_actions: length of an action vector.
            action_max: symmetric action bound; the tanh actor output is
                multiplied by it and noisy actions are clipped to it.
        """
        self.env = env
        self.num_states = num_states
        self.num_actions = num_actions
        self.action_max = action_max
        self.gamma = 0.99   # discount factor
        self.decay = 0.995  # Polyak factor: share of the old target weights kept per update
        # NOTE(review): 0.01 is a large step size for DDPG; lower it if training diverges.
        self.mu_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
        self.q_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

        self.mu_model = self._build_mu_model([300, self.num_actions])
        self.mu_target_model = self._build_mu_model([300, self.num_actions])
        self.q_model = self._build_q_model([300, 1])
        self.q_target_model = self._build_q_model([300, 1])

        # Targets must start as exact copies of the main networks; otherwise
        # the first critic targets come from an unrelated random network.
        self.mu_target_model.set_weights(self.mu_model.get_weights())
        self.q_target_model.set_weights(self.q_model.get_weights())

        self.mu_do_minimize = tf.function(self.train_mu, input_signature=[
            tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='state')])
        self.q_do_minimize = tf.function(self.q_minimize, input_signature=[
            tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='state'),
            tf.TensorSpec(shape=(None, self.num_actions), dtype=tf.float32, name='action'),
            tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='next_state'),
            tf.TensorSpec(shape=(None, ), dtype=tf.float32, name='reward'),
            tf.TensorSpec(shape=(None, ), dtype=tf.float32, name='done_flags')])

    def _build_mu_model(self, hidden_layers):
        """Return an actor: ReLU layers for all but the last size, tanh on the last."""
        inp = Input(shape=(self.num_states, ))
        x = inp
        for units in hidden_layers[:-1]:
            x = Dense(units, activation='relu')(x)
        x = Dense(hidden_layers[-1], activation='tanh')(x)
        return Model(inp, x)

    def _build_q_model(self, hidden_layers):
        """Return a critic over concatenated (state, action) with a linear last layer."""
        inp_state = Input(shape=(self.num_states, ))
        inp_act = Input(shape=(self.num_actions, ))
        x = concatenate([inp_state, inp_act])
        for units in hidden_layers[:-1]:
            x = Dense(units, activation='relu')(x)
        x = Dense(hidden_layers[-1], activation='linear')(x)
        return Model([inp_state, inp_act], x)

    def train_mu(self, state):
        """Take one gradient step on the actor and return the actor loss.

        The loss is the negated mean critic value of the actor's own actions,
        so minimizing it pushes the policy towards higher Q-values.
        """
        with tf.GradientTape() as tape:
            # Scale the tanh output from [-1, 1] to the environment's action range.
            actions = self.mu_model(state, training=True) * self.action_max
            critic_value = self.q_model([state, actions], training=True)
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_value)

        # Only the actor's variables are updated; the critic is just differentiated through.
        actor_grad = tape.gradient(actor_loss, self.mu_model.trainable_variables)
        self.mu_optimizer.apply_gradients(
            zip(actor_grad, self.mu_model.trainable_variables))
        return actor_loss

    def q_minimize(self, state, action, next_state, reward, done):
        """Take one gradient step on the critic and return the mean squared TD error.

        ``reward`` and ``done`` are rank-1 (one entry per transition), so the
        critic outputs are squeezed to rank 1 before being combined with them.
        """
        # Bootstrapped target: computed with the *target* actor and critic on
        # the next state, outside the tape so no gradient flows into it.
        next_action = self.mu_target_model(next_state) * self.action_max
        next_q = tf.squeeze(self.q_target_model([next_state, next_action]), axis=1)
        q_targ = reward + self.gamma * (1.0 - done) * next_q

        with tf.GradientTape() as tape:
            q = tf.squeeze(self.q_model([state, action], training=True), axis=1)
            cost = tf.reduce_mean((q - q_targ) ** 2)

        q_grad = tape.gradient(cost, self.q_model.trainable_variables)
        self.q_optimizer.apply_gradients(
            zip(q_grad, self.q_model.trainable_variables))
        return cost

    def train(self, state, action, reward, done, next_state):
        """Run one full update (critic, actor, target networks) on a batch.

        Inputs are converted to float32 arrays so they match the signatures of
        the compiled update functions. Returns the critic loss as a float.
        """
        state = np.atleast_2d(np.asarray(state, dtype=np.float32))
        next_state = np.atleast_2d(np.asarray(next_state, dtype=np.float32))
        action = np.atleast_2d(np.asarray(action, dtype=np.float32))
        reward = np.atleast_1d(np.asarray(reward, dtype=np.float32))
        done = np.atleast_1d(np.asarray(done, dtype=np.float32))

        q_loss = self.q_do_minimize(state, action, next_state, reward, done)
        self.mu_do_minimize(state)
        self.update_target_net()
        return float(q_loss)

    def _soft_update(self, target_model, model):
        """Blend ``model``'s weights into ``target_model`` (Polyak averaging)."""
        # Weight tensors have different shapes, so blend them one by one
        # instead of stacking them into a single array.
        blended = [self.decay * target_w + (1.0 - self.decay) * w
                   for target_w, w in zip(target_model.get_weights(),
                                          model.get_weights())]
        target_model.set_weights(blended)

    def update_target_net(self):
        """Move both target networks a small step towards their main networks."""
        self._soft_update(self.mu_target_model, self.mu_model)
        self._soft_update(self.q_target_model, self.q_model)

    def get_action(self, states, noise=None):
        """Return the policy action for the first row of ``states``.

        Args:
            states: one observation (1-D) or a batch (2-D); only the action
                for the first row is returned.
            noise: std-dev of Gaussian exploration noise; ``None`` selects
                ``ACT_NOISE_SCALE`` and ``0`` gives the deterministic action.

        Returns:
            Array of length ``num_actions`` within ``[-action_max, action_max]``.
        """
        if noise is None:
            noise = self.ACT_NOISE_SCALE
        states = np.atleast_2d(np.asarray(states, dtype=np.float32))
        raw = np.asarray(self.mu_model.predict_on_batch(states), dtype=np.float32)
        action = raw[0] * self.action_max
        if noise != 0:
            action = action + noise * np.random.randn(self.num_actions)
        return np.clip(action, -self.action_max, self.action_max)

def play_one(env, agent, replay_buffer, gamma=0.99, noise=0.1, max_episode_len=1000, start_steps=10000, num_train_ep=100, batch_size=100, test_ep_agent=25):
    """Collect experience for ``num_train_ep`` episodes and train the agent after each.

    Args:
        env: environment whose ``reset()`` returns an observation and whose
            ``step(a)`` returns ``(obs, reward, done, info)``.
        agent: object providing ``get_action(state, noise)`` and
            ``train(state, action, reward, done, next_state)``.
        replay_buffer: object providing ``add_experience(...)`` and
            ``sample_batch(batch_size)`` returning a dict with keys
            ``'s'``, ``'s2'``, ``'a'``, ``'r'``, ``'d'``.
        gamma: unused here; kept so existing callers keep working.
        noise: exploration noise scale forwarded to ``agent.get_action``.
        max_episode_len: hard cap on the number of steps per episode.
        start_steps: number of initial environment steps taken with random
            actions before the agent's policy is used.
        num_train_ep: number of episodes to run.
        batch_size: number of transitions per training batch.
        test_ep_agent: unused here; kept so existing callers keep working.

    Returns:
        List with the undiscounted return of every episode.
    """
    returns = []
    num_steps = 0

    for ep in range(num_train_ep):
        s, ep_return, ep_len, d = env.reset(), 0, 0, False

        while not (d or ep_len == max_episode_len):
            # Uniform random actions first to fill the buffer with diverse
            # data, then the (noisy) learned policy.
            if num_steps > start_steps:
                a = agent.get_action(s, noise)
            else:
                a = env.action_space.sample()

            if num_steps == start_steps:
                print("USING AGENT ACTIONS NOW")

            s2, r, d, info = env.step(a)
            ep_return += r
            ep_len += 1
            num_steps += 1

            # An episode cut off by a time limit is not a true terminal state,
            # so it must not zero out the bootstrapped value of s2.
            timed_out = (ep_len == max_episode_len
                         or info.get('TimeLimit.truncated', False))
            replay_buffer.add_experience(s, a, r, s2, False if timed_out else d)

            s = s2

        # One gradient update per environment step taken in this episode.
        for _ in range(ep_len):
            batch = replay_buffer.sample_batch(batch_size)
            state, next_state, action, reward, done = batch['s'], batch['s2'], batch['a'], batch['r'], batch['d']

            agent.train(state, action, reward, done, next_state)

        returns.append(ep_return)
        print('Iter:', ep, 'Rewards:', ep_return)

    return returns
if __name__ == '__main__':
    # Script entry point: build the Pendulum task, size the agent and the
    # replay buffer from the environment's spaces, then run training.
    env = gym.make('Pendulum-v0')

    # First entry of each space's shape is the vector length; the first
    # entry of the action space's upper bound is used as the action limit.
    obs_dim1 = env.observation_space.shape[0]
    act_dim1 = env.action_space.shape[0]
    action_max1 = env.action_space.high[0]

    actor = DDPG(env, obs_dim1, act_dim1, action_max1)
    replay_buffer = ReplayBuffer(obs_dim1, act_dim1, size=100000)

    returns = play_one(env, actor, replay_buffer)