Python tf_agent 没有正确训练？_Python_Tensorflow Agents

Python tf_agent 没有正确训练？

python

Python tf_agent 没有正确训练？,python,tensorflow-agents,Python,Tensorflow Agents,这段代码本应使用 TF-Agents 库在 CartPole 环境中训练 DQN（Deep Q Network）智能体，但该智能体似乎没有被正确训练。我正在尝试使用 driver 模块编写一个简单的示例。TF-Agents 库自带的示例我是可以正常运行的。 from __future__ import absolute_import from __future__ import division from __future__ import print_function import tensorflow as tf from tf_agent

这段代码本应使用 TF-Agents 库在 CartPole 环境中训练 DQN（Deep Q Network）智能体，但该智能体似乎没有被正确训练。我正在尝试使用 driver 模块编写一个简单的示例。

TF-Agents 库自带的示例我是可以正常运行的。

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.agents.dqn import dqn_agent
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.drivers import dynamic_step_driver
from tf_agents.metrics import tf_metrics
from tf_agents.eval import metric_utils

# Turn on TensorFlow 2.x behaviour via the compat.v1 API; this runs before
# any environment, network or agent below is constructed.
tf.compat.v1.enable_v2_behavior()


# parameters

# Gym environment id handed to suite_gym.load().
env_name = 'CartPole-v0'

# Data collection: steps gathered by the random policy before training, and
# steps gathered by the collect policy on every training iteration.
initial_steps = 1000
collect_steps_per_iteration = 1

# Training loop length (one agent.train() call per iteration).
num_iterations = 20000

# Replay buffer size and the number of items sampled from it per train step.
replay_buffer_capacity = 100000
batch_size = 64

# Q-network hidden layer sizes and Adam learning rate.
fc_layer_params = (50,)
learning_rate = 1e-3


# load environments
# Two independent Gym instances are created from the same id: one is stepped
# by the collection drivers, the other is reserved for evaluation.
py_train_env, py_eval_env = [suite_gym.load(env_name) for _ in range(2)]

# Wrap each Python environment so it can be driven with TensorFlow ops.
tf_train_env, tf_eval_env = [
    tf_py_environment.TFPyEnvironment(py_env)
    for py_env in (py_train_env, py_eval_env)
]


# create agent
# Specs are read from the training environment once and shared by the
# Q-network and the agent.
observation_spec = tf_train_env.observation_spec()
action_spec = tf_train_env.action_spec()

# Q-network whose fully connected layer sizes come from fc_layer_params.
q_net = q_network.QNetwork(
    observation_spec,
    action_spec,
    fc_layer_params=fc_layer_params,
)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# DQN agent built on that network; every other agent option is left at the
# library default.
tf_agent = dqn_agent.DqnAgent(
    tf_train_env.time_step_spec(),
    action_spec,
    q_network=q_net,
    optimizer=optimizer,
)
tf_agent.initialize()


# replay buffer, policies and drivers

# Buffer storing the trajectories described by the agent's collect_data_spec;
# its batch dimension matches the training environment's batch size.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    batch_size=tf_train_env.batch_size,
    max_length=replay_buffer_capacity)

# random_policy seeds the buffer before training; collect_policy gathers data
# during training; eval_policy is the policy used for evaluation.
random_policy = random_tf_policy.RandomTFPolicy(tf_train_env.time_step_spec(), tf_train_env.action_spec())
collect_policy = tf_agent.collect_policy
eval_policy = tf_agent.policy

# NOTE: `observers` and `num_steps` are passed by keyword on purpose. In
# TF-Agents releases where the driver constructor accepts
# `transition_observers` ahead of `num_steps`, a fourth positional argument
# would be bound to `transition_observers` and the step count would silently
# stay at its default instead of the value intended here.
init_driver = dynamic_step_driver.DynamicStepDriver(
    tf_train_env,
    random_policy,
    observers=[replay_buffer.add_batch],
    num_steps=initial_steps)
collect_driver = dynamic_step_driver.DynamicStepDriver(
    tf_train_env,
    collect_policy,
    observers=[replay_buffer.add_batch],
    num_steps=collect_steps_per_iteration)


# collect initial data
# Fill the replay buffer with random-policy experience before any training.
init_driver.run()

# Dataset over the replay buffer: every element is a batch of `batch_size`
# samples, each spanning num_steps=2 steps (presumably adjacent time steps so
# the agent sees a transition - confirm against the replay buffer docs).
ds = replay_buffer.as_dataset(
    sample_batch_size=batch_size,
    num_steps=2,
    num_parallel_calls=3,
)
ds = ds.prefetch(3)
iterator = iter(ds)


# train agent
total_steps = num_iterations * collect_steps_per_iteration
print('Train Agent(global steps=' + str(total_steps) + '):')
tf_train_env.reset()
for iteration in range(1, num_iterations + 1):
    # Collect fresh experience, then train on one batch sampled from the buffer.
    collect_driver.run()
    experience, _ = next(iterator)
    train_loss = tf_agent.train(experience)

    # Number of environment steps collected by this loop so far.
    env_steps = iteration * collect_steps_per_iteration
    if env_steps % 100 != 0:
        continue

    # Every 100 environment steps: average return of eval_policy over five
    # episodes on the separate evaluation environment.
    metric = [tf_metrics.AverageReturnMetric()]
    result = metric_utils.eager_compute(metric, tf_eval_env, eval_policy, num_episodes=5)
    average_return = result['AverageReturn'].numpy()
    print('step = {0}: loss = {1}: AR = {2}'.format(env_steps, train_loss.loss, average_return))

代码可以正常运行，但智能体在训练后仍然不会玩这个游戏。此外，我预期平均回报（average return）会随着训练逐渐提高，但它却一直保持不变。

试试看 :)

# 回放缓冲区（replay buffer）、策略和驱动器（driver）

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(…)

replay_buffer_observer = replay_buffer.add_batch

from tf_agents.metrics import tf_metrics
# One freshly constructed instance of each metric, in this order: number of
# episodes, environment steps, average return, average episode length.
train_metrics = [
    metric_cls()
    for metric_cls in (
        tf_metrics.NumberOfEpisodes,
        tf_metrics.EnvironmentSteps,
        tf_metrics.AverageReturnMetric,
        tf_metrics.AverageEpisodeLengthMetric,
    )
]

dynamic_step_driver.DynamicStepDriver(…, observers=[replay_buffer_observer] + train_metrics)

试试看
# 回放缓冲区（replay buffer）、策略和驱动器（driver）
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(…)
replay_buffer_observer = replay_buffer.add_batch

from tf_agents.metrics import tf_metrics

# Metrics intended to be passed to a driver as additional observers: number
# of episodes, environment steps, average return, average episode length.
train_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric(),
]
dynamic_step_driver.DynamicStepDriver(…, observers=[replay_buffer_observer] + train_metrics)