Tensorflow 使用TFAgent的自定义环境
标签:tensorflow, keras, reinforcement-learning
我正在尝试使用TF-Agents包学习自定义环境。我正在按照ML手册中的步骤操作。我的目标是在自己编写的网格世界环境中使用DQN代理。网格世界环境如下:
class MyEnvironment(tf_agents.environments.py_environment.PyEnvironment):
    """A 4x4 grid world in which the agent starts in cell (0, 0).

    The observation is a 4x4 int32 array containing a single 1 at the
    agent's position (all zeros once the agent has left the grid).
    Actions 0..3 shift the position by (-1, 0), (+1, 0), (0, -1) and
    (0, +1) respectively along the two observation axes.

    An episode terminates with reward -1 when the agent leaves the grid
    and with reward +10 when it reaches cell (3, 3); every other step
    yields reward 0.
    """

    def __init__(self, discount=1.0):
        """Create the environment.

        Args:
            discount: Discount passed to every non-terminal transition.
        """
        super().__init__()
        self.discount = discount
        self._action_spec = tf_agents.specs.BoundedArraySpec(
            shape=(), dtype=np.int32, name="action", minimum=0, maximum=3)
        self._observation_spec = tf_agents.specs.BoundedArraySpec(
            shape=(4, 4), dtype=np.int32, name="observation",
            minimum=0, maximum=1)

    def action_spec(self):
        """Return the spec of the scalar action (an integer in [0, 3])."""
        return self._action_spec

    def observation_spec(self):
        """Return the spec of the 4x4 binary observation grid."""
        return self._observation_spec

    def _reset(self):
        """Move the agent back to cell (0, 0) and start a new episode."""
        self._state = np.zeros(2, dtype=np.int32)
        grid = np.zeros((4, 4), dtype=np.int32)
        grid[self._state[0], self._state[1]] = 1
        return tf_agents.trajectories.time_step.restart(grid)

    def _step(self, action):
        """Apply `action` and return the resulting time step."""
        time_step = tf_agents.trajectories.time_step
        moves = ((-1, 0), (+1, 0), (0, -1), (0, +1))
        self._state += moves[action]

        grid = np.zeros((4, 4), dtype=np.int32)

        # Leaving the grid ends the episode with an all-zero observation.
        if self._state.min() < 0 or self._state.max() > 3:
            return time_step.termination(grid, -1)

        grid[self._state[0], self._state[1]] = 1

        # Reaching the bottom-right corner is the goal.
        if np.all(self._state == np.array([3, 3])):
            return time_step.termination(grid, +10)

        return time_step.transition(grid, 0, self.discount)
最后,DQN代理是
# Counter incremented by the agent on every training step; it also drives
# the epsilon schedule below.
train_step = tf.Variable(0)
update_period = 4  # train the model every 4 steps

# Use `learning_rate`: the `lr` alias is deprecated and is rejected by
# newer Keras releases.
optimizer = keras.optimizers.RMSprop(
    learning_rate=2.5e-4, rho=0.95, momentum=0.0, epsilon=0.00001,
    centered=True)

# Exploration rate, decayed from 1.0 down to 0.01 over
# 250000 // update_period training steps.
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,
    decay_steps=250000 // update_period,
    end_learning_rate=0.01)

agent = DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    target_update_period=2000,  # <=> 32,000 ALE frames
    td_errors_loss_fn=keras.losses.Huber(reduction="none"),
    gamma=0.99,  # discount factor
    train_step_counter=train_step,
    # Evaluated lazily so epsilon follows the current train step.
    epsilon_greedy=lambda: epsilon_fn(train_step))
agent.initialize()
我尝试把环境中的规格(spec)改为 BoundedTensorSpec:
然而,这导致:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-53-ce737b2b13fd> in <module>()
21
22
---> 23 agent = DqnAgent(tf_env.time_step_spec(),
24 tf_env.action_spec(),
25 q_network=q_net,
1 frames
/usr/local/lib/python3.6/dist-packages/tf_agents/environments/py_environment.py in time_step_spec(self)
147 the step_type, reward, discount, and observation structure.
148 """
--> 149 return ts.time_step_spec(self.observation_spec(), self.reward_spec())
150
151 def current_time_step(self) -> ts.TimeStep:
/usr/local/lib/python3.6/dist-packages/tf_agents/trajectories/time_step.py in time_step_spec(observation_spec, reward_spec)
388 'Expected observation and reward specs to both be either tensor or '
389 'array specs, but saw spec values {} vs. {}'
--> 390 .format(first_observation_spec, first_reward_spec))
391 if isinstance(first_observation_spec, tf.TypeSpec):
392 return TimeStep(
TypeError: Expected observation and reward specs to both be either tensor or array specs, but saw spec values BoundedTensorSpec(shape=(4, 4), dtype=tf.int32, name='observation', minimum=array(0, dtype=int32), maximum=array(1, dtype=int32)) vs. ArraySpec(shape=(), dtype=dtype('float32'), name='reward')
但仍然导致了相同的错误。有什么办法可以解决这个问题吗?
回答:您不能将 TensorSpec 用于 PyEnvironment 类对象,这就是您尝试的解决方案不起作用的原因。一个简单的修复方法是使用原始代码:
self._action_spec = tf_agents.specs.BoundedArraySpec(shape=(), dtype=np.int32, name="action", minimum=0, maximum=3)
self._observation_spec = tf_agents.specs.BoundedArraySpec(shape=(4, 4), dtype=np.int32, name="observation", minimum=0, maximum=1)
然后像这样包装您的环境:
# Instantiate the Python environment (it keeps its array specs).
env= MyEnvironment()
# Wrap it in a TFPyEnvironment; the agent is then built from tf_env's specs.
tf_env = tf_agents.environments.tf_py_environment.TFPyEnvironment(env)
这是最简单的做法。或者,您可以将您的环境定义为 TFEnvironment 类对象,使用 TensorSpec,并将所有环境代码更改为在张量(tensor)上操作。我不建议初学者这样做…
评论:非常感谢。我刚想出来,就来这里发了!但是你能给我一个关于如何直接使用 TFEnvironment 类的链接吗?
评论:@kosa 你可以在 TF-Agents 文档中找到它。
self._action_spec = tf_agents.specs.BoundedTensorSpec(
shape=(), dtype=np.int32, name="action", minimum=0, maximum=3)
self._observation_spec = tf_agents.specs.BoundedTensorSpec(
shape=(4, 4), dtype=np.int32, name="observation", minimum=0, maximum=1)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-53-ce737b2b13fd> in <module>()
21
22
---> 23 agent = DqnAgent(tf_env.time_step_spec(),
24 tf_env.action_spec(),
25 q_network=q_net,
1 frames
/usr/local/lib/python3.6/dist-packages/tf_agents/environments/py_environment.py in time_step_spec(self)
147 the step_type, reward, discount, and observation structure.
148 """
--> 149 return ts.time_step_spec(self.observation_spec(), self.reward_spec())
150
151 def current_time_step(self) -> ts.TimeStep:
/usr/local/lib/python3.6/dist-packages/tf_agents/trajectories/time_step.py in time_step_spec(observation_spec, reward_spec)
388 'Expected observation and reward specs to both be either tensor or '
389 'array specs, but saw spec values {} vs. {}'
--> 390 .format(first_observation_spec, first_reward_spec))
391 if isinstance(first_observation_spec, tf.TypeSpec):
392 return TimeStep(
TypeError: Expected observation and reward specs to both be either tensor or array specs, but saw spec values BoundedTensorSpec(shape=(4, 4), dtype=tf.int32, name='observation', minimum=array(0, dtype=int32), maximum=array(1, dtype=int32)) vs. ArraySpec(shape=(), dtype=dtype('float32'), name='reward')
self._reward_spec = tf_agents.specs.TensorSpec((1,), np.dtype('float32'), 'reward')
self._action_spec = tf_agents.specs.BoundedArraySpec(shape=(), dtype=np.int32, name="action", minimum=0, maximum=3)
self._observation_spec = tf_agents.specs.BoundedArraySpec(shape=(4, 4), dtype=np.int32, name="observation", minimum=0, maximum=1)
# Instantiate the Python environment (it keeps its array specs).
env= MyEnvironment()
# Wrap it in a TFPyEnvironment; the agent is then built from tf_env's specs.
tf_env = tf_agents.environments.tf_py_environment.TFPyEnvironment(env)