Python 损耗值增加并转变为NaN后,Q表值也转变为NaN。有人能解释一下原因吗?
这是我的神经网络模型代码。Python 损耗值增加并转变为NaN后,Q表值也转变为NaN。有人能解释一下原因吗?,python,tensorflow,deep-learning,reinforcement-learning,q-learning,Python,Tensorflow,Deep Learning,Reinforcement Learning,Q Learning,这是我的神经网络模型代码。 self.gamma = 0.95 self.epsilon = 1.0 self.epsilon_min = 0.01 self.epsilon_decay = 0.999 tf.reset_default_graph() self.sess = tf.InteractiveSession() self.X = tf.placeholder(tf.float32, [None, self.state_space]) self.Y = tf.placehol
self.gamma = 0.95
self.epsilon = 1.0
self.epsilon_min = 0.01
self.epsilon_decay = 0.999
tf.reset_default_graph()
self.sess = tf.InteractiveSession()
self.X = tf.placeholder(tf.float32, [None, self.state_space])
self.Y = tf.placeholder(tf.float32, [None, self.action_space])
layer1 = tf.layers.dense(self.X, 128, activation = tf.nn.leaky_relu)
layer2 = tf.layers.dense(layer1, 64, activation = tf.nn.leaky_relu)
layer3 = tf.layers.dense(layer2, 32, activation = tf.nn.leaky_relu)
self.outputs = tf.layers.dense(layer3, self.action_space, activation=tf.nn.softmax)
self.loss = tf.reduce_mean(tf.square(self.Y - self.outputs))
self.optimizer = tf.train.AdamOptimizer(1e-4).minimize(self.loss)
self.sess.run(tf.global_variables_initializer())
X = np.empty((replay_size, self.state_space))
Y = np.empty((replay_size, self.action_space))
states = np.array([a[0][0] for a in mini_batch])
new_states = np.array([a[3][0] for a in mini_batch])
Q = self.sess.run(self.outputs, feed_dict = {self.X: states})
print("Q[0][0] = ", Q[0][0])
Q_new = self.sess.run(self.outputs, feed_dict = {self.X: new_states})
for i in range(len(mini_batch)):
state, action, reward, next_state, done = mini_batch[i]
target = Q[i]
target[action] = reward
if not done:
target[action] += self.gamma * np.amax(Q_new[i])
X[i] = state
Y[i] = target
loss, O = self.sess.run([self.loss, self.optimizer], feed_dict = {self.X: X, self.Y: Y})
这是Q值的代码片段。
self.gamma = 0.95
self.epsilon = 1.0
self.epsilon_min = 0.01
self.epsilon_decay = 0.999
tf.reset_default_graph()
self.sess = tf.InteractiveSession()
self.X = tf.placeholder(tf.float32, [None, self.state_space])
self.Y = tf.placeholder(tf.float32, [None, self.action_space])
layer1 = tf.layers.dense(self.X, 128, activation = tf.nn.leaky_relu)
layer2 = tf.layers.dense(layer1, 64, activation = tf.nn.leaky_relu)
layer3 = tf.layers.dense(layer2, 32, activation = tf.nn.leaky_relu)
self.outputs = tf.layers.dense(layer3, self.action_space, activation=tf.nn.softmax)
self.loss = tf.reduce_mean(tf.square(self.Y - self.outputs))
self.optimizer = tf.train.AdamOptimizer(1e-4).minimize(self.loss)
self.sess.run(tf.global_variables_initializer())
X = np.empty((replay_size, self.state_space))
Y = np.empty((replay_size, self.action_space))
states = np.array([a[0][0] for a in mini_batch])
new_states = np.array([a[3][0] for a in mini_batch])
Q = self.sess.run(self.outputs, feed_dict = {self.X: states})
print("Q[0][0] = ", Q[0][0])
Q_new = self.sess.run(self.outputs, feed_dict = {self.X: new_states})
for i in range(len(mini_batch)):
state, action, reward, next_state, done = mini_batch[i]
target = Q[i]
target[action] = reward
if not done:
target[action] += self.gamma * np.amax(Q_new[i])
X[i] = state
Y[i] = target
loss, O = self.sess.run([self.loss, self.optimizer], feed_dict = {self.X: X, self.Y: Y})
我已经被这个问题困扰了一个星期了。任何帮助都将不胜感激