Deep Q score for CartPole in Python stays stuck at 9
So I'm using a deep Q implementation with TensorFlow to solve CartPole-v0, but some of the time (about 40% of all runs) the score gets stuck at 9. I tried fixing the seed with tf.set_random_seed, but that still doesn't guarantee the output won't get stuck. Here is my code:
from collections import deque
import tensorflow as tf
import numpy as np
import random
import gym
import matplotlib.pyplot as plt
import pickle
from time import time

t = int(time())

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95  # discount factor
        #self.epsilon = 1.0
        #self.epsilon_min = 0.01
        #self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()  # (graph, inp, out, prediction, train, init)

    def _build_model(self):
        graph = tf.Graph()
        with graph.as_default():
            inp = tf.placeholder(tf.float32, [None, self.state_size])
            out = tf.placeholder(tf.float32, [None, self.action_size])
            w1 = tf.Variable(tf.truncated_normal([self.state_size, 24]))
            b1 = tf.Variable(tf.zeros([24]))
            hidden = tf.nn.tanh(tf.matmul(inp, w1) + b1)
            w2 = tf.Variable(tf.truncated_normal([24, 24]))
            b2 = tf.Variable(tf.zeros([24]))
            hidden1 = tf.nn.tanh(tf.matmul(hidden, w2) + b2)
            w3 = tf.Variable(tf.truncated_normal([24, 24]))
            b3 = tf.Variable(tf.zeros([24]))
            hidden2 = tf.nn.tanh(tf.matmul(hidden1, w3) + b3)
            wo = tf.Variable(tf.truncated_normal([24, self.action_size]))
            bo = tf.Variable(tf.zeros([self.action_size]))
            prediction = tf.matmul(hidden2, wo) + bo
            loss = tf.losses.mean_squared_error(out, prediction)
            train = tf.train.AdamOptimizer().minimize(loss)
            init = tf.global_variables_initializer()
        return graph, inp, out, prediction, train, init

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, sess):
        # always greedy: pick the action with the highest predicted Q-value
        act_values = sess.run(self.model[3], feed_dict={self.model[1]: state})
        return np.argmax(act_values[0])

    def replay(self, batch_size, sess):
        try:
            minibatch = random.sample(self.memory, batch_size)
        except ValueError:
            minibatch = self.memory
        for state, action, reward, next_state, done in minibatch:
            # Q-learning target: r + gamma * max_a' Q(s', a')
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(sess.run(self.model[3], feed_dict={self.model[1]: next_state}))
            target_f = sess.run(self.model[3], feed_dict={self.model[1]: state})
            target_f[0][action] = target
            #print(target_f)
            sess.run(self.model[4], feed_dict={self.model[1]: state, self.model[2]: target_f})

if __name__ == "__main__":
    environment = 'CartPole-v0'
    env = gym.make(environment)
    avgs = deque(maxlen=50)
    rewardLA = []
    agent = DQNAgent(env.observation_space.shape[0], env.action_space.n)
    sess = tf.Session(graph=agent.model[0])
    sess.run(agent.model[5])
    episodes = 10000
    rewardL = []
    for e in range(episodes):
        state = env.reset()
        state = np.reshape(state, [1, 4])
        for time_t in range(500):
            #env.render()
            action = agent.act(state, sess)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, 4])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        avgs.append(time_t)
        rewardLA.append(sum(avgs) / len(avgs))
        print("episode: ", e, "score: ", time_t)
        rewardL.append(time_t)
        agent.replay(32, sess)
    #pickle.dump(rewardL, open(environment + "_" + str(t) + "_rewardL.pickle", "wb"))
    plt.plot(rewardLA)
    plt.show()
I tried changing the optimizer to GD and RMSProp, but nothing made a difference; yet if I simply restart the script it sometimes does much better (reaching 199 within 200 episodes). Why does this happen, and how can I fix it?

Looking at your code, I don't see how the environment is being explored. Don't you need something like epsilon-greedy to make sure exploration actually happens? For example, I tried modifying the agent.act() method as follows, and it seemed to solve the problem:
import math  # needed for math.pow

def act(self, state, sess, episode):
    # with probability 2^(-episode/30), take a random action to explore
    if random.random() < math.pow(2, -episode / 30):
        return env.action_space.sample()
    act_values = sess.run(self.model[3], feed_dict={self.model[1]: state})
    return np.argmax(act_values[0])
Play around with the 30, which, for lack of a better term, I call the "exploration constant".
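To see how fast that schedule decays, here is a quick illustrative snippet (not part of the original answer) that prints the exploration probability 2^(-episode/30) at a few episode counts:

import math

# probability of a random action under: random.random() < 2 ** (-episode / 30)
for episode in (0, 30, 60, 90, 150):
    print(episode, math.pow(2, -episode / 30))
# prints 1.0, 0.5, 0.25, 0.125, ~0.031 -- the probability halves every 30 episodes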
Anyway, the way I see it, without something like epsilon-greedy (or something that decays over time, like the above), you are relying on the neural network's output having enough entropy to cause sufficient exploration. Sometimes that may be the case; other times it isn't.
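For reference, a minimal sketch of the more conventional epsilon-greedy variant, assuming the epsilon, epsilon_min and epsilon_decay attributes commented out in the question's __init__ are re-enabled; decay_epsilon is a hypothetical helper you would call once per episode (e.g. right after agent.replay):

def act(self, state, sess):
    # explore with probability epsilon, otherwise act greedily on predicted Q-values
    if random.random() < self.epsilon:
        return random.randrange(self.action_size)
    act_values = sess.run(self.model[3], feed_dict={self.model[1]: state})
    return np.argmax(act_values[0])

def decay_epsilon(self):
    # multiplicative decay toward a floor, e.g. 1.0 -> 0.01 with factor 0.995
    if self.epsilon > self.epsilon_min:
        self.epsilon *= self.epsilon_decay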