Python 3.x 深度神经网络在训练时不更新权重
我目前正在学习tensorflow,刚刚开始掌握它的图形概念。现在,我尝试使用梯度下降(Adam optimizer)实现一个NN来解决cartpole环境。我首先随机初始化我的体重,然后在训练期间采取随机行动(考虑现有体重)。测试时,我总是以最大的概率采取行动。然而,我的分数总是在10左右,方差在0.8左右。总是。它并没有以一种显著的方式改变,看起来它总是在每一步都采取纯粹的随机行动,根本不学习任何东西。正如我所说,权重似乎从未正确更新过。我需要在哪里以及如何做到这一点 这是我的密码:Python 3.x 深度神经网络在训练时不更新权重,python-3.x,tensorflow,neural-network,openai-gym,Python 3.x,Tensorflow,Neural Network,Openai Gym,我目前正在学习tensorflow,刚刚开始掌握它的图形概念。现在,我尝试使用梯度下降(Adam optimizer)实现一个NN来解决cartpole环境。我首先随机初始化我的体重,然后在训练期间采取随机行动(考虑现有体重)。测试时,我总是以最大的概率采取行动。然而,我的分数总是在10左右,方差在0.8左右。总是。它并没有以一种显著的方式改变,看起来它总是在每一步都采取纯粹的随机行动,根本不学习任何东西。正如我所说,权重似乎从未正确更新过。我需要在哪里以及如何做到这一点 这是我的密码: imp
import tensorflow as tf
import numpy as np
from gym.envs.classic_control import CartPoleEnv
env = CartPoleEnv()
learning_rate = 10**(-3)
gamma = 0.9999
n_train_trials = 10**3
n_test_trials = 10**2
n_actions = env.action_space.n
n_obs = env.observation_space.high.__len__()
goal_steps = 200
should_render = False
print_per_episode = 100
state_holder = tf.placeholder(dtype=tf.float32, shape=(None, n_obs), name='symbolic_state')
actions_one_hot_holder = tf.placeholder(dtype=tf.float32, shape=(None, n_actions),
name='symbolic_actions_one_hot_holder')
discounted_rewards_holder = tf.placeholder(dtype=tf.float32, shape=None, name='symbolic_reward')
# initialize neurons list dynamically
def get_neurons_list():
i = n_obs
n_neurons_list = [i]
while i < (n_obs * n_actions) // (n_actions // 2):
i *= 2
n_neurons_list.append(i)
while i // 2 > n_actions:
i = i // 2
n_neurons_list.append(i)
n_neurons_list.append(n_actions)
# print(n_neurons_list)
return n_neurons_list
with tf.name_scope('nonlinear_policy'):
# create list of layers with sizes
n_neurons_list = get_neurons_list()
network = None
for i in range((len(n_neurons_list) - 1)):
theta = tf.Variable(tf.random_normal([n_neurons_list[i], n_neurons_list[i+1]]))
bias = tf.Variable(tf.random_normal([n_neurons_list[i+1]]))
if network is None:
network = tf.matmul(state_holder, theta) + bias
else:
network = tf.matmul(network, theta) + bias
if i < len(n_neurons_list) - 1:
network = tf.nn.relu(network)
action_probabilities = tf.nn.softmax(network)
testing_action_choice = tf.argmax(action_probabilities, dimension=1, name='testing_action_choice')
with tf.name_scope('loss'):
actually_chosen_probability = action_probabilities * actions_one_hot_holder
L_theta = -1 * (tf.reduce_sum(tf.log(actually_chosen_probability)) * tf.reduce_sum(discounted_rewards_holder))
with tf.name_scope('train'):
# We define the optimizer to use the ADAM optimizer, and ask it to minimize our loss
gd_opt = tf.train.AdamOptimizer(learning_rate).minimize(L_theta)
sess = tf.Session() # FOR NOW everything is symbolic, this object has to be called to compute each value of Q
# Start
sess.run(tf.global_variables_initializer())
observation = env.reset()
batch_rewards = []
states = []
action_one_hots = []
episode_rewards = []
episode_rewards_list = []
episode_steps_list = []
step = 0
episode_no = 0
while episode_no <= n_train_trials:
if should_render: env.render()
step += 1
action_probability_values = sess.run(action_probabilities,
feed_dict={state_holder: [observation]})
# Choose the action using the action probabilities output by the policy implemented in tensorflow.
action = np.random.choice(np.arange(n_actions), p=action_probability_values.ravel())
# Calculating the one-hot action array for use by tensorflow
action_arr = np.zeros(n_actions)
action_arr[action] = 1.
action_one_hots.append(action_arr)
# Record states
states.append(observation)
observation, reward, done, info = env.step(action)
# We don't want to go above 200 steps
if step >= goal_steps:
done = True
batch_rewards.append(reward)
episode_rewards.append(reward)
# If the episode is done, and it contained at least one step, do the gradient updates
if len(batch_rewards) > 0 and done:
# First calculate the discounted rewards for each step
batch_reward_length = len(batch_rewards)
discounted_batch_rewards = batch_rewards.copy()
for i in range(batch_reward_length):
discounted_batch_rewards[i] *= (gamma ** (batch_reward_length - i - 1))
# Next run the gradient descent step
# Note that each of action_one_hots, states, discounted_batch_rewards has the first dimension as the length
# of the current trajectory
gradients = sess.run(gd_opt, feed_dict={actions_one_hot_holder: action_one_hots, state_holder: states,
discounted_rewards_holder: discounted_batch_rewards})
action_one_hots = []
states = []
batch_rewards = []
if done:
# Done with episode. Reset stuff.
episode_no += 1
episode_rewards_list.append(np.sum(episode_rewards))
episode_steps_list.append(step)
episode_rewards = []
step = 0
observation = env.reset()
if episode_no % print_per_episode == 0:
print("Episode {}: Average steps in last {} episodes".format(episode_no, print_per_episode),
np.mean(episode_steps_list[(episode_no - print_per_episode):episode_no]), '+-',
np.std(episode_steps_list[(episode_no - print_per_episode):episode_no])
)
observation = env.reset()
episode_rewards_list = []
episode_rewards = []
episode_steps_list = []
step = 0
episode_no = 0
print("Testing")
while episode_no <= n_test_trials:
env.render()
step += 1
# For testing, we choose the action using an argmax.
test_action, = sess.run([testing_action_choice],
feed_dict={state_holder: [observation]})
observation, reward, done, info = env.step(test_action[0])
if step >= 200:
done = True
episode_rewards.append(reward)
if done:
episode_no += 1
episode_rewards_list.append(np.sum(episode_rewards))
episode_steps_list.append(step)
episode_rewards = []
step = 0
observation = env.reset()
if episode_no % print_per_episode == 0:
print("Episode {}: Average steps in last {} episodes".format(episode_no, print_per_episode),
np.mean(episode_steps_list[(episode_no - print_per_episode):episode_no]), '+-',
np.std(episode_steps_list[(episode_no - print_per_episode):episode_no])
)
将tensorflow导入为tf
将numpy作为np导入
从gym.envs.classic_控件导入CartPoleEnv
env=CartPoleEnv()
学习率=10**(-3)
伽马=0.9999
n\u列车试验=10**3
n\u试验\u试验=10**2
n\u actions=env.action\u space.n
n_obs=环境观测空间高
目标_步数=200
应该呈现=假吗
每集打印\u=100
state\u holder=tf.placeholder(dtype=tf.float32,shape=(无,n\u obs),name='symbolic\u state')
操作\u one\u hot\u holder=tf.placeholder(dtype=tf.float32,shape=(无,n个操作),
name='symbolic\u actions\u one\u hot\u holder')
折扣奖励持有者=tf.placeholder(dtype=tf.float32,shape=None,name='symbolic\u奖励')
#动态初始化神经元列表
def get_NU list():
i=n_obs
n__列表=[i]
而我<(n_obs*n_actions)/(n_actions//2):
i*=2
n__列表。附加(i)
当i//2>n_操作时:
i=i//2
n__列表。附加(i)
n\u列表。追加(n\u操作)
#打印(n\u列表)
返回n\u列表
使用tf.name_scope(“非线性_策略”):
#创建具有大小的图层列表
n_neurons_list=get_neurons_list()
网络=无
对于范围内的i((len(n_列表)-1)):
θ=tf.变量(tf.随机正常([n_神经元列表[i],n_神经元列表[i+1]))
偏倚=tf.变量(tf.随机正常([n\u神经元\u列表[i+1]]))
如果网络为无:
网络=tf.matmul(状态保持器,θ)+偏差
其他:
网络=tf.matmul(网络,θ)+偏差
如果i 而第二集则是一个tensorflow程序的例子,它使用Q Learning来学习CartPole开放式健身房
它能够快速学会直立行走80步
代码如下:
输入数学
将numpy作为np导入
导入系统
随机输入
sys.path.append(“../gym”)
从gym.envs.classic_控件导入CartPoleEnv
env=CartPoleEnv()
以下是输出:
总结
我无法用一个简单的神经网络进行Q学习,从而能够解决CartPole问题,但可以用不同的神经网络大小和深度进行有趣的实验
希望你喜欢这段代码,
干杯这似乎是一个架构问题@jaaq,我想提出一个代码解决方案。为了确保我的解决方案符合您的要求,您能先回答我几个问题吗?你是从哪里或者怎样得出L_θ的方程式的。或者,您使用优化器优化的是什么?最后,你能接受一个小的架构更改(不更改完全连接的层)吗?当然,我不介意更改架构。L_theta代表损失,我使用了我在脚本中找到的公式:“Tensorflow中策略梯度的损失函数是:总和(对数概率)*总和(折扣的奖励总和)”我(希望)计算奖励并将其取反,告诉ADAM最小化(最大化我得到的分数)你发现错误了吗@wontonimoI已经下载了代码并在本地运行。我注意到您的代码没有将上一个操作作为提要,将其添加到提要作为NN的输入将有助于b/c的重要状态信息。我在试图查找您的错误时添加了它。此外,您已将gamma设置为接近1,根据,这是不稳定的。您的NN足够大,可以解决问题,因此它似乎是您的Q学习实现。我花了两个小时都没有成功。抱歉。查看您的代码,我实际上看不到Q-学习等式,即Q[状态][动作]+=学习率x(增量奖励+折扣x最大值-Q[状态][动作])。看起来你的代码做了Q[s][action]+=learning_rate x(奖励x折扣),这不是等式。
discount = 0.5
learning_rate = 0.5
gradient = .001
regularizaiton_factor = .1
import tensorflow as tf
tf_state = tf.placeholder( dtype=tf.float32 , shape=[4] )
tf_state_2d = tf.reshape( tf_state , [1,4] )
tf_action = tf.placeholder( dtype=tf.int32 )
tf_action_1hot = tf.reshape( tf.one_hot( tf_action , 2 ) , [1,2] )
tf_delta_reward = tf.placeholder( dtype=tf.float32 )
tf_value = tf.placeholder( dtype=tf.float32 )
tf_matrix1 = tf.Variable( tf.random_uniform([4,7], -.001, .001) )
tf_matrix2 = tf.Variable( tf.random_uniform([7,2], -.001, .001) )
tf_logits = tf.matmul( tf_state_2d , tf_matrix1 )
tf_logits = tf.matmul( tf_logits , tf_matrix2 )
tf_loss = -1 * learning_rate * ( tf_delta_reward + discount * tf_value - tf_logits ) * tf_action_1hot
tf_regularize = tf.reduce_mean( tf.square( tf_matrix1 )) + tf.reduce_mean( tf.square( tf_matrix2 ))
tf_train = tf.train.GradientDescentOptimizer(gradient).minimize( tf_loss + tf_regularize * regularizaiton_factor )
sess = tf.Session()
sess.run( tf.global_variables_initializer() )
def max_Q( state ) :
actions = sess.run( tf_logits, feed_dict={ tf_state:state } )
actions = actions[0]
value = actions.max()
action = 0 if actions[0] == value else 1
return action , value
avg_age = 0
for trial in range(1,101) :
# initialize state
previous_state = env.reset()
# initialize action and the value of the expected reward
action , value = max_Q(previous_state)
previous_reward = 0
for age in range(1,301) :
if trial % 100 == 0 :
env.render()
new_state, new_reward, done, info = env.step(action)
new_state = new_state
action, value = max_Q(new_state)
# The cart-pole gym doesn't return a reward of Zero when done.
if done :
new_reward = 0
delta_reward = new_reward - previous_reward
# learning phase
sess.run(tf_train, feed_dict={ tf_state:previous_state, tf_action:action, tf_delta_reward:delta_reward, tf_value:value })
previous_state = new_state
previous_reward = new_reward
if done :
break
avg_age = avg_age * 0.95 + age * .05
if trial % 50 == 0 :
print "Average age =",int(round(avg_age))," , trial",trial," , discount",discount," , learning_rate",learning_rate," , gradient",gradient
elif trial % 10 == 0 :
print int(round(avg_age)),
6 18 23 30 Average age = 36 , trial 50 , discount 0.5 , learning_rate 0.5 , gradient 0.001
38 47 50 53 Average age = 55 , trial 100 , discount 0.5 , learning_rate 0.5 , gradient 0.001