
Python: Block Gradients from Flowing in a Joint Loss

Tags: python, tensorflow, gradient-descent, backpropagation, loss-function

I have an input tensor

data = tf.placeholder(tf.int32, [None])

which is embedded via

embedding_matrix = tf.get_variable("embedding_matrix", [5,3], tf.float32, initializer=tf.random_normal_initializer())
input_vectors = tf.nn.embedding_lookup(params=embedding_matrix, ids=data)
I apply a linear transformation to the input vectors with output1_weights to obtain network_output1:

output1_weights = tf.get_variable("output1", [3,4], tf.float32, initializer=tf.random_normal_initializer())
network_output1 = tf.matmul(input_vectors, output1_weights)
The loss is completely standard (output1 here is the label placeholder, defined in the full script below):

loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=output1, logits=network_output1)
Now I want to use the logits network_output1 as the input to another linear transformation:

output2_weights = tf.get_variable("output2", [4,5], tf.float32, initializer=tf.random_normal_initializer())
network_output2 = tf.matmul(network_output1, output2_weights)
followed by a cross-entropy loss on the second output:

loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=output2, logits=network_output2)
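For reference, a minimal sketch of the naive joint objective (my addition — the mean reduction and the optimizer choice are assumptions, not from the question):

joint_loss = tf.reduce_mean(loss1) + tf.reduce_mean(loss2)
# minimizing this directly lets the gradients of loss2 flow back through
# network_output1 into output1_weights and embedding_matrix
naive_train_op = tf.train.AdamOptimizer(0.01).minimize(joint_loss)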

That flow is exactly what I want to prevent. In a joint-loss setting, when minimizing loss1 I only want gradients for output1_weights, and when minimizing loss2, only gradients for output2_weights. In other words, when optimizing loss2 I do not want the gradients to flow all the way back and tamper with output1_weights. I know the compute_gradients function of the optimizer classes accepts a var_list argument, but it does not seem able to block the gradient flow of the separate losses. I could also consider splitting the losses and minimizing them individually, but that would be a bad solution in my setting as well.

All you have to do is pick out the trainable variables each loss should update and pass them as its var_list.

First, take note of the trainable variables as each loss is built:

import numpy as np
import tensorflow as tf

data = tf.placeholder(tf.int32, [None])
output1 = tf.placeholder(tf.int32, [None])
output2 = tf.placeholder(tf.int32, [None])

embedding_matrix = tf.get_variable("embedding_matrix", [5,3], tf.float32, initializer=tf.random_normal_initializer())
input_vectors = tf.nn.embedding_lookup(params=embedding_matrix, ids=data)
# count: 1 trainable variable so far (embedding_matrix)
params_num0 = len(tf.trainable_variables())

output1_weights = tf.get_variable("output1", [3,4], tf.float32, initializer=tf.random_normal_initializer())
network_output1 = tf.matmul(input_vectors, output1_weights)
# count: 2 trainable variables so far (embedding_matrix, output1_weights)
params_num1 = len(tf.trainable_variables())
loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=output1, logits=network_output1)

output2_weights = tf.get_variable("output2", [4,5], tf.float32, initializer=tf.random_normal_initializer())
network_output2 = tf.matmul(network_output1, output2_weights)
loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=output2, logits=network_output2)
Then print the counts and all trainable variables:

params = tf.trainable_variables()
print(params_num0)
print(params_num1)
print(params)

# 1
# 2
# [<tf.Variable 'embedding_matrix:0' shape=(5, 3) dtype=float32_ref>, <tf.Variable 'output1:0' shape=(3, 4) dtype=float32_ref>, <tf.Variable 'output2:0' shape=(4, 5) dtype=float32_ref>]
Next, compute the gradients of each loss only with respect to its own variables:

# slice out the per-loss variable lists using the counts recorded above
params1 = params[params_num0:params_num1]  # [output1_weights]
params2 = params[params_num1:]             # [output2_weights]

opt = tf.train.AdamOptimizer(0.01)
grads_vars = opt.compute_gradients(loss1, var_list=params1)
grads_vars2 = opt.compute_gradients(loss2, var_list=params2)
print(grads_vars)
print(grads_vars2)

# [(<tf.Tensor 'gradients/MatMul_grad/tuple/control_dependency_1:0' shape=(3, 4) dtype=float32>, <tf.Variable 'output1:0' shape=(3, 4) dtype=float32_ref>)]
# [(<tf.Tensor 'gradients_1/MatMul_1_grad/tuple/control_dependency_1:0' shape=(4, 5) dtype=float32>, <tf.Variable 'output2:0' shape=(4, 5) dtype=float32_ref>)]
train_op = opt.apply_gradients(grads_vars + grads_vars2)
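Concatenating the two (gradient, variable) lists lets a single apply_gradients op step both losses at once; each variable appears in exactly one list, so each receives exactly one update per run. A roughly equivalent sketch (my assumption: two separate optimizer instances, so their Adam state stays independent) would be:

opt1 = tf.train.AdamOptimizer(0.01)
opt2 = tf.train.AdamOptimizer(0.01)
# each minimize op only touches its own var_list; tf.group runs both per step
train_op_alt = tf.group(opt1.minimize(loss1, var_list=params1),
                        opt2.minimize(loss2, var_list=params2))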
Now run a small experiment:

data_np = np.random.randint(0, 5, size=(100,))  # valid row ids for the (5, 3) embedding matrix
output1_np = np.random.randint(0,4,size=(100))
output2_np = np.random.randint(0,5,size=(100))
feed_dict_v = {data: data_np, output1: output1_np, output2: output2_np}

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for i in range(2):
        print("epoch:{}".format(i))
        sess.run(train_op, feed_dict=feed_dict_v)
        print("embedding_matrix value:\n",sess.run(embedding_matrix, feed_dict=feed_dict_v))
        print("output1_weights value:\n",sess.run(output1_weights, feed_dict=feed_dict_v))
        print("output2_weights value:\n",sess.run(output2_weights, feed_dict=feed_dict_v))
The result is:

epoch:0
embedding_matrix value:
 [[ 0.7646786  -0.44221798 -1.6374763 ]
 [-0.4061512  -0.70626575  0.09637168]
 [ 1.3499098   0.38479885 -0.10424987]
 [-1.3999717   0.67008936  1.8843309 ]
 [-0.11357951 -1.1893668   1.1205566 ]]
output1_weights value:
 [[-0.22709225  0.70598644  0.10429419 -2.2737694 ]
 [-0.6364337  -0.08602498  1.9750406   0.8664075 ]
 [ 0.3656631  -0.25182125 -0.14689662 -0.03764082]]
output2_weights value:
 [[ 0.00554644 -0.49370843 -0.75148153  0.6645286   1.0131303 ]
 [ 0.21612553  0.07851358  0.05937392 -0.3236267  -0.8081816 ]
 [ 0.82237226  0.17242427 -1.3059226  -1.1134574   0.22402465]
 [-1.6996336  -0.58993673 -0.7071007   0.8407903   0.62416744]]
epoch:1
embedding_matrix value:
 [[ 0.7646786  -0.44221798 -1.6374763 ]
 [-0.4061512  -0.70626575  0.09637168]
 [ 1.3499098   0.38479885 -0.10424987]
 [-1.3999717   0.67008936  1.8843309 ]
 [-0.11357951 -1.1893668   1.1205566 ]]
output1_weights value:
 [[-0.21710345  0.6959941   0.11408082 -2.2637703 ]
 [-0.64639646 -0.07603455  1.9650643   0.85640883]
 [ 0.35567763 -0.24182947 -0.15682784 -0.04763966]]
output2_weights value:
 [[ 0.01553426 -0.5036415  -0.7415529   0.65454334  1.003145  ]
 [ 0.20613036  0.08847766  0.04942677 -0.31363514 -0.7981894 ]
 [ 0.8323502   0.16245098 -1.2959852  -1.1234138   0.21408063]
 [-1.6896346  -0.59990865 -0.6971453   0.8307945   0.6141711 ]]
You can see that embedding_matrix never changes, and that output1_weights and output2_weights are each updated only by their own gradients.
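One point worth stressing (my addition): var_list does not block the gradient flow inside the graph, it only withholds the updates. A quick sanity-check sketch:

# the gradient of loss2 w.r.t. embedding_matrix still exists in the graph;
# it is simply never applied, because embedding_matrix is in neither var_list
g = tf.gradients(loss2, embedding_matrix)[0]
print(g is not None)  # True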

Addition

In fact, you can combine loss1 and loss2 over output2_weights. For example:

grads_vars3 = opt.compute_gradients(loss1+loss2,var_list=params2)
When loss1 and loss2 are combined by addition, you will find that grads_vars2 and grads_vars3 are equal. The reason is that the gradient of loss1 does not flow to output2_weights in loss1+loss2. But grads_vars2 and grads_vars3 are not equal when loss1 and loss2 are combined by multiplication:
grads_vars3 = opt.compute_gradients(loss1*loss2,var_list=params2)
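The product rule explains the difference: ∂(loss1·loss2)/∂output2_weights = loss1·∂loss2/∂output2_weights + loss2·∂loss1/∂output2_weights, and the second term vanishes because loss1 does not depend on output2_weights. What remains is the loss2 gradient scaled by the value of loss1, so grads_vars3 matches grads_vars2 only where loss1 happens to equal 1.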
This means we can combine the losses over the corresponding trainable variables however we need.

In your scenario, network_output2 needs to use network_output1, so we have to split the losses by var_list. If network_output2 did not depend on network_output1, we could directly optimize loss1+loss2.
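A related primitive worth mentioning (my addition; the answer above does not use it) is tf.stop_gradient, which blocks backpropagation through a tensor and would let you optimize a combined loss directly:

# feed the second layer the value of network_output1 but cut the backward path
network_output2 = tf.matmul(tf.stop_gradient(network_output1), output2_weights)
# now gradients of loss2 cannot reach output1_weights or embedding_matrix,
# so loss1 + loss2 can be minimized as a single objective

Note that, unlike the var_list approach above, loss1 would then still update embedding_matrix unless it too is excluded.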

Regarding the gradients, here is a minimal numeric demonstration:

input = tf.constant([[1, 2, 3]], tf.float32)
label1 = tf.constant([[1, 2, 3, 4]], tf.float32)
label2 = tf.constant([[1, 2, 3, 4, 5]], tf.float32)

weight1 = tf.reshape(tf.range(12, dtype=tf.float32), [3, 4])
output1 = tf.matmul(input, weight1)
loss1 = tf.reduce_sum(output1 - label1)

weight2 = tf.reshape(tf.range(20, dtype=tf.float32), [4, 5])
output2 = tf.matmul(output1, weight2)
loss2 = tf.reduce_sum(output2 - label2)

grad1 = tf.gradients(loss1, weight1)
grad2 = tf.gradients(loss2, weight2)
grad3 = tf.gradients(loss1 + loss2, weight2)

with tf.Session() as sess:
    print(sess.run(grad1))
    print(sess.run(grad2))
    print(sess.run(grad3))

# [array([[1., 1., 1., 1.],
#        [2., 2., 2., 2.],
#        [3., 3., 3., 3.]], dtype=float32)]
# [array([[32., 32., 32., 32., 32.],
#        [38., 38., 38., 38., 38.],
#        [44., 44., 44., 44., 44.],
#        [50., 50., 50., 50., 50.]], dtype=float32)]
# [array([[32., 32., 32., 32., 32.],
#        [38., 38., 38., 38., 38.],
#        [44., 44., 44., 44., 44.],
#        [50., 50., 50., 50., 50.]], dtype=float32)]
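As a check on these numbers: loss1 = Σ(input·weight1 − label1), so ∂loss1/∂weight1 = inputᵀ·1, i.e. each row of grad1 repeats the matching input component (1, 2, 3). Likewise ∂loss2/∂weight2 = output1ᵀ·1, and output1 = input·weight1 = [32, 38, 44, 50], which is exactly what the rows of grad2 (and grad3) show.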

Comment: I think I was quite clear that I want a joint loss, not separate losses — as I already said in my question, splitting the losses and minimizing them individually would also be a bad solution in my setting. By the way, when minimizing loss1, you don't need to specify the var_list.

Comment: @user1935724 You should first describe the mathematical relationship between the joint loss and loss1 and loss2. Addition or multiplication?

Comment: I think a joint loss means addition. Thank you very much for the update! Does this mean there is really no way to combine the losses first and then control how the gradients flow?

Comment: @user1935724 Please see the addition to my answer.