TensorFlow eager execution not working with learning rate decay


Trying to get an eager-execution model working with learning-rate decay here, but with no success. It looks like a bug to me, since the learning-rate decay tensor never seems to get updated. Could you give me a hand if I'm missing something? Thanks.

The code below learns some word embeddings. However, the learning-rate decay part does not work at all.

import os

import tensorflow as tf
import tensorflow.contrib.eager as tfe

import word2vec_utils

tf.enable_eager_execution()

# Hyperparameter constants (VOCAB_SIZE, EMBED_SIZE, BATCH_SIZE, SKIP_WINDOW,
# NUM_SAMPLED, NUM_TRAIN_STEPS, SKIP_STEP, DOWNLOAD_URL, EXPECTED_BYTES,
# VISUAL_FLD) are defined elsewhere in the project and omitted here.


class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embed_size, num_sampled=NUM_SAMPLED):
        super(Word2Vec, self).__init__()  # required when subclassing tf.keras.Model
        self.vocab_size = vocab_size
        self.num_sampled = num_sampled
        self.embed_matrix = tfe.Variable(tf.random_uniform(
            [vocab_size, embed_size]), name="embedding_matrix")
        self.nce_weight = tfe.Variable(tf.truncated_normal(
            [vocab_size, embed_size],
            stddev=1.0 / (embed_size ** 0.5)), name="weights")
        self.nce_bias = tfe.Variable(tf.zeros([vocab_size]), name="biases")

    def compute_loss(self, center_words, target_words):
        """Computes the forward pass of word2vec with the NCE loss."""
        embed = tf.nn.embedding_lookup(self.embed_matrix, center_words)
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=self.nce_weight,
                                             biases=self.nce_bias,
                                             labels=target_words,
                                             inputs=embed,
                                             num_sampled=self.num_sampled,
                                             num_classes=self.vocab_size))
        return loss


def gen():
    yield from word2vec_utils.batch_gen(DOWNLOAD_URL, EXPECTED_BYTES,
                                        VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW,
                                        VISUAL_FLD)


def main():
    dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32),
                                             (tf.TensorShape([BATCH_SIZE]),
                                              tf.TensorShape([BATCH_SIZE, 1])))

    global_step = tf.train.get_or_create_global_step()
    starter_learning_rate = 1.0
    end_learning_rate = 0.01
    decay_steps = 1000
    learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step.numpy(),
                                              decay_steps, end_learning_rate,
                                              power=0.5)

    train_writer = tf.contrib.summary.create_file_writer('./checkpoints')
    train_writer.set_as_default()

    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.95)
    model = Word2Vec(vocab_size=VOCAB_SIZE, embed_size=EMBED_SIZE)
    grad_fn = tfe.implicit_value_and_gradients(model.compute_loss)
    total_loss = 0.0  # for average loss in the last SKIP_STEP steps

    checkpoint_dir = "./checkpoints/"
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    root = tfe.Checkpoint(optimizer=optimizer,
                          model=model,
                          optimizer_step=tf.train.get_or_create_global_step())

    while global_step < NUM_TRAIN_STEPS:

        for center_words, target_words in tfe.Iterator(dataset):

            with tf.contrib.summary.record_summaries_every_n_global_steps(100):

                if global_step >= NUM_TRAIN_STEPS:
                    break

                loss_batch, grads = grad_fn(center_words, target_words)
                tf.contrib.summary.scalar('loss', loss_batch)
                tf.contrib.summary.scalar('learning_rate', learning_rate)

                # print(grads)
                # print(len(grads))
                total_loss += loss_batch
                optimizer.apply_gradients(grads, global_step)
                if (global_step.numpy() + 1) % SKIP_STEP == 0:
                    print('Average loss at step {}: {:5.1f}'.format(
                        global_step.numpy(), total_loss / SKIP_STEP))
                    total_loss = 0.0

        root.save(file_prefix=checkpoint_prefix)

if __name__ == '__main__':
    main()

Note that when eager execution is enabled, tf.Tensor objects are values (as opposed to symbolic handles to computation that will occur in Session.run() calls).

Therefore, in the snippet above, the line:

learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step.numpy(),
                                          decay_steps, end_learning_rate,
                                          power=0.5)
computes the decayed value exactly once, using the value of global_step at the time of the call, and when the optimizer is created with:

optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.95)
it is using a fixed learning rate.
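
You can see this directly with a minimal standalone sketch (assuming TF 1.x with eager execution enabled; the constants mirror the ones in the question). The decayed value is computed once, and it does not change as global_step advances:

import tensorflow as tf

tf.enable_eager_execution()

global_step = tf.train.get_or_create_global_step()
learning_rate = tf.train.polynomial_decay(1.0, global_step.numpy(), 1000,
                                          0.01, power=0.5)
print(learning_rate)   # a concrete tf.Tensor holding 1.0, evaluated right now

global_step.assign_add(500)
print(learning_rate)   # still 1.0 -- the decay is never recomputed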

To decay the learning rate, you need to repeatedly invoke tf.train.polynomial_decay (with an updated value of global_step). One way to do this would be something like:

starter_learning_rate = 1.0
learning_rate = tfe.Variable(starter_learning_rate)
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.95)
while global_step < NUM_TRAIN_STEPS:
   # ....
   learning_rate.assign(tf.train.polynomial_decay(starter_learning_rate, global_step, decay_steps, end_learning_rate, power=0.5))
This way, you've captured the learning_rate in a variable that can be updated. Furthermore, it's also simple to include the current learning_rate in checkpoints (by including it when creating the Checkpoint object).
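
For example, a minimal sketch using the same tfe / TF 1.x setup as the snippets above (the learning_rate keyword name below is arbitrary, any name works):

learning_rate = tfe.Variable(starter_learning_rate, name="learning_rate")
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.95)

# Passing learning_rate here means its current value is saved and restored
# together with the model weights and the optimizer slots.
root = tfe.Checkpoint(optimizer=optimizer,
                      model=model,
                      learning_rate=learning_rate,
                      optimizer_step=tf.train.get_or_create_global_step())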


Hope that helps.

Yes, that helped a lot. Thanks.

@ash, is there a way to do this for TF Estimators, where we don't access the training loop directly?

For eager mode in TF 2 this solution does not work. The correct solution is to make the learning rate a callable. You can confirm that the variable approach does not work by instantiating an optimizer with a variable learning rate, changing the value of the learning rate, and then calling the optimizer's get_config() method. The callable approach, on the other hand, does update the optimizer's learning rate.
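
For completeness, here is a hedged sketch of the callable approach mentioned in that last comment, assuming TF 2.x and a tf.keras optimizer (the schedule parameters are just the ones from the question):

import tensorflow as tf

# Option 1: pass a LearningRateSchedule (itself a callable of the step).
# The optimizer re-evaluates it on every update, so the decay happens automatically.
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0, decay_steps=1000,
    end_learning_rate=0.01, power=0.5)
optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule, momentum=0.95)

# Option 2: pass a zero-argument callable wrapping a variable you update yourself.
lr_var = tf.Variable(1.0)
optimizer2 = tf.keras.optimizers.SGD(learning_rate=lambda: lr_var, momentum=0.95)
# ... inside the training loop:  lr_var.assign(new_value)

Option 1 is the standard TF 2.x idiom; Option 2 mirrors the tfe.Variable approach from the answer, but with the callable indirection that TF 2 optimizers need in order to pick up later changes to the variable.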