Python 3.x TensorFlow检查点恢复学习率

我正在尝试使用 TensorFlow 检查点,除了学习率之外,一切都很好。学习率在每次运行时都会重新初始化,并且不会从上一次运行恢复。

我正在尝试使用
TensorFlow检查点
,除了
学习率
之外,一切都很好。它在每次运行时都会重新初始化,并且不会从上一次运行恢复

下面是一个玩具示例,我正试图复制这个问题:

import numpy as np
import tensorflow as tf
  

# Toy linear data: targets are an exact multiple of the inputs (Y = 50 * X),
# so the single coefficient `beta` below should converge toward 50.
X = tf.range(10.)
Y = 50.*X

class CGMM(tf.Module):
    """Toy model that reproduces the reported issue.

    BUG (intentional — this is the question's reproduction): the
    ``global_step`` used by the exponential-decay schedule is a *local*
    variable of ``build_model``, so it is never tracked by the
    ``tf.train.Checkpoint`` below. After a restore, the schedule starts
    from step 1 again and the learning rate repeats its first-run values.
    """
    def __init__(self):
        super(CGMM, self).__init__(name='CGMM')
        # Trainable coefficient being fitted (target relationship: Y = beta * X).
        self.beta =  tf.Variable(1. , dtype=np.float32)
        # NOTE(review): immediately shadowed in build_model by the decay
        # schedule object, so checkpointing this variable achieves nothing.
        self.learning_rate =  tf.Variable(1. , dtype=np.float32)

    @tf.function
    def objfun(self):
        """Mean-squared-error objective between beta*X and Y."""
        beta = self.beta
        obj = tf.reduce_mean(tf.square(beta*self.X - self.Y))
        return obj

    def build_model(self,X,Y,decay_steps):
        """Run 500 RMSProp steps, saving a checkpoint every 100 steps.

        Returns the fitted ``beta`` variable.
        """
        self.X,self.Y=X,Y
        starter_learning_rate = 0.05
        # Local variable: NOT an attribute of this module, hence not reached
        # by the checkpoint's object graph -> decay restarts on every run.
        global_step = tf.Variable(1, trainable=False)
        self.learning_rate = tf.compat.v1.train.exponential_decay(starter_learning_rate, global_step,decay_steps, 0.96, staircase=True)

        optimizer = tf.compat.v1.train.RMSPropOptimizer(self.learning_rate)

        # `model=self` tracks self.beta (and would track a global-step
        # attribute, if one existed); `optimizer` tracks slot variables.
        ckpt = tf.train.Checkpoint(step=tf.Variable(1) ,model=self, optimizer=optimizer)
        manager = tf.train.CheckpointManager(ckpt, './tf_ckpts_cg', max_to_keep=3)

        # Restoring a non-existent checkpoint is a no-op (first run).
        ckpt.restore(manager.latest_checkpoint)
        if manager.latest_checkpoint:
            print("Restored from {}".format(manager.latest_checkpoint))
        else:
            print("Initializing from scratch.")

        for i in range(500):
            # minimize() increments global_step; since it was recreated at 1
            # above, the decayed learning rate repeats across runs.
            optimizer.minimize(self.objfun, global_step=global_step,  var_list =  [self.beta])
            loss, beta, learning_rate = self.objfun(), self.beta, self.learning_rate().numpy()

            ckpt.step.assign_add(1)
            if (int(ckpt.step)-1) % 100 == 0:
                save_path = manager.save()
                print("Saved checkpoint for step {}: {}".format(int(ckpt.step), save_path))
                print("learning_rate : " + str(learning_rate))

        return beta


# Driver for the buggy version: on re-runs, beta and the optimizer state are
# restored from the checkpoint, but the learning-rate decay restarts from
# scratch — the behavior the question is asking about.
model = CGMM()
opt_beta = model.build_model(X,Y,100)
第一次运行的结果:

Initializing from scratch.
Saved checkpoint for step 101: ./tf_ckpts_cg/ckpt-1
learning_rate : 0.048
Saved checkpoint for step 201: ./tf_ckpts_cg/ckpt-2
learning_rate : 0.04608
Saved checkpoint for step 301: ./tf_ckpts_cg/ckpt-3
learning_rate : 0.044236798
Saved checkpoint for step 401: ./tf_ckpts_cg/ckpt-4
learning_rate : 0.042467322
Saved checkpoint for step 501: ./tf_ckpts_cg/ckpt-5
learning_rate : 0.04076863
第二轮结果:

Restored from ./tf_ckpts_cg/ckpt-5
Saved checkpoint for step 601: ./tf_ckpts_cg/ckpt-6
learning_rate : 0.048
Saved checkpoint for step 701: ./tf_ckpts_cg/ckpt-7
learning_rate : 0.04608
Saved checkpoint for step 801: ./tf_ckpts_cg/ckpt-8
learning_rate : 0.044236798
Saved checkpoint for step 901: ./tf_ckpts_cg/ckpt-9
learning_rate : 0.042467322
Saved checkpoint for step 1001: ./tf_ckpts_cg/ckpt-10
learning_rate : 0.04076863

正如您所见,在两次运行中重复相同的
学习率
,但其他变量运行良好。你能帮我解决这个问题吗?

我找到了解决办法,打算把它记录在这篇帖子里,也许将来对其他人会有帮助

必须添加
self.global_step = tf.Variable(1, trainable=False)

这里是完整的脚本

import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt


X = tf.range(10.)
Y = 50.*X

class CGMM(tf.Module):
    """Fixed version: the decay step survives checkpoint save/restore.

    ``global_step`` is held as a module attribute, so ``model=self`` in the
    ``tf.train.Checkpoint`` tracks it and the exponential-decay schedule
    resumes from the saved step instead of restarting at 1.
    """
    def __init__(self):
        super(CGMM, self).__init__(name='CGMM')
        # Trainable coefficient being fitted (target relationship: Y = beta * X).
        self.beta =  tf.Variable(1. , dtype=np.float32)
        # Tracked by the checkpoint via `model=self` -> decay state persists.
        self.global_step = tf.Variable(1, trainable=False)

    @tf.function
    def objfun(self):
        """Mean-squared-error objective between beta*X and Y."""
        beta = self.beta
        obj = tf.reduce_mean(tf.square(beta*self.X - self.Y))

        return obj

    def build_model(self,X,Y,decay_steps):
        """Run 500 RMSProp steps, saving a checkpoint every 100 steps.

        Args:
            X, Y: input / target tensors stored on the instance for objfun.
            decay_steps: staircase interval of the exponential decay.

        Returns:
            (loss_vec, beta, lr_vector): per-step loss history, the fitted
            beta variable, and the per-step learning-rate history.
        """
        self.X,self.Y=X,Y
        starter_learning_rate = 0.05

        # Schedule reads self.global_step, which is checkpointed.
        learning_rate = tf.compat.v1.train.exponential_decay(starter_learning_rate, self.global_step,decay_steps, 0.96, staircase=True)

        optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate)

        ckpt = tf.train.Checkpoint(step=tf.Variable(1) ,model=self, optimizer=optimizer)
        manager = tf.train.CheckpointManager(ckpt, './tf_ckpts_cg', max_to_keep=3)

        # Restoring a non-existent checkpoint is a no-op (first run).
        ckpt.restore(manager.latest_checkpoint)
        if manager.latest_checkpoint:
            print("Restored from {}".format(manager.latest_checkpoint))
        else:
            print("Initializing from scratch.")

        # FIX: these histories were referenced but never initialized in the
        # original, raising NameError on the first loop iteration / return.
        loss_vec = []
        lr_vector = []

        for i in range(500):
            optimizer.minimize(self.objfun, global_step=self.global_step,  var_list =  [self.beta])
            loss, beta = self.objfun(), self.beta
            # FIX: record the loss so the returned loss_vec is populated.
            loss_vec.append(loss.numpy())

            ckpt.step.assign_add(1)
            lr_vector.append(learning_rate().numpy())
            if (int(ckpt.step)-1) % 100 == 0:
                save_path = manager.save()
                print("Saved checkpoint for step {}: {}".format(int(ckpt.step), save_path))
                print("learning_rate : " + str(learning_rate().numpy()))

        return loss_vec,beta,lr_vector

# Driver for the corrected version: on re-runs the restored global_step lets
# the learning-rate decay continue from where the previous run stopped.
model = CGMM()
loss_vec, opt_beta,lr_vector = model.build_model(X,Y,100)