Python ValueError:尝试在tf.distribute.Strategy的作用域下创建优化器插槽变量

Python ValueError:尝试在tf.distribute.Strategy的作用域下创建优化器插槽变量,python,tensorflow,keras,kaggle,tpu,Python,Tensorflow,Keras,Kaggle,Tpu,请查找下面的代码,将图像分类为2类,我正在尝试使用Kaggle TPU执行。你能帮我解决这个问题吗? 为了使用GPU,我遵循了Kaggle网站的指南,但仍然没有运气 下面是代码生成的错误堆栈 import tensorflow as tf tpu = tf.distribute.cluster_resolver.TPUClusterResolver() tf.config.experimental_connect_to_cluster(tpu) tf.tpu.experimental.initi

请查找下面的代码,将图像分类为2类,我正在尝试使用Kaggle TPU执行。你能帮我解决这个问题吗? 为了使用GPU,我遵循了Kaggle网站的指南,但仍然没有运气

下面是代码生成的错误堆栈

import tensorflow as tf
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
print(tpu_strategy)
# save the final model to file
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.layers import Dense
from keras.layers import Flatten
from keras.preprocessing.image import ImageDataGenerator

# define cnn model
def define_model():
    with tpu_strategy.scope():
        # load model
        model = VGG16(include_top=False, input_shape=(224, 224, 3))
        # mark loaded layers as not trainable
        for layer in model.layers:
            layer.trainable = False
        # add new classifier layers
        flat1 = Flatten()(model.layers[-1].output)
        class1 = Dense(128, activation='relu', kernel_initializer='he_uniform')(flat1)
        output = Dense(1, activation='sigmoid')(class1)
        # define new model
        model = Model(inputs=model.inputs, outputs=output)
        # compile model
        model.compile(optimizer=tf.keras.optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True), 
                      loss='binary_crossentropy', metrics=['accuracy'])
        return model

# run the test harness for evaluating a model
def train():
    # define model
    model = define_model()
    # create data generator
    datagen = ImageDataGenerator(featurewise_center=True)
    # specify imagenet mean values for centering
    datagen.mean = [123.68, 116.779, 103.939]
    # prepare iterator
    train_it = datagen.flow_from_directory('/kaggle/working/train/',
                                           class_mode='binary', batch_size=64, target_size=(224, 224))
    # fit model
    model.fit_generator(train_it, steps_per_epoch=len(train_it), epochs=10, verbose=0)
    # save model
    model.save('final_model.h5')

# entry point, run the test harness
train()
错误堆栈:

Found 25000 images belonging to 2 classes.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-13-b7b93eb12fab> in <module>
     42 
     43 # entry point, run the test harness
---> 44 train()

<ipython-input-13-b7b93eb12fab> in train()
     37                                            class_mode='binary', batch_size=64, target_size=(224, 224))
     38     # fit model
---> 39     model.fit_generator(train_it, steps_per_epoch=len(train_it), epochs=10, verbose=0)
     40     # save model
     41     model.save('final_model.h5')

/opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
     89                 warnings.warn('Update your `' + object_name + '` call to the ' +
     90                               'Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
     92         wrapper._original_function = func
     93         return wrapper

/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
   1730             use_multiprocessing=use_multiprocessing,
   1731             shuffle=shuffle,
-> 1732             initial_epoch=initial_epoch)
   1733 
   1734     @interfaces.legacy_generator_methods_support

/opt/conda/lib/python3.7/site-packages/keras/engine/training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
     40 
     41     do_validation = bool(validation_data)
---> 42     model._make_train_function()
     43     if do_validation:
     44         model._make_test_function()

/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in _make_train_function(self)
    314                     training_updates = self.optimizer.get_updates(
    315                         params=self._collected_trainable_weights,
--> 316                         loss=self.total_loss)
    317                 updates = self.updates + training_updates
    318 

/opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
     89                 warnings.warn('Update your `' + object_name + '` call to the ' +
     90                               'Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
     92         wrapper._original_function = func
     93         return wrapper

/opt/conda/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py in symbolic_fn_wrapper(*args, **kwargs)
     73         if _SYMBOLIC_SCOPE.value:
     74             with get_graph().as_default():
---> 75                 return func(*args, **kwargs)
     76         else:
     77             return func(*args, **kwargs)

/opt/conda/lib/python3.7/site-packages/keras/optimizers.py in get_updates(self, loss, params)
    760     def get_updates(self, loss, params):
    761         if isinstance(self.optimizer, tf.keras.optimizers.Optimizer):
--> 762             return self.optimizer.get_updates(loss, params)
    763         else:
    764             grads = self.optimizer.compute_gradients(loss, var_list=params)

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in get_updates(self, loss, params)
    507         if g is not None and v.dtype != dtypes.resource
    508     ])
--> 509     return [self.apply_gradients(grads_and_vars)]
    510 
    511   def _set_hyper(self, name, value):

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in apply_gradients(self, grads_and_vars, name)
    432         _ = self.iterations
    433         self._create_hypers()
--> 434         self._create_slots(var_list)
    435 
    436       if not grads_and_vars:

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/gradient_descent.py in _create_slots(self, var_list)
     98     if self._momentum:
     99       for var in var_list:
--> 100         self.add_slot(var, "momentum")
    101 
    102   def _prepare_local(self, var_device, var_dtype, apply_state):

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in add_slot(self, var, slot_name, initializer)
    590             "variables are created under the same strategy scope. This may "
    591             "happen if you're restoring from a checkpoint outside the scope"
--> 592             .format(strategy, var))
    593 
    594       with strategy.extended.colocate_vars_with(var):

ValueError: Trying to create optimizer slot variable under the scope for tf.distribute.Strategy (<tensorflow.python.distribute.distribute_lib._DefaultDistributionStrategy object at 0x7f3e60656050>), which is different from the scope used for the original variable (TPUMirroredVariable:{
  0 /job:worker/replica:0/task:0/device:TPU:0: <tf.Variable 'dense_17/kernel:0' shape=(25088, 128) dtype=float32, numpy=
array([[-0.01303554, -0.00389335,  0.00986688, ...,  0.01155722,
        -0.01016544,  0.00725855],
       [ 0.00431045, -0.01050912, -0.00490532, ..., -0.00075989,
         0.00883604,  0.01337762],
       [ 0.00195401,  0.01383564,  0.01248195, ..., -0.01159664,
         0.01150718,  0.00515156],
       ...,
       [-0.00288643,  0.00719406, -0.01046378, ...,  0.00476515,
         0.00593644,  0.00830421],
       [ 0.01492004, -0.00584323,  0.00321727, ..., -0.00236337,
        -0.01540608,  0.01260902],
       [-0.01198301,  0.00917004,  0.0068699 , ..., -0.00365373,
        -0.00087463,  0.01179958]], dtype=float32)>,
  1 /job:worker/replica:0/task:0/device:TPU:1: <tf.Variable 'dense_17/kernel/replica_1:0' shape=(25088, 128) dtype=float32, numpy=
array([[-0.01303554, -0.00389335,  0.00986688, ...,  0.01155722,
        -0.01016544,  0.00725855],
       [ 0.00431045, -0.01050912, -0.00490532, ..., -0.00075989,
         0.00883604,  0.01337762],
       [ 0.00195401,  0.01383564,  0.01248195, ..., -0.01159664,
         0.01150718,  0.00515156],
       ...,
       [-0.00288643,  0.00719406, -0.01046378, ...,  0.00476515,
         0.00593644,  0.00830421],
       [ 0.01492004, -0.00584323,  0.00321727, ..., -0.00236337,
        -0.01540608,  0.01260902],
       [-0.01198301,  0.00917004,  0.0068699 , ..., -0.00365373,
        -0.00087463,  0.01179958]], dtype=float32)>,
  7 /job:worker/replica:0/task:0/device:TPU:7: <tf.Variable 'dense_17/kernel/replica_7:0' shape=(25088, 128) dtype=float32, numpy=
array([[-0.01303554, -0.00389335,  0.00986688, ...,  0.01155722,
        -0.01016544,  0.00725855],
       [ 0.00431045, -0.01050912, -0.00490532, ..., -0.00075989,
         0.00883604,  0.01337762],
       [ 0.00195401,  0.01383564,  0.01248195, ..., -0.01159664,
         0.01150718,  0.00515156],
       ...,
       [-0.00288643,  0.00719406, -0.01046378, ...,  0.00476515,
         0.00593644,  0.00830421],
       [ 0.01492004, -0.00584323,  0.00321727, ..., -0.00236337,
        -0.01540608,  0.01260902],
       [-0.01198301,  0.00917004,  0.0068699 , ..., -0.00365373,
        -0.00087463,  0.01179958]], dtype=float32)>
}). Make sure the slot variables are created under the same strategy scope. This may happen if you're restoring from a checkpoint outside the scope
找到了25000张属于2类的图像。
---------------------------------------------------------------------------
ValueError回溯(最近一次调用上次)
在里面
42
43#进入点,运行测试线束
--->44列车()
列车上
37 class_mode='binary',batch_size=64,target_size=(224224))
38#适合模型
--->39型号。安装发电机(列车运行时间,每个历元的步数=len(列车运行时间),历元数=10,详细度=0)
40#保存模型
41型号保存(“最终型号h5”)
/包装中的opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py(*args,**kwargs)
89 warnings.warn('Update your`'+object\u name+'`调用+
90'Keras 2 API:'+签名,堆栈级别=2)
--->91返回函数(*args,**kwargs)
92包装器._原始函数=func
93返回包装器
/opt/conda/lib/python3.7/site-packages/keras/engine/training.py-in-fit\u生成器(self、生成器、每个历元的步骤、历元、冗余、回调、验证数据、验证步骤、验证频率、类权重、最大队列大小、工作者、使用多处理、无序、初始历元)
1730使用多处理=使用多处理,
1731洗牌=洗牌,
->1732初始_历元=初始_历元)
1733
1734@interfaces.legacy\u生成器\u方法\u支持
/opt/conda/lib/python3.7/site-packages/keras/engine/training\u generator.py in-fit\u generator(模型、生成器、每个历元的步骤、历元、冗余、回调、验证数据、验证步骤、验证频率、类权重、最大队列大小、工人、使用多处理、无序、初始历元)
40
41 do\U验证=bool(验证数据)
--->42型号.\u制造\u训练\u功能()
43如果进行验证:
44型号。_制造_测试_功能()
/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in_make_train_函数(self)
314培训\u更新=self.optimizer.get\u更新(
315参数=自身收集的可训练重量,
-->316损耗=自身总损耗)
317更新=自我更新+培训更新
318
/包装中的opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py(*args,**kwargs)
89 warnings.warn('Update your`'+object\u name+'`调用+
90'Keras 2 API:'+签名,堆栈级别=2)
--->91返回函数(*args,**kwargs)
92包装器._原始函数=func
93返回包装器
/符号包装中的opt/conda/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py(*args,**kwargs)
73如果符号范围值:
74带有get_graph()。作为_default():
--->75返回函数(*args,**kwargs)
76.其他:
77返回函数(*args,**kwargs)
/获取更新中的opt/conda/lib/python3.7/site-packages/keras/optimizers.py(self、loss、params)
760 def get_更新(自我、丢失、参数):
761如果存在(self.optimizer、tf.keras.optimizer.optimizer):
-->762返回self.optimizer.get_更新(丢失,参数)
763其他:
764梯度=self.optimizer.compute_梯度(损失,变量列表=参数)
/get_更新中的opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py(self、loss、params)
507如果g不是None且v.dtype!=数据类型资源
508     ])
-->509返回[自应用梯度(梯度和变量)]
510
511定义集超(自身、名称、值):
/应用梯度中的opt/conda/lib/python3.7/site-packages/tensorflow\u core/python/keras/optimizer\u v2/optimizer\u v2.py(self、grads\u和_vars、name)
432=自迭代
433 self.\u create\u hypers()
-->434自创建插槽(变量列表)
435
436如果不是年级和变量:
/opt/conda/lib/python3.7/site-packages/tensorflow\u core/python/keras/optimizer\u v2/gradient\u dence.py in\u create\u slot(self,var\u list)
98如果自动量:
99对于var_列表中的var:
-->100自加槽(变量,“动量”)
101
102定义准备本地(自身、变量设备、变量数据类型、应用状态):
/添加插槽中的opt/conda/lib/python3.7/site-packages/tensorflow\u core/python/keras/optimizer\u v2/optimizer\u v2.py(self、var、插槽名称、初始值设定项)
590“在同一策略范围下创建变量。这可能”
591“如果从范围外的检查点进行恢复,则会发生此情况”
-->592.格式(策略、风险值))
593
594带有策略。扩展。将变量与(var)合并:
ValueError:尝试在tf.distribute.Strategy()的作用域下创建优化器插槽变量,该变量与原始变量(TPUMirroredVariable:{
0/作业:工作/副本:0/任务:0/设备:TPU:0:,
1/作业:工作/副本:0/任务:0/设备:TPU:1:,
7/作业:工作/副本:0/任务:0/设备:TPU:7:
}). 确保在相同的策略范围下创建插槽变量。如果从范围外的检查点进行恢复,则可能会发生这种情况

错误看起来有点像这里描述的错误:

查看您的错误跟踪,它(已编辑)正在抱怨:

ValueError:尝试在作用域下创建优化器插槽变量 对于tf.distribute.Strategy,这与
def define_model():
    with tpu_strategy.scope():
        # load model
        model = VGG16(include_top=False, input_shape=(224, 224, 3))
        # mark loaded layers as not trainable
        for layer in model.layers:
            layer.trainable = False
        # add new classifier layers
        flat1 = Flatten()(model.layers[-1].output)
        class1 = Dense(128, activation='relu', kernel_initializer='he_uniform')(flat1)
        output = Dense(1, activation='sigmoid')(class1)
        # define new model
        model = Model(inputs=model.inputs, outputs=output)
        # compile model
    model.compile(optimizer=tf.keras.optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True), 
                      loss='binary_crossentropy', metrics=['accuracy'])
   return model