Python ValueError:尝试在tf.distribute.Strategy的作用域下创建优化器插槽变量
请查找下面的代码,将图像分类为2类,我正在尝试使用Kaggle TPU执行。你能帮我解决这个问题吗? 为了使用GPU,我遵循了Kaggle网站的指南,但仍然没有运气 下面是代码生成的错误堆栈Python ValueError:尝试在tf.distribute.Strategy的作用域下创建优化器插槽变量,python,tensorflow,keras,kaggle,tpu,Python,Tensorflow,Keras,Kaggle,Tpu,请查找下面的代码,将图像分类为2类,我正在尝试使用Kaggle TPU执行。你能帮我解决这个问题吗? 为了使用GPU,我遵循了Kaggle网站的指南,但仍然没有运气 下面是代码生成的错误堆栈 import tensorflow as tf tpu = tf.distribute.cluster_resolver.TPUClusterResolver() tf.config.experimental_connect_to_cluster(tpu) tf.tpu.experimental.initi
import tensorflow as tf
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
print(tpu_strategy)
# save the final model to file
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.layers import Dense
from keras.layers import Flatten
from keras.preprocessing.image import ImageDataGenerator
# define cnn model
def define_model():
with tpu_strategy.scope():
# load model
model = VGG16(include_top=False, input_shape=(224, 224, 3))
# mark loaded layers as not trainable
for layer in model.layers:
layer.trainable = False
# add new classifier layers
flat1 = Flatten()(model.layers[-1].output)
class1 = Dense(128, activation='relu', kernel_initializer='he_uniform')(flat1)
output = Dense(1, activation='sigmoid')(class1)
# define new model
model = Model(inputs=model.inputs, outputs=output)
# compile model
model.compile(optimizer=tf.keras.optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True),
loss='binary_crossentropy', metrics=['accuracy'])
return model
# run the test harness for evaluating a model
def train():
# define model
model = define_model()
# create data generator
datagen = ImageDataGenerator(featurewise_center=True)
# specify imagenet mean values for centering
datagen.mean = [123.68, 116.779, 103.939]
# prepare iterator
train_it = datagen.flow_from_directory('/kaggle/working/train/',
class_mode='binary', batch_size=64, target_size=(224, 224))
# fit model
model.fit_generator(train_it, steps_per_epoch=len(train_it), epochs=10, verbose=0)
# save model
model.save('final_model.h5')
# entry point, run the test harness
train()
错误堆栈:
Found 25000 images belonging to 2 classes.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-13-b7b93eb12fab> in <module>
42
43 # entry point, run the test harness
---> 44 train()
<ipython-input-13-b7b93eb12fab> in train()
37 class_mode='binary', batch_size=64, target_size=(224, 224))
38 # fit model
---> 39 model.fit_generator(train_it, steps_per_epoch=len(train_it), epochs=10, verbose=0)
40 # save model
41 model.save('final_model.h5')
/opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
89 warnings.warn('Update your `' + object_name + '` call to the ' +
90 'Keras 2 API: ' + signature, stacklevel=2)
---> 91 return func(*args, **kwargs)
92 wrapper._original_function = func
93 return wrapper
/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1730 use_multiprocessing=use_multiprocessing,
1731 shuffle=shuffle,
-> 1732 initial_epoch=initial_epoch)
1733
1734 @interfaces.legacy_generator_methods_support
/opt/conda/lib/python3.7/site-packages/keras/engine/training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
40
41 do_validation = bool(validation_data)
---> 42 model._make_train_function()
43 if do_validation:
44 model._make_test_function()
/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in _make_train_function(self)
314 training_updates = self.optimizer.get_updates(
315 params=self._collected_trainable_weights,
--> 316 loss=self.total_loss)
317 updates = self.updates + training_updates
318
/opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
89 warnings.warn('Update your `' + object_name + '` call to the ' +
90 'Keras 2 API: ' + signature, stacklevel=2)
---> 91 return func(*args, **kwargs)
92 wrapper._original_function = func
93 return wrapper
/opt/conda/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py in symbolic_fn_wrapper(*args, **kwargs)
73 if _SYMBOLIC_SCOPE.value:
74 with get_graph().as_default():
---> 75 return func(*args, **kwargs)
76 else:
77 return func(*args, **kwargs)
/opt/conda/lib/python3.7/site-packages/keras/optimizers.py in get_updates(self, loss, params)
760 def get_updates(self, loss, params):
761 if isinstance(self.optimizer, tf.keras.optimizers.Optimizer):
--> 762 return self.optimizer.get_updates(loss, params)
763 else:
764 grads = self.optimizer.compute_gradients(loss, var_list=params)
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in get_updates(self, loss, params)
507 if g is not None and v.dtype != dtypes.resource
508 ])
--> 509 return [self.apply_gradients(grads_and_vars)]
510
511 def _set_hyper(self, name, value):
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in apply_gradients(self, grads_and_vars, name)
432 _ = self.iterations
433 self._create_hypers()
--> 434 self._create_slots(var_list)
435
436 if not grads_and_vars:
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/gradient_descent.py in _create_slots(self, var_list)
98 if self._momentum:
99 for var in var_list:
--> 100 self.add_slot(var, "momentum")
101
102 def _prepare_local(self, var_device, var_dtype, apply_state):
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in add_slot(self, var, slot_name, initializer)
590 "variables are created under the same strategy scope. This may "
591 "happen if you're restoring from a checkpoint outside the scope"
--> 592 .format(strategy, var))
593
594 with strategy.extended.colocate_vars_with(var):
ValueError: Trying to create optimizer slot variable under the scope for tf.distribute.Strategy (<tensorflow.python.distribute.distribute_lib._DefaultDistributionStrategy object at 0x7f3e60656050>), which is different from the scope used for the original variable (TPUMirroredVariable:{
0 /job:worker/replica:0/task:0/device:TPU:0: <tf.Variable 'dense_17/kernel:0' shape=(25088, 128) dtype=float32, numpy=
array([[-0.01303554, -0.00389335, 0.00986688, ..., 0.01155722,
-0.01016544, 0.00725855],
[ 0.00431045, -0.01050912, -0.00490532, ..., -0.00075989,
0.00883604, 0.01337762],
[ 0.00195401, 0.01383564, 0.01248195, ..., -0.01159664,
0.01150718, 0.00515156],
...,
[-0.00288643, 0.00719406, -0.01046378, ..., 0.00476515,
0.00593644, 0.00830421],
[ 0.01492004, -0.00584323, 0.00321727, ..., -0.00236337,
-0.01540608, 0.01260902],
[-0.01198301, 0.00917004, 0.0068699 , ..., -0.00365373,
-0.00087463, 0.01179958]], dtype=float32)>,
1 /job:worker/replica:0/task:0/device:TPU:1: <tf.Variable 'dense_17/kernel/replica_1:0' shape=(25088, 128) dtype=float32, numpy=
array([[-0.01303554, -0.00389335, 0.00986688, ..., 0.01155722,
-0.01016544, 0.00725855],
[ 0.00431045, -0.01050912, -0.00490532, ..., -0.00075989,
0.00883604, 0.01337762],
[ 0.00195401, 0.01383564, 0.01248195, ..., -0.01159664,
0.01150718, 0.00515156],
...,
[-0.00288643, 0.00719406, -0.01046378, ..., 0.00476515,
0.00593644, 0.00830421],
[ 0.01492004, -0.00584323, 0.00321727, ..., -0.00236337,
-0.01540608, 0.01260902],
[-0.01198301, 0.00917004, 0.0068699 , ..., -0.00365373,
-0.00087463, 0.01179958]], dtype=float32)>,
7 /job:worker/replica:0/task:0/device:TPU:7: <tf.Variable 'dense_17/kernel/replica_7:0' shape=(25088, 128) dtype=float32, numpy=
array([[-0.01303554, -0.00389335, 0.00986688, ..., 0.01155722,
-0.01016544, 0.00725855],
[ 0.00431045, -0.01050912, -0.00490532, ..., -0.00075989,
0.00883604, 0.01337762],
[ 0.00195401, 0.01383564, 0.01248195, ..., -0.01159664,
0.01150718, 0.00515156],
...,
[-0.00288643, 0.00719406, -0.01046378, ..., 0.00476515,
0.00593644, 0.00830421],
[ 0.01492004, -0.00584323, 0.00321727, ..., -0.00236337,
-0.01540608, 0.01260902],
[-0.01198301, 0.00917004, 0.0068699 , ..., -0.00365373,
-0.00087463, 0.01179958]], dtype=float32)>
}). Make sure the slot variables are created under the same strategy scope. This may happen if you're restoring from a checkpoint outside the scope
找到了25000张属于2类的图像。
---------------------------------------------------------------------------
ValueError回溯(最近一次调用上次)
在里面
42
43#进入点,运行测试线束
--->44列车()
列车上
37 class_mode='binary',batch_size=64,target_size=(224224))
38#适合模型
--->39型号。安装发电机(列车运行时间,每个历元的步数=len(列车运行时间),历元数=10,详细度=0)
40#保存模型
41型号保存(“最终型号h5”)
/包装中的opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py(*args,**kwargs)
89 warnings.warn('Update your`'+object\u name+'`调用+
90'Keras 2 API:'+签名,堆栈级别=2)
--->91返回函数(*args,**kwargs)
92包装器._原始函数=func
93返回包装器
/opt/conda/lib/python3.7/site-packages/keras/engine/training.py-in-fit\u生成器(self、生成器、每个历元的步骤、历元、冗余、回调、验证数据、验证步骤、验证频率、类权重、最大队列大小、工作者、使用多处理、无序、初始历元)
1730使用多处理=使用多处理,
1731洗牌=洗牌,
->1732初始_历元=初始_历元)
1733
1734@interfaces.legacy\u生成器\u方法\u支持
/opt/conda/lib/python3.7/site-packages/keras/engine/training\u generator.py in-fit\u generator(模型、生成器、每个历元的步骤、历元、冗余、回调、验证数据、验证步骤、验证频率、类权重、最大队列大小、工人、使用多处理、无序、初始历元)
40
41 do\U验证=bool(验证数据)
--->42型号.\u制造\u训练\u功能()
43如果进行验证:
44型号。_制造_测试_功能()
/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in_make_train_函数(self)
314培训\u更新=self.optimizer.get\u更新(
315参数=自身收集的可训练重量,
-->316损耗=自身总损耗)
317更新=自我更新+培训更新
318
/包装中的opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py(*args,**kwargs)
89 warnings.warn('Update your`'+object\u name+'`调用+
90'Keras 2 API:'+签名,堆栈级别=2)
--->91返回函数(*args,**kwargs)
92包装器._原始函数=func
93返回包装器
/符号包装中的opt/conda/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py(*args,**kwargs)
73如果符号范围值:
74带有get_graph()。作为_default():
--->75返回函数(*args,**kwargs)
76.其他:
77返回函数(*args,**kwargs)
/获取更新中的opt/conda/lib/python3.7/site-packages/keras/optimizers.py(self、loss、params)
760 def get_更新(自我、丢失、参数):
761如果存在(self.optimizer、tf.keras.optimizer.optimizer):
-->762返回self.optimizer.get_更新(丢失,参数)
763其他:
764梯度=self.optimizer.compute_梯度(损失,变量列表=参数)
/get_更新中的opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py(self、loss、params)
507如果g不是None且v.dtype!=数据类型资源
508 ])
-->509返回[自应用梯度(梯度和变量)]
510
511定义集超(自身、名称、值):
/应用梯度中的opt/conda/lib/python3.7/site-packages/tensorflow\u core/python/keras/optimizer\u v2/optimizer\u v2.py(self、grads\u和_vars、name)
432=自迭代
433 self.\u create\u hypers()
-->434自创建插槽(变量列表)
435
436如果不是年级和变量:
/opt/conda/lib/python3.7/site-packages/tensorflow\u core/python/keras/optimizer\u v2/gradient\u dence.py in\u create\u slot(self,var\u list)
98如果自动量:
99对于var_列表中的var:
-->100自加槽(变量,“动量”)
101
102定义准备本地(自身、变量设备、变量数据类型、应用状态):
/添加插槽中的opt/conda/lib/python3.7/site-packages/tensorflow\u core/python/keras/optimizer\u v2/optimizer\u v2.py(self、var、插槽名称、初始值设定项)
590“在同一策略范围下创建变量。这可能”
591“如果从范围外的检查点进行恢复,则会发生此情况”
-->592.格式(策略、风险值))
593
594带有策略。扩展。将变量与(var)合并:
ValueError:尝试在tf.distribute.Strategy()的作用域下创建优化器插槽变量,该变量与原始变量(TPUMirroredVariable:{
0/作业:工作/副本:0/任务:0/设备:TPU:0:,
1/作业:工作/副本:0/任务:0/设备:TPU:1:,
7/作业:工作/副本:0/任务:0/设备:TPU:7:
}). 确保在相同的策略范围下创建插槽变量。如果从范围外的检查点进行恢复,则可能会发生这种情况
错误看起来有点像这里描述的错误:
查看您的错误跟踪,它(已编辑)正在抱怨:
ValueError:尝试在作用域下创建优化器插槽变量
对于tf.distribute.Strategy,这与
def define_model():
with tpu_strategy.scope():
# load model
model = VGG16(include_top=False, input_shape=(224, 224, 3))
# mark loaded layers as not trainable
for layer in model.layers:
layer.trainable = False
# add new classifier layers
flat1 = Flatten()(model.layers[-1].output)
class1 = Dense(128, activation='relu', kernel_initializer='he_uniform')(flat1)
output = Dense(1, activation='sigmoid')(class1)
# define new model
model = Model(inputs=model.inputs, outputs=output)
# compile model
model.compile(optimizer=tf.keras.optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True),
loss='binary_crossentropy', metrics=['accuracy'])
return model