验证和训练期间 TensorFlow 中的批量标准化

我认为我在训练与验证过程中使用批量标准化(batch normalization)时犯了一些错误:验证损失不会减少,并且验证错误率始终为 1.0。如果你能帮我把事情做好,我将不胜感激。我的 TensorFlow 模型声明如下:
import tensorflow as tf
class OverFeatAccurateBase(object):
    """OverFeat ("accurate" variant) convolutional classifier.

    The graph is built at construction time from ``minibatch`` (assumed to
    be a channels-first / NCHW image tensor, since every conv and pool
    layer uses ``data_format='channels_first'`` -- confirm against the
    input pipeline) and the resulting logits are exposed via ``logits``.

    A boolean placeholder (``mode``) switches every batch-normalization
    layer between batch statistics (feed ``True`` while training) and the
    accumulated moving averages (feed ``False`` for validation/inference).

    NOTE(review): callers must run ``tf.get_collection(tf.GraphKeys.UPDATE_OPS)``
    alongside the train op, otherwise the BN moving averages are never
    updated and validation-mode BN produces garbage.
    """

    def __init__(self, minibatch, numclasses):
        """Build the model graph.

        Args:
            minibatch: input image tensor in channels-first layout.
            numclasses: number of output classes of the final dense layer.
        """
        self._numclasses = numclasses
        # True -> BN uses batch statistics; False -> BN uses moving averages.
        self._trainmode = tf.placeholder(tf.bool)
        self._logits = self._buildmodel(minibatch)

    @property
    def numclasses(self):
        """Number of output classes."""
        return self._numclasses

    @property
    def mode(self):
        """Boolean placeholder: feed True for training, False for eval."""
        return self._trainmode

    @property
    def logits(self):
        """Unnormalized class scores of shape (batch, numclasses)."""
        return self._logits

    def _bn(self, input, is_training, name, axis=-1):
        """Batch-renormalization layer.

        BUG FIX: the original passed ``fused=True`` together with
        ``renorm=True``, which tf.layers.batch_normalization rejects with
        "Passing both fused=True and renorm=True is unsupported"; the fused
        kernel is therefore omitted here.  ``axis`` must be 1 for the
        channels-first convolutional activations and -1 (default) for the
        flat dense activations.  (``input`` shadows the builtin but is kept
        to preserve the original keyword interface.)
        """
        return tf.layers.batch_normalization(input,
                                             axis=axis,
                                             renorm=True,
                                             training=is_training,
                                             reuse=tf.AUTO_REUSE,
                                             name=name)

    def _conv(self, inputs, filters, kernel_size, strides, padding, name):
        """Conv + ReLU with the shared init/weight-decay scheme of this net."""
        return tf.layers.conv2d(inputs, filters=filters,
                                kernel_size=kernel_size,
                                strides=strides,
                                padding=padding,
                                data_format='channels_first',
                                activation=tf.nn.relu,
                                kernel_initializer=tf.initializers.random_normal(
                                    stddev=0.01,
                                    seed=0),
                                bias_initializer=tf.initializers.constant(0),
                                kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                    scale=0.00001),
                                reuse=tf.AUTO_REUSE,
                                name=name)

    def _pool(self, inputs, pool_size, strides, name):
        """Valid-padded max pooling in channels-first layout."""
        return tf.layers.max_pooling2d(inputs, pool_size=pool_size,
                                       strides=strides,
                                       padding='valid',
                                       data_format='channels_first',
                                       name=name)

    def _dense(self, inputs, units, name, activation=None, regularize=True):
        """Fully-connected layer with the shared init scheme.

        ``regularize=False`` reproduces the original output layer, which
        carried no weight decay.
        """
        reg = (tf.contrib.layers.l2_regularizer(scale=0.00001)
               if regularize else None)
        return tf.layers.dense(inputs, units=units, activation=activation,
                               kernel_initializer=tf.initializers.random_normal(
                                   stddev=0.01,
                                   seed=0),
                               bias_initializer=tf.initializers.constant(0),
                               kernel_regularizer=reg,
                               reuse=tf.AUTO_REUSE,
                               name=name)

    def _buildmodel(self, minibatch):
        """Assemble the conv/BN/pool trunk plus the fully-connected head.

        Layer names and hyperparameters are identical to the original
        hand-unrolled version; only the repetition is factored out.
        """
        out = self._conv(minibatch, 96, [7, 7], [2, 2], 'valid', 'conv1')
        out = self._bn(out, self.mode, 'batchnorm1', axis=1)
        out = self._pool(out, [3, 3], [3, 3], 'pool1')
        out = self._conv(out, 256, [7, 7], [1, 1], 'valid', 'conv2')
        out = self._bn(out, self.mode, 'batchnorm2', axis=1)
        out = self._pool(out, [2, 2], [2, 2], 'pool2')
        out = self._conv(out, 512, [3, 3], [1, 1], 'same', 'conv3')
        out = self._bn(out, self.mode, 'batchnorm3', axis=1)
        out = self._conv(out, 512, [3, 3], [1, 1], 'same', 'conv4')
        out = self._bn(out, self.mode, 'batchnorm4', axis=1)
        out = self._conv(out, 1024, [3, 3], [1, 1], 'same', 'conv5')
        out = self._bn(out, self.mode, 'batchnorm5', axis=1)
        out = self._conv(out, 1024, [3, 3], [1, 1], 'same', 'conv6')
        out = self._bn(out, self.mode, 'batchnorm6', axis=1)
        out = self._pool(out, [3, 3], [3, 3], 'pool3')
        out = tf.layers.flatten(out, name='flatten')
        out = self._dense(out, 4096, 'full1', activation=tf.nn.relu)
        out = self._bn(out, self.mode, 'batchnorm7')
        out = self._dense(out, 4096, 'full2', activation=tf.nn.relu)
        out = self._bn(out, self.mode, 'batchnorm8')
        # Final layer: raw logits, no activation, no weight decay.
        return self._dense(out, self.numclasses, 'output', regularize=False)
为了执行图,我做了如下操作(要查看完整的代码,您可以)
验证
loss_value, top1_err, top5_err, eph, summaries, _, _ = sess.run(
    [loss, top1, top5, epoch, summaries_val,
     top1_update, top5_update], feed_dict={net.mode: False, netmode: False})
培训
# Training step: runs update_ops (presumably the BN moving-average updates
# from tf.GraphKeys.UPDATE_OPS -- confirm against the full script) together
# with the train op, feeding mode=True so BN uses batch statistics.
_, loss_value, top1_err, top5_err, eph, summaries, _, _, _ = sess.run(
[update_ops, loss, top1, top5, epoch, summaries_train, train_op,
top1_update,
top5_update], feed_dict={net.mode: True, netmode: True})
在上面的行中,net
是类OverFeatAccurateBase
netmode
是一个占位符,其值决定数据是从培训集还是从验证集读取。我已经下载了您提供的数据集并检查了图像。我发现在验证集中,标签是无序的。所以请检查数据
在label index=1的培训集中,有三个示例: 在标签索引为1的验证集中,有三个示例:
您能发布您使用的数据吗,或者至少发布其中的一些数据吗?我愿意尝试一下,但没有数据就无法运行。@PeterSzoldan您可以在Managed上找到数据的子集以下载并运行它。123个历元后,训练损失为2.83,验证损失为8.214,两者的误差均为1.0:
INFO:tensorflow:TRAIN:Epoch[123],Iter[6027]100次迭代的时间[9.120sec]-损失=2.830,Top1错误=1.00,Top5错误=1.00信息:tensorflow:validation:Epoch[123],Iter[6076]100次迭代的时间[0.976sec]-损失=8.214,Top1错误=1.00,Top5错误=1.00
与您的错误类似吗?什么时候训练损失开始减少?我可以相当自信地排除BN是问题的根源。当我使用训练数据进行验证时,但网络与其他情况相同,它确实给出了正常值(与训练完全相同),因此网络似乎运行正常。但我可以确认,通过验证设置,损失实际上会增加。我已经打印了提供给网络的图像,它们看起来也不错。标签也是。我现在没有什么好主意如何进行。如果我有什么想法,我会回来的。@End-2-End请看