Keras/TensorFlow 2.0: same model structure and hyperparameters, different performance depending on how the model is invoked


Hi there. I am a beginner learning TensorFlow 2.0. I have one model that I invoke using three different methods, and the performance differs between them. Can anyone tell me why this happens?

Model construction and invocation:

import os, sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets, models, layers, regularizers, optimizers


def prepare_mnist_features_and_labels(x, y):
    x = tf.cast(x, tf.float32)/255.0
    y = tf.cast(y, tf.int64)
    return x, y


def mnist_dataset():
    (x_train, y_train), (x_eval, y_eval) = datasets.mnist.load_data()
    print('x_train/y_train shape:', x_train.shape, y_train.shape)
    y_train = tf.one_hot(y_train, depth=10)
    y_eval = tf.one_hot(y_eval, depth=10)

    ds_train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    ds_train = ds_train.map(prepare_mnist_features_and_labels)
    ds_train = ds_train.shuffle(x_train.shape[0]).batch(128)

    ds_eval = tf.data.Dataset.from_tensor_slices((x_eval, y_eval))
    ds_eval = ds_eval.map(prepare_mnist_features_and_labels)
    ds_eval = ds_eval.shuffle(x_eval.shape[0]).batch(128)

    sample = next(iter(ds_train))
    print('sample: ', sample[0].shape, sample[1].shape)

    return ds_train, ds_eval


def main():
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    trainset, evalset = mnist_dataset()

    model = keras.Sequential()
    model.add(layers.Reshape(target_shape=[28, 28, 1], input_shape=[28, 28]))
    model.add(layers.Conv2D(filters=32, kernel_size=(5, 5), activation=tf.nn.relu, strides=[1,1], padding="SAME"))
    model.add(layers.MaxPool2D(pool_size=(2,2), strides=[1,1], padding="SAME"))
    model.add(layers.Conv2D(filters=64, kernel_size=(5, 5), activation=tf.nn.relu, strides=[1,1], padding="SAME"))
    model.add(layers.MaxPool2D(pool_size=(2,2), strides=[2,2], padding="SAME"))
    model.add(layers.Flatten())
    model.add(layers.Dense(units=512, activation=tf.nn.relu, kernel_regularizer=regularizers.l2(0.01)))
    model.add(layers.Dense(units=10, activation=tf.nn.relu, kernel_regularizer=regularizers.l2(0.01)))

    model.compile(optimizer=optimizers.Adam(lr=0.01), loss=tf.losses.CategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
    model.fit(trainset.repeat(), epochs=30, steps_per_epoch=500,
              validation_data=evalset.repeat(), validation_steps=10)

if __name__=='__main__':
    main()
The second way to build and run the model is as follows:

import os, sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets, models, layers, regularizers, optimizers

from tqdm import tqdm

def prepare_mnist_features_and_labels(x, y):
    x = tf.cast(x, tf.float32)/255.0
    y = tf.cast(y, tf.int64)
    return x, y


def mnist_dataset():
    (x_train, y_train), (x_eval, y_eval) = datasets.mnist.load_data()
    # y_train = tf.one_hot(y_train, depth=10)
    # y_eval = tf.one_hot(y_eval, depth=10)

    ds_train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    ds_train = ds_train.map(prepare_mnist_features_and_labels)
    # Test: replace x_train.shape[0] by the number of the training samples, which is 60000
    ds_train = ds_train.shuffle(x_train.shape[0]).batch(128)

    ds_eval = tf.data.Dataset.from_tensor_slices((x_eval, y_eval))
    ds_eval = ds_eval.map(prepare_mnist_features_and_labels)
    ds_eval = ds_eval.shuffle(x_eval.shape[0]).batch(128)

    # sample = next(iter(ds_train))
    # print('sample: ', sample[0].shape, sample[1].shape)

    return ds_train, ds_eval

# tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name=None):
# labels: Tensor of shape [d_0, d_1, ..., d_{r-1}]. Each label must be an index in [0, num_classes)
# logits: Unscaled log probabilities of shape [d_0, d_1, ..., d_{r-1}, num_classes]
# A common use is to have logits of shape [batch_size, num_classes] and labels of shape [batch_size]
def compute_loss(logits, labels):
    # print(logits.numpy())
    # print(labels.numpy())
    return tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=logits))


def compute_accuracy(logits, labels):
    predictions = tf.argmax(logits, axis=1)
    # print(predictions)
    # print(labels)
    # print(list(zip(predictions.numpy(), labels.numpy())))
    return tf.reduce_mean(tf.cast(tf.equal(predictions, labels), tf.float32))


def train_one_step(model, optimizer, x, y):
    # At each train step, first calculate the forward loss
    with tf.GradientTape() as tape:
        logits = model(x)
        loss = compute_loss(logits, y)

    # Then calculate the gradients of the loss with respect to each trainable variable
    grads = tape.gradient(loss, model.trainable_variables)
    # Optimize and update variables through backpropagation
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Compute current model accuracy
    accuracy = compute_accuracy(logits, y)

    return loss, accuracy


def train(epoch, model, optimizer, trainset):
#def train(epoch, model, optimizer):
    # trainset = mnist_dataset()[0]
    loss = 0.0
    accuracy = 0.0

    for step, (x, y) in enumerate(tqdm(trainset)):
        loss, accuracy = train_one_step(model, optimizer, x, y)

        if step % 110 == 0:
            print('epoch', epoch, ': loss', loss.numpy(), '; accuracy', accuracy.numpy())

    return loss, accuracy


class MyModel(keras.Model):


    def __init__(self):
        super(MyModel, self).__init__()
        self.layer1 = layers.Conv2D(filters=32, kernel_size=(5, 5), activation=tf.nn.relu, strides=[1,1], padding="SAME", input_shape=(-1, 28, 28, 1))
        self.layer2 = layers.MaxPool2D(pool_size=(2,2), strides=[1,1], padding="SAME")
        self.layer3 = layers.Conv2D(filters=64, kernel_size=(5, 5), activation=tf.nn.relu, strides=[1,1], padding="SAME")
        self.layer4 = layers.MaxPool2D(pool_size=(2,2), strides=[2,2], padding="SAME")
        self.layer5 = layers.Flatten()
        self.layer6 = layers.Dense(units=512, activation=tf.nn.relu, kernel_regularizer=regularizers.l2(0.01))
        self.layer7 = layers.Dense(units=10, activation=tf.nn.relu, kernel_regularizer=regularizers.l2(0.01))


    def call(self, x, training=False):
        x = tf.reshape(x, (-1, 28, 28, 1))
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = self.layer7(x)
        return x


def main():

    # set random seed
    tf.random.set_seed(22)    

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    trainset, evalset = mnist_dataset()

    model = MyModel()

    optimizer = optimizers.Adam(lr=0.001)

    # Save checkpoints with keras api as the first approach
    # Save checkpoints manually as a second approach.
    # find a way to implement early-stopping strategy in the programming style

    # for epoch in tqdm(range(30)):
    for epoch in range(50):
        loss, accuracy = train(epoch, model, optimizer, trainset)

    print('Final epoch', epoch, ': loss', loss.numpy(), '; accuracy', accuracy.numpy())


if __name__ == '__main__':
    main()
The last approach is as follows:

import os, sys

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets, models, layers, regularizers, optimizers

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


def prepare_mnist_features_and_labels(x, y):
    x = tf.cast(x, tf.float32)/255.0
    y = tf.cast(y, tf.int64)
    return x, y


def mnist_dataset():
    (x_train, y_train), (x_eval, y_eval) = datasets.mnist.load_data()
    print('x_train/y_train shape:', x_train.shape, y_train.shape)
    y_train = tf.one_hot(y_train, depth=10)
    y_eval = tf.one_hot(y_eval, depth=10)

    ds_train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    ds_train = ds_train.map(prepare_mnist_features_and_labels)
    ds_train = ds_train.shuffle(x_train.shape[0]).batch(128)

    ds_eval = tf.data.Dataset.from_tensor_slices((x_eval, y_eval))
    ds_eval = ds_eval.map(prepare_mnist_features_and_labels)
    ds_eval = ds_eval.shuffle(x_eval.shape[0]).batch(128)

    sample = next(iter(ds_train))
    print('sample: ', sample[0].shape, sample[1].shape)

    return ds_train, ds_eval


class MyModel(keras.Model):

    # self.model = keras.Sequential([
    #     layers.Reshape(target_shape=(28*28, ), input_shape=(28, 28)),
    #     layers.Dense(100, activation=tf.nn.relu),
    #     layers.Dense(100, activation=tf.nn.relu),
    #     layers.Dense(10)
    # ])
    def __init__(self):
        super(MyModel, self).__init__()
        self.layer1 = layers.Conv2D(filters=32, kernel_size=(5, 5), activation=tf.nn.relu, strides=[1,1], padding="SAME", input_shape=(-1, 28, 28, 1))
        self.layer2 = layers.MaxPool2D(pool_size=(2,2), strides=[1,1], padding="SAME")
        self.layer3 = layers.Conv2D(filters=64, kernel_size=(5, 5), activation=tf.nn.relu, strides=[1,1], padding="SAME")
        self.layer4 = layers.MaxPool2D(pool_size=(2,2), strides=[2,2], padding="SAME")
        self.layer5 = layers.Flatten()
        self.layer6 = layers.Dense(units=512, activation=tf.nn.relu, kernel_regularizer=regularizers.l2(0.01))
        self.layer7 = layers.Dense(units=10, activation=tf.nn.relu, kernel_regularizer=regularizers.l2(0.01))


    def call(self, x, training=False):
        x = tf.reshape(x, (-1, 28, 28, 1))
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = self.layer7(x)
        return x


def main():
    tf.random.set_seed(22)

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    trainset, evalset = mnist_dataset()

    model = MyModel()
    model.compile(optimizer=optimizers.Adam(lr=0.001), loss=tf.losses.CategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
    model.fit(trainset.repeat(), epochs=30, steps_per_epoch=500, verbose=1,
              validation_data=evalset.repeat(), validation_steps=10)


if __name__ == '__main__':
    main()
Each of them takes a while to train. Can someone tell me why the performance differs, and how I should debug problems like this myself in the future?


Thank you very much for your help.

After carefully inspecting the network, the problem was solved. It turned out that the last fully connected layer of the model was activated with a relu function, which is not appropriate here. The choice of loss function also makes a big difference: tf.losses.CategoricalCrossentropy and tf.nn.sparse_softmax_cross_entropy_with_logits are quite different. Whichever one you choose, make sure the loss function matches the final output of your network.
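For reference, here is a minimal sketch of the corrected model head and a matching loss (assuming TF 2.x; layer sizes are copied from the question). The key change is that the final Dense layer has no activation, so it emits raw logits, which is what a from_logits=True loss expects:

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers, optimizers

model = keras.Sequential([
    layers.Reshape(target_shape=(28, 28, 1), input_shape=(28, 28)),
    layers.Conv2D(32, (5, 5), activation=tf.nn.relu, padding="SAME"),
    layers.MaxPool2D((2, 2), strides=(1, 1), padding="SAME"),
    layers.Conv2D(64, (5, 5), activation=tf.nn.relu, padding="SAME"),
    layers.MaxPool2D((2, 2), strides=(2, 2), padding="SAME"),
    layers.Flatten(),
    layers.Dense(512, activation=tf.nn.relu, kernel_regularizer=regularizers.l2(0.01)),
    # No activation here: the layer outputs raw logits, as from_logits=True expects.
    layers.Dense(10),
])

# With one-hot labels, CategoricalCrossentropy(from_logits=True) matches the logits;
# with integer labels, use SparseCategoricalCrossentropy(from_logits=True) instead,
# the Keras counterpart of tf.nn.sparse_softmax_cross_entropy_with_logits.
model.compile(optimizer=optimizers.Adam(0.001),
              loss=tf.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

The same change applies to the subclassed MyModel in the custom training loop: drop the relu from layer7 so that compute_loss receives logits rather than relu-clipped outputs.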