TensorFlow training is slow despite using a tf.data pipeline


I am training an image classification model on 21,000 images. I built the input pipeline with TensorFlow's tf.data API, but training is still too slow even with it. I have also installed the GPU build of TensorFlow. Please help. I originally thought the Keras ImageDataGenerator was what slowed my training down, but now that I have switched to a tf.data pipeline it still does not seem to use my GPU. My full code is below.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import models, layers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.applications import ResNet50, EfficientNetB3, InceptionV3, DenseNet121
from tensorflow.keras.optimizers import Adam

# ignoring warnings
import warnings
warnings.simplefilter("ignore")
import os,cv2


base_dir = "D:/cassava-leaf-disease-classification/"
train_csv = pd.read_csv("D:/cassava-leaf-disease-classification/train.csv")
# print(train_csv.head())
df_sample = pd.read_csv("D:/cassava-leaf-disease-classification/sample_submission.csv")
train_images = "D:/cassava-leaf-disease-classification/train_images/"+train_csv['image_id']
# print(train_images)

# print(os.listdir(train_images))
train_labels = pd.read_csv(os.path.join(base_dir, "train.csv"))

# print(train_labels)
BATCH_SIZE = 16
EPOCHS = 25
STEPS_PER_EPOCH = int(len(train_labels) * 0.8) // BATCH_SIZE  # steps_per_epoch must be an integer
TARGET_SIZE = 300
# train_labels['label'] = train_labels.label.astype('str')
labels = train_labels.iloc[:,-1].values
# print(labels)

def build_decoder(with_labels=True, target_size=(TARGET_SIZE, TARGET_SIZE), ext='jpg'):
    def img_decode(img_path):
        file_bytes = tf.io.read_file(img_path)
        if ext == 'png':
            img = tf.image.decode_png(file_bytes, channels=3)
        elif ext in ['jpg', 'jpeg']:
            img = tf.image.decode_jpeg(file_bytes, channels=3)

        else:
            raise ValueError("Image extension not supported")

        img = tf.cast(img, tf.float32) / 255.0
        img = tf.image.resize(img, target_size)

        return img

    def decode_with_labels(img_path, label):
        return img_decode(img_path), label

    if with_labels:
        return decode_with_labels

    else:
        return img_decode


def build_augmenter(with_labels=True):
    def augment(img):
        img = tf.image.random_flip_left_right(img)
        img = tf.image.random_flip_up_down(img)
        img = tf.image.random_brightness(img, 0.1)
        img = tf.image.random_contrast(img, 0.9, 1.1)
        img = tf.image.random_saturation(img, 0.9, 1.1)
        return img

    def augment_with_labels(img, label):
        return augment(img), label

    if with_labels:
        return augment_with_labels

    else:
        return augment


def build_dataset(paths, labels=None, bsize=32, cache=True,
                  decode_fn=None, augment_fn=None,
                  augment=True, repeat=True, shuffle=1024,
                  cache_dir=""):
    if cache_dir != "" and cache is True:
        os.makedirs(cache_dir, exist_ok=True)

    if decode_fn is None:
        decode_fn = build_decoder(labels is not None)

    if augment_fn is None:
        augment_fn = build_augmenter(labels is not None)

    AUTO = tf.data.experimental.AUTOTUNE
    slices = paths if labels is None else (paths, labels)

    dset = tf.data.Dataset.from_tensor_slices(slices)
    dset = dset.map(decode_fn, num_parallel_calls=AUTO)
    # dset = dset.cache(cache_dir) if cache else dset
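    # NOTE: with the cache() call above commented out, every image is re-read and re-decoded from disk on every epoch.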
    dset = dset.map(augment_fn, num_parallel_calls=AUTO) if augment else dset
    dset = dset.repeat() if repeat else dset
    dset = dset.shuffle(shuffle) if shuffle else dset
    dset = dset.batch(bsize).prefetch(AUTO)

    return dset

# Train test split
train_img, valid_img, train_labels, valid_labels = train_test_split(train_images, labels, train_size=0.8, random_state=0)
# print(train, valid)


# Tensorflow datasets
train_df = build_dataset(
    train_img, train_labels, bsize=BATCH_SIZE,
    cache=True)

valid_df = build_dataset(
    valid_img, valid_labels, bsize=BATCH_SIZE,
    repeat=False, shuffle=False, augment=False,
    cache=True)


def create_model():
    model = models.Sequential()
    model.add(EfficientNetB3(include_top=False, weights='imagenet',
                             input_shape=(TARGET_SIZE,TARGET_SIZE,3)))
    model.add(layers.GlobalAveragePooling2D())
    model.add(layers.Dense(5,activation='softmax'))
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

model = create_model()
model.summary()

model_save = ModelCheckpoint('C:/Users/rosha/PycharmProjects/CLDD/saved_Models/EffNetB3_300_16_best_weights.h5',
                             save_best_only=True,
                             save_weights_only=True,
                             monitor='val_accuracy',
                             mode='max',
                             verbose=1
                             )

early_stop = EarlyStopping(monitor='val_accuracy',
                           min_delta=0.001,
                           patience=5,
                           mode='max',
                           verbose=1)

reduce_lr = ReduceLROnPlateau(monitor='val_accuracy',
                            factor=0.3,
                            patience=2,
                            min_delta=0.001,
                            mode='max',
                            verbose=1)

history = model.fit(
    train_df,
    validation_data=valid_df,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS,
    callbacks=[model_save, early_stop, reduce_lr],
    verbose=1,
)


plt.rcParams.update({'font.size': 16})
hist = pd.DataFrame(history.history)
fig, (ax1, ax2) = plt.subplots(figsize=(12, 12), nrows=2, ncols=1)
hist['loss'].plot(ax=ax1, c='k', label='training loss')
hist['val_loss'].plot(ax=ax1, c='r', linestyle='--', label='validation loss')
ax1.legend()
hist['accuracy'].plot(ax=ax2, c='k', label='training accuracy')
hist['val_accuracy'].plot(ax=ax2, c='r', linestyle='--', label='validation accuracy')
ax2.legend()
plt.show()

model.save('./EffNetB3_300_16.h5')

Here is a short checklist of things I would go through:

  • Run the following snippet to check whether TensorFlow can actually find a GPU:

    import tensorflow as tf
    print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

  • If the output is "Num GPUs Available: 0", you should check that you really installed the GPU build of TensorFlow, and you may also want to check that the supporting libraries (CUDA/cuDNN) match that build.

  • If the libraries are correct, the next thing to verify is that the CUDA driver installation itself is correct. This step is somewhat OS-dependent, but there are plenty of tutorials online for both steps; my favourite one for TF is on the official website.