Tensorflow 在多个GPU上分布张量

TensorFlow 在多个 GPU 上分布张量(标签:tensorflow, tensorflow2.0, distributed-computing)

我试图训练一个模型,其输入超过了系统上单个 GPU 的内存限制(16 GB P100)。输入的大小为 (1, 256, 256, 64, 2)。不过,我可以使用系统上 4 块相同的 GPU。我知道可以用 tf.distribute 分布计算,但在批大小为 1 的情况下我不确定该怎么做。是否可以把单个样本分布到多个 GPU 上,从而避免 OOM 错误?

编辑: 下面是用于构建模型的代码

def dice_loss(y_true, y_pred, smooth=1e-7):
    """Soft Dice loss for volumetric segmentation.

    Computes ``1 - Dice coefficient`` per sample, reducing over the three
    spatial axes (1, 2, 3) and keeping the channel axis intact via the
    trailing reshape.

    Args:
        y_true: Ground-truth tensor; same shape as ``y_pred``.
        y_pred: Predicted probabilities in ``[0, 1]``.
        smooth: Small constant added to numerator and denominator to avoid
            0/0 (NaN) when a sample contains no foreground at all.

    Returns:
        Tensor of shape ``(batch, 1, 1)`` with the per-sample loss.
    """
    numerator = 2 * tf.reduce_sum(y_true * y_pred, axis=(1, 2, 3))
    denominator = tf.reduce_sum(y_true + y_pred, axis=(1, 2, 3))
    # `smooth` guards against division by zero for empty masks; with the
    # default epsilon the value is numerically unchanged for normal inputs.
    return tf.reshape(1 - (numerator + smooth) / (denominator + smooth),
                      (-1, 1, 1))



class ResidualUnitEncode(keras.layers.Layer):
    """Encoder residual unit: two 3x3x3 Conv3D+BN stages plus a skip path.

    When ``strides > 1`` the skip path is a strided 1x1x1 projection so the
    residual addition stays shape-compatible; otherwise it is the identity.
    """

    def __init__(self, filters=1, strides=1, activation="relu", **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can round-trip the
        # layer (required to reload models saved by ModelCheckpoint).
        self.filters = filters
        self.strides = strides
        self.activation = keras.activations.get(activation)
        self.main_layers = [
            keras.layers.Conv3D(filters, (3, 3, 3), strides=strides,
                                padding="same", use_bias=False),
            keras.layers.BatchNormalization(),
            self.activation,
            keras.layers.Conv3D(filters, (3, 3, 3), strides=1,
                                padding="same", use_bias=False),
            keras.layers.BatchNormalization()]
        self.skip_layers = []
        if strides > 1:
            # Projection shortcut: match spatial size and channel count.
            self.skip_layers = [
                keras.layers.Conv3D(filters, (1, 1, 1), strides=strides,
                                    padding="same", use_bias=False),
                keras.layers.BatchNormalization()]

    def call(self, inputs):
        Z = inputs
        for layer in self.main_layers:
            Z = layer(Z)
        skip_Z = inputs
        for layer in self.skip_layers:
            skip_Z = layer(skip_Z)
        # Residual addition followed by the final activation.
        return self.activation(Z + skip_Z)

    def get_config(self):
        # Include the constructor arguments; the previous implementation
        # returned only the base config, which made saved models impossible
        # to deserialize (missing `filters`/`strides`/`activation`).
        config = super().get_config()
        config.update({
            "filters": self.filters,
            "strides": self.strides,
            "activation": keras.activations.serialize(self.activation),
        })
        return config



class ResidualUnitDecode(keras.layers.Layer):
    """Decoder residual unit: two 3x3x3 Conv3DTranspose+BN stages plus skip.

    When ``strides > 1`` the skip path is a strided transposed convolution
    so the residual addition stays shape-compatible; otherwise it is the
    identity.
    """

    def __init__(self, filters=1, strides=1, activation="relu", **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can round-trip the
        # layer (required to reload models saved by ModelCheckpoint).
        self.filters = filters
        self.strides = strides
        self.activation = keras.activations.get(activation)
        self.main_layers = [
            keras.layers.Conv3DTranspose(filters, (3, 3, 3), strides=1,
                                         padding="same", use_bias=False),
            keras.layers.BatchNormalization(),
            self.activation,
            keras.layers.Conv3DTranspose(filters, (3, 3, 3), strides=strides,
                                         padding="same", use_bias=False),
            keras.layers.BatchNormalization()]
        self.skip_layers = []
        if strides > 1:
            # Upsampling shortcut: match the main path's output shape.
            self.skip_layers = [
                keras.layers.Conv3DTranspose(filters, (3, 3, 3), strides=strides,
                                             padding="same", use_bias=False),
                keras.layers.BatchNormalization()]

    def call(self, inputs):
        Z = inputs
        for layer in self.main_layers:
            Z = layer(Z)
        skip_Z = inputs
        for layer in self.skip_layers:
            skip_Z = layer(skip_Z)
        # Residual addition followed by the final activation.
        return self.activation(Z + skip_Z)

    def get_config(self):
        # Include the constructor arguments; the previous implementation
        # returned only the base config, which made saved models impossible
        # to deserialize (missing `filters`/`strides`/`activation`).
        config = super().get_config()
        config.update({
            "filters": self.filters,
            "strides": self.strides,
            "activation": keras.activations.serialize(self.activation),
        })
        return config

def build_unet(image_shape, batch_size):
    """Build and compile a 3D residual U-Net for volumetric segmentation.

    Encoder: stem conv + four ResidualUnitEncode stages with max-pooling.
    Decoder: ResidualUnitDecode stages with skip-connection concatenation
    (channel axis 4) and upsampling back to the input resolution, ending in
    a 1x1x1 sigmoid convolution.

    Args:
        image_shape: Per-sample input shape, e.g. ``[256, 256, 64, 2]``.
        batch_size: Static batch size baked into the Input layer.

    Returns:
        A compiled ``keras.Model`` (Adam optimizer, Dice loss).
    """
    inputs = keras.layers.Input(shape=image_shape, batch_size=batch_size)
    # Stem: large-kernel conv downsampling the two in-plane axes only.
    # (`input_shape` kwarg dropped — it is redundant in the functional API.)
    conv1 = keras.layers.Conv3D(64, (7, 7, 7), strides=(2, 2, 1),
                                padding="same", use_bias=False)(inputs)
    conv1 = keras.layers.BatchNormalization()(conv1)
    conv1 = keras.layers.Activation("relu")(conv1)
    pool1 = keras.layers.MaxPool3D(pool_size=(3, 3, 3), strides=1, padding="same")(conv1)

    conv2 = ResidualUnitEncode(filters=128, strides=2)(pool1)
    pool2 = keras.layers.MaxPool3D(pool_size=(3, 3, 3), strides=1, padding="same")(conv2)

    conv3 = ResidualUnitEncode(filters=256, strides=2)(pool2)
    pool3 = keras.layers.MaxPool3D(pool_size=(3, 3, 3), strides=1, padding="same")(conv3)

    conv4 = ResidualUnitEncode(filters=512, strides=2)(pool3)
    pool4 = keras.layers.MaxPool3D(pool_size=(3, 3, 3), strides=1, padding="same")(conv4)

    # Bottleneck with dropout for regularization.
    conv5 = ResidualUnitEncode(filters=1024, strides=2)(pool4)
    drop5 = keras.layers.Dropout(0.5)(conv5)

    # Decoder: upsample, concatenate with the matching encoder feature map
    # along the channel axis (axis=4), then refine.
    up6 = ResidualUnitDecode(filters=512, strides=2)(drop5)
    merge6 = keras.layers.concatenate([conv4, up6], axis=4)
    conv6 = ResidualUnitEncode(filters=512, strides=2)(merge6)
    conv6 = keras.layers.UpSampling3D(size=(2, 2, 2))(conv6)

    up7 = ResidualUnitDecode(filters=256, strides=2)(conv6)
    merge7 = keras.layers.concatenate([conv3, up7], axis=4)
    conv7 = ResidualUnitEncode(filters=256, strides=2)(merge7)
    conv7 = keras.layers.UpSampling3D(size=(2, 2, 2))(conv7)

    up8 = ResidualUnitDecode(filters=128, strides=2)(conv7)
    merge8 = keras.layers.concatenate([conv2, up8], axis=4)
    conv8 = ResidualUnitEncode(filters=128, strides=2)(merge8)
    conv8 = keras.layers.UpSampling3D(size=(2, 2, 2))(conv8)

    up9 = ResidualUnitDecode(filters=64, strides=2)(conv8)
    merge9 = keras.layers.concatenate([conv1, up9], axis=4)
    conv9 = ResidualUnitDecode(filters=64, strides=2)(merge9)
    # Final 1x1x1 conv; strides=(1, 1, 2) halves the depth axis back to the
    # input depth, sigmoid yields a per-voxel probability.
    conv10 = keras.layers.Conv3D(1, 1, strides=(1, 1, 2), activation="sigmoid")(conv9)

    model = keras.Model(inputs, conv10)
    # `lr` is a deprecated (and in newer TF, removed) alias — use
    # `learning_rate` explicitly.
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
                  loss=dice_loss)
    model.summary()

    return model
以下是使用 KFold 交叉验证运行训练的代码:

# ---------------------------------------------------------------------------
# K-fold cross-validated training driver.
# Relies on names defined elsewhere: tf, keras, KFold (sklearn),
# parse_record, N_EPOCHS, build_unet.
# ---------------------------------------------------------------------------
image_shape = [256,256,64,2]
# Load all 69 samples as one batch so they can be sliced into splits below.
dataset = tf.data.TFRecordDataset('train.tfrecord').map(parse_record).batch(69)
nx = tf.compat.v1.data.make_one_shot_iterator(dataset)
x, y = nx.get_next()
# Hold-out test split: indices 55..68 (14 samples).
x_test = x[55:69, ...]
y_test = y[55:69, ...]
# NOTE(review): 0:54 selects indices 0..53 (54 samples); combined with
# 55:69 this covers only 68 of the 69 samples — index 54 is silently
# dropped. Confirm whether x[0:55] was intended.
x_train = x[0:54, ...]
y_train = y[0:54, ...]

kfold = KFold(n_splits=10, shuffle=True)
fold_no = 1
acc_per_fold = []
loss_per_fold = []

for train, test in kfold.split(x_train, y_train):

    # Fresh model per fold so weights do not leak between folds.
    model = build_unet(image_shape=image_shape, batch_size=1)

    # NOTE(review): both callbacks monitor 'val_loss', but model.fit below
    # is given no validation_data/validation_split, so 'val_loss' will not
    # exist and the callbacks will be inert — verify.
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss')
    model_file_name = './Fold_' + str(fold_no) + '_best_model.h5'
    model_checkpoint = keras.callbacks.ModelCheckpoint(model_file_name, monitor='val_loss')
    log_dir_name = './Fold_' + str(fold_no) + '_log_dir'
    tb = keras.callbacks.TensorBoard(log_dir_name)

    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')

    # Select this fold's rows; gather_nd with (-1, 1)-shaped indices is
    # equivalent to tf.gather along axis 0.
    train_id_rows = tf.constant(train.reshape(-1,1))
    test_id_rows = tf.constant(test.reshape(-1,1))
    x_train_train = tf.gather_nd(x_train, train_id_rows)
    y_train_train = tf.gather_nd(y_train, train_id_rows)
    x_train_test = tf.gather_nd(x_train, test_id_rows)
    y_train_test = tf.gather_nd(y_train, test_id_rows)

    history = model.fit(x_train_train, y_train_train, epochs=N_EPOCHS, callbacks=[tb, model_checkpoint, early_stopping], batch_size=1)

    # NOTE(review): the model is compiled with a loss but no metrics, so
    # evaluate() returns a scalar; indexing scores[1] will likely raise —
    # verify, or add metrics=[...] to model.compile().
    scores = model.evaluate(x_train_test, y_train_test, verbose=0)
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])
    fold_no = fold_no + 1

数据集中总共有 69 个样本,其中 54 个用于训练/验证循环。

评论:您的输入似乎并不大。以 float32(4 字节)计算,它的大小是 256*256*64*2*4/1024/1024 = 32 MB。请寻找 OOM 的其他来源,并共享您用于构建模型的代码。——回复:我已经共享了模型和训练的代码。我同意数据看起来不大,但我不确定 OOM 错误还有什么其他原因。谢谢。