Tensorflow 在多个GPU上分布张量
我试图训练一个模型,其中输入超过系统上单个GPU的内存限制(16 GB P100)。输入的大小为(1256256,64,2)。但是,我可以访问系统上4个相同的GPU。我知道我可以使用tf.distribute分发流程,但我不确定如何在批大小为1的情况下执行此操作。是否可以在多个GPU上分发一个样本,这样我就不会收到OOM错误 编辑: 下面是用于构建模型的代码Tensorflow 在多个GPU上分布张量,tensorflow,tensorflow2.0,distributed-computing,Tensorflow,Tensorflow2.0,Distributed Computing,我试图训练一个模型,其中输入超过系统上单个GPU的内存限制(16 GB P100)。输入的大小为(1256256,64,2)。但是,我可以访问系统上4个相同的GPU。我知道我可以使用tf.distribute分发流程,但我不确定如何在批大小为1的情况下执行此操作。是否可以在多个GPU上分发一个样本,这样我就不会收到OOM错误 编辑: 下面是用于构建模型的代码 def dice_loss(y_true, y_pred): numerator = 2 * tf.reduce_sum(y_tr
def dice_loss(y_true, y_pred):
numerator = 2 * tf.reduce_sum(y_true * y_pred, axis=(1,2,3))
denominator = tf.reduce_sum(y_true + y_pred, axis=(1,2,3))
return tf.reshape(1 - numerator / denominator, (-1, 1, 1))
class ResidualUnitEncode(keras.layers.Layer):
def __init__(self, filters=1, strides=1, activation="relu", **kwargs):
super().__init__(**kwargs)
self.activation = keras.activations.get(activation)
self.main_layers = [
keras.layers.Conv3D(filters, (3, 3, 3), strides=strides,
padding="same", use_bias=False),
keras.layers.BatchNormalization(),
self.activation,
keras.layers.Conv3D(filters, (3, 3, 3), strides=1,
padding="same", use_bias=False),
keras.layers.BatchNormalization()]
self.skip_layers = []
if strides > 1:
self.skip_layers = [
keras.layers.Conv3D(filters, (1, 1, 1), strides=strides,
padding="same", use_bias=False),
keras.layers.BatchNormalization()]
def call(self, inputs):
Z = inputs
for layer in self.main_layers:
Z = layer(Z)
skip_Z = inputs
for layer in self.skip_layers:
skip_Z = layer(skip_Z)
return self.activation(Z + skip_Z)
def get_config(self):
base_config = super(ResidualUnitEncode, self).get_config()
return base_config
class ResidualUnitDecode(keras.layers.Layer):
def __init__(self, filters=1, strides=1, activation="relu", **kwargs):
super().__init__(**kwargs)
self.activation = keras.activations.get(activation)
self.main_layers = [
keras.layers.Conv3DTranspose(filters, (3, 3, 3), strides=1,
padding="same", use_bias=False),
keras.layers.BatchNormalization(),
self.activation,
keras.layers.Conv3DTranspose(filters, (3, 3, 3), strides=strides,
padding="same", use_bias=False),
keras.layers.BatchNormalization()]
self.skip_layers = []
if strides > 1:
self.skip_layers = [
keras.layers.Conv3DTranspose(filters, (3, 3, 3), strides=strides,
padding="same", use_bias=False),
keras.layers.BatchNormalization()]
def call(self, inputs):
Z = inputs
for layer in self.main_layers:
Z = layer(Z)
skip_Z = inputs
for layer in self.skip_layers:
skip_Z = layer(skip_Z)
return self.activation(Z + skip_Z)
def get_config(self):
base_config = super(ResidualUnitDecode, self).get_config()
return base_config
def build_unet(image_shape, batch_size):
inputs = keras.layers.Input(shape=image_shape, batch_size=batch_size)
conv1 = keras.layers.Conv3D(64, (7, 7, 7), strides=(2, 2, 1), padding="same", use_bias=False, input_shape=image_shape)(inputs)
conv1 = keras.layers.BatchNormalization()(conv1)
conv1 = keras.layers.Activation("relu")(conv1)
pool1 = keras.layers.MaxPool3D(pool_size=(3, 3, 3), strides=1, padding="same")(conv1)
conv2 = ResidualUnitEncode(filters=128, strides=2)(pool1)
pool2 = keras.layers.MaxPool3D(pool_size=(3, 3, 3), strides=1, padding="same")(conv2)
conv3 = ResidualUnitEncode(filters=256, strides=2)(pool2)
pool3 = keras.layers.MaxPool3D(pool_size=(3, 3, 3), strides=1, padding="same")(conv3)
conv4 = ResidualUnitEncode(filters=512, strides=2)(pool3)
pool4 = keras.layers.MaxPool3D(pool_size=(3, 3, 3), strides=1, padding="same")(conv4)
conv5 = ResidualUnitEncode(filters=1024, strides=2)(pool4)
drop5 = keras.layers.Dropout(0.5)(conv5)
up6 = ResidualUnitDecode(filters=512, strides=2)(drop5)
merge6 = keras.layers.concatenate([conv4, up6], axis=4)
conv6 = ResidualUnitEncode(filters=512, strides=2)(merge6)
conv6 = keras.layers.UpSampling3D(size=(2,2,2))(conv6)
up7 = ResidualUnitDecode(filters=256, strides=2)(conv6)
merge7 = keras.layers.concatenate([conv3, up7], axis=4)
conv7 = ResidualUnitEncode(filters=256, strides=2)(merge7)
conv7 = keras.layers.UpSampling3D(size=(2, 2, 2))(conv7)
up8 = ResidualUnitDecode(filters=128, strides=2)(conv7)
merge8 = keras.layers.concatenate([conv2, up8], axis=4)
conv8 = ResidualUnitEncode(filters=128, strides=2)(merge8)
conv8 = keras.layers.UpSampling3D(size=(2, 2, 2))(conv8)
up9 = ResidualUnitDecode(filters=64, strides=2)(conv8)
merge9 = keras.layers.concatenate([conv1, up9], axis=4)
conv9 = ResidualUnitDecode(filters=64, strides=2)(merge9)
conv10 = keras.layers.Conv3D(1,1, strides=(1,1,2),activation="sigmoid")(conv9)
model = keras.Model(inputs, conv10)
model.compile(optimizer=keras.optimizers.Adam(lr=0.001), loss=dice_loss)
model.summary()
return model
以下是使用Kfold CV运行培训的代码:
image_shape = [256,256,64,2]
dataset = tf.data.TFRecordDataset('train.tfrecord').map(parse_record).batch(69)
nx = tf.compat.v1.data.make_one_shot_iterator(dataset)
x, y = nx.get_next()
x_test = x[55:69, ...]
y_test = y[55:69, ...]
x_train = x[0:54, ...]
y_train = y[0:54, ...]
kfold = KFold(n_splits=10, shuffle=True)
fold_no = 1
acc_per_fold = []
loss_per_fold = []
for train, test in kfold.split(x_train, y_train):
model = build_unet(image_shape=image_shape, batch_size=1)
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss')
model_file_name = './Fold_' + str(fold_no) + '_best_model.h5'
model_checkpoint = keras.callbacks.ModelCheckpoint(model_file_name, monitor='val_loss')
log_dir_name = './Fold_' + str(fold_no) + '_log_dir'
tb = keras.callbacks.TensorBoard(log_dir_name)
print('------------------------------------------------------------------------')
print(f'Training for fold {fold_no} ...')
train_id_rows = tf.constant(train.reshape(-1,1))
test_id_rows = tf.constant(test.reshape(-1,1))
x_train_train = tf.gather_nd(x_train, train_id_rows)
y_train_train = tf.gather_nd(y_train, train_id_rows)
x_train_test = tf.gather_nd(x_train, test_id_rows)
y_train_test = tf.gather_nd(y_train, test_id_rows)
history = model.fit(x_train_train, y_train_train, epochs=N_EPOCHS, callbacks=[tb, model_checkpoint, early_stopping], batch_size=1)
scores = model.evaluate(x_train_test, y_train_test, verbose=0)
acc_per_fold.append(scores[1] * 100)
loss_per_fold.append(scores[0])
fold_no = fold_no + 1
数据集中总共有69个样本,54个用于培训/验证循环。您的输入似乎不大。以float32(4字节)为例,它的大小是256*256*64*2*4/1024/1024=32MB。寻找OOMPlease的另一个源代码共享您用于构建模型的代码。我已经共享了模型和培训的代码。我同意数据看起来不大,但我不确定OOM错误的其他原因。谢谢你的投入似乎不大。以float32(4字节)为例,它的大小是256*256*64*2*4/1024/1024=32MB。寻找OOMPlease的另一个源代码共享您用于构建模型的代码。我已经共享了模型和培训的代码。我同意数据看起来不大,但我不确定OOM错误的其他原因。谢谢