Python: how do I stop this error in a multi-GPU custom GAN?

I recently upgraded my rig, adding a 1070 Ti alongside my 1080 Ti to speed up training. Thanks to that, ordinary models now train faster. However, when I try to train a GAN that works fine on a single GPU, I cannot get it to run in the multi-GPU setup. I subclass tf.keras.Model with a custom train_step and a few other methods, and for the life of me I cannot get it to run without the following error:

tensorflow.python.framework.errors_impl.InvalidArgumentError: 3 root error(s) found.
  (0) Invalid argument:  Incompatible shapes: [8] vs. [16]
         [[node add (defined at Users\<User>\OneDrive\Documenten\HKU\Year 4\PDP_and_SN\Supportive Narrative\Research\Alpha_2\lib\NN.py:120) ]]
         [[replica_1/sequential_1/batch_normalization_10/Greater/_96]]
  (1) Invalid argument:  Incompatible shapes: [8] vs. [16]
         [[node add (defined at Users\<User>\OneDrive\Documenten\HKU\Year 4\PDP_and_SN\Supportive Narrative\Research\Alpha_2\lib\NN.py:120) ]]
         [[Adam_1/AddN/_140]]
  (2) Invalid argument:  Incompatible shapes: [8] vs. [16]
         [[node add (defined at Users\<User>\OneDrive\Documenten\HKU\Year 4\PDP_and_SN\Supportive Narrative\Research\Alpha_2\lib\NN.py:120) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_18178]
The code extends quite a bit beyond this, but somewhere in here should be the heart of the problem.
Thanks

Got it! MirroredStrategy splits each batch across the GPUs (so in my case, with two GPUs, it splits the batch size in two). For the model I had to divide the input batch size by two, but when creating the dataset I could keep the batch size the same.
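
In other words, the batch size the model uses to draw latent vectors inside train_step has to be the per-replica batch size, while the dataset stays batched with the global one. A minimal sketch of that arithmetic, assuming a hypothetical global_batch_size of 16 (the names are illustrative, not from the original code):

import tensorflow as tf

mirrored_strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])

global_batch_size = 16  # what dataset.batch(...) receives
# MirroredStrategy splits every global batch evenly across the replicas,
# so each GPU sees global_batch_size / num_replicas_in_sync examples.
per_replica_batch_size = global_batch_size // mirrored_strategy.num_replicas_in_sync  # 8 with two GPUs

# Pass per_replica_batch_size to the model so the generated batch matches
# the real-data slice each replica receives; keep global_batch_size for
# dataset.batch(...).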

import tensorflow as tf

class GAN_Model(tf.keras.Model):
    def __init__(self, generator, discriminator, latent_dim, batch_size):
        super(GAN_Model, self).__init__()
        self.discriminator = discriminator
        self.generator = generator
        self.latent_dim = latent_dim
        self.batch_size = batch_size
    
    def compile(self, discriminator_optimizer, generator_optimizer, loss_function):
        super(GAN_Model, self).compile()
        self.discriminator_optimizer = discriminator_optimizer
        self.generator_optimizer = generator_optimizer
        self.loss_function = loss_function

    def generator_loss(self, cross_entropy, fake_output):
        # The generator wants the discriminator to classify fakes as real (1s).
        return cross_entropy(tf.ones_like(fake_output), fake_output)

    def discriminator_loss(self, cross_entropy, real_output, fake_output):
        real_loss = cross_entropy(tf.ones_like(real_output), real_output)
        fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)
        # With Reduction.NONE these are per-example vectors, so their batch
        # dimensions must match; this add is where the traceback's
        # [8] vs. [16] shape error pointed.
        total_loss = real_loss + fake_loss
        return total_loss

    def train_step(self, real_audio):
        # Under MirroredStrategy, self.batch_size must be the per-replica
        # batch size: real_audio only holds this replica's slice of the batch.
        random_latent_vectors = tf.random.normal(shape=(self.batch_size, self.latent_dim))

        with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
            generated_images = self.generator(random_latent_vectors, training=True)
            # The dataset yields (spectrogram, label) tuples, so index 0 is the audio.
            real_output = self.discriminator(real_audio[0], training=True)
            fake_output = self.discriminator(generated_images, training=True)

            g_loss = self.generator_loss(self.loss_function, fake_output)
            d_loss = self.discriminator_loss(self.loss_function, real_output, fake_output)
        
        gradients_of_generator = gen_tape.gradient(g_loss, self.generator.trainable_variables)
        gradients_of_discriminator = disc_tape.gradient(d_loss, self.discriminator.trainable_variables)

        self.generator_optimizer.apply_gradients(zip(gradients_of_generator, self.generator.trainable_variables))
        self.discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, self.discriminator.trainable_variables))
        return {"d_loss": d_loss, "g_loss": g_loss, "prediction": generated_images}
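
A side note on this setup (an observation, not part of the original fix): the loss is created with Reduction.NONE, so cross_entropy returns one value per example, and under tf.distribute TensorFlow expects per-example losses to be averaged by the global batch size rather than the per-replica one. A sketch of how generator_loss could do that with tf.nn.compute_average_loss; global_batch_size is an assumed extra argument, not in the original code:

    def generator_loss(self, cross_entropy, fake_output, global_batch_size):
        # Per-example losses (Reduction.NONE), averaged over the GLOBAL batch
        # so that gradient contributions sum correctly across replicas.
        per_example = cross_entropy(tf.ones_like(fake_output), fake_output)
        return tf.nn.compute_average_loss(per_example, global_batch_size=global_batch_size)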

mirrored_strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"],cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())

with mirrored_strategy.scope():
    generator = make_generator(latent_dim)
    discriminator = make_discriminator(spectral_size)
    g_opt = tf.keras.optimizers.Adam(0.0001,beta_1=0.5)
    d_opt = tf.keras.optimizers.Adam(0.00012,beta_1=0.5)
    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True,reduction=tf.keras.losses.Reduction.NONE)
    # The batch size passed to the model must be the per-replica batch size
    # (the global batch size divided by the number of GPUs).
    gan = GAN_Model(generator, discriminator, latent_dim, batch_size)
    gan.compile(
        d_opt,
        g_opt,
        loss_fn,
    )
    ckpt = tf.train.Checkpoint(generator_optimizer=g_opt,
                               discriminator_optimizer=d_opt,
                               generator=generator,
                               discriminator=discriminator)
    manager = tf.train.CheckpointManager(ckpt, ".\\data\\checkpoints\\" + str(model_name), max_to_keep=15)

    if restore_model:
        ckpt.restore(manager.latest_checkpoint)
        
# The dataset keeps the full (global) batch size; MirroredStrategy splits
# each batch across the replicas on its own.
dataset = tf.data.experimental.load(
    dataset_dir,
    (tf.TensorSpec(shape=(spectral_size[0], spectral_size[1], spectral_size[2]), dtype=tf.double),
     tf.TensorSpec(shape=(2,), dtype=tf.double)),
    compression="GZIP").batch(batch_size)
print(dataset)

history = gan.fit(dataset, epochs=epochs, callbacks=[generate_and_save_audio(manager,model_name)])