TensorFlow T5 encoder model outputs all zeros?


I'm working on a project where I use the T5EncoderModel from HuggingFace to get hidden representations of input sentences. I have 100,000 sentences, which I tokenize and pad as follows:

original_sentences = []
org_mask = []

for sentence in dataset[original]:
    sentence = tokenizer(sentence, max_length=40, padding='max_length',
                         return_tensors='tf', truncation=True)
    original_sentences.append(sentence.input_ids)
    org_mask.append(sentence.attention_mask)
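
The loop above leaves original_sentences as a Python list of (1, 40) tensors. The stacking into the original_train / augmented_train arrays used further down is not shown here; a minimal sketch of that step (the exact approach is an assumption):

import tensorflow as tf

# Stack the per-sentence (1, 40) tensors into a single (num_sentences, 40)
# int32 tensor that the model can consume. This stacking step is assumed,
# as it is not shown in the code above.
original_train = tf.concat(original_sentences, axis=0)
org_mask_train = tf.concat(org_mask, axis=0)
print(original_train.shape)  # e.g. (100000, 40)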

This gives me the correct output and tokenizes everything properly. The problem I run into is when I actually try to train the model. The setup is a bit convoluted: I'm trying to apply the approach from the linked example to text.

The training setup is as follows:


def create_encoder(rep_dim):
    encoder = TFT5EncoderModel.from_pretrained('t5-small', output_hidden_states=True)
    encoder.trainable = True

    original_input = Input(shape=(max_length,), name='originalIn', dtype=tf.int32)
    augmented_input = Input(shape=(max_length,), name='augmentedIn', dtype=tf.int32)

    # Concatenate the original and augmented token ids along the sequence axis
    concat = keras.layers.Concatenate(axis=1)([original_input, augmented_input])

    # Take index 0 because the encoder returns a model-output object;
    # index 0 is the last hidden state tensor
    encoded = encoder(input_ids=concat)[0]

    # This outputs shape: [sentences, max_length, encoded_dims]
    output = Dense(rep_dim, activation='relu')(encoded)

    return encoder
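
For reference, index 0 of the encoder call above is the last-hidden-state tensor. A standalone sketch of that call (shapes assume t5-small, whose hidden size is 512):

from transformers import T5Tokenizer, TFT5EncoderModel

# Check what the encoder inside create_encoder returns: index 0 of the output
# is last_hidden_state, with shape (batch_size, sequence_length, 512) for t5-small.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
encoder = TFT5EncoderModel.from_pretrained('t5-small')

toks = tokenizer(["a quick sanity-check sentence"], max_length=40,
                 padding='max_length', return_tensors='tf', truncation=True)
out = encoder(input_ids=toks.input_ids, attention_mask=toks.attention_mask)
print(out[0].shape)  # (1, 40, 512) -- last_hidden_state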
This function is then fed into the RepresentationLearner class from the above link, which looks like this:

class RepresentationLearner(keras.Model):
    def __init__(
        self,
        encoder,
        projection_units,
        temperature=0.8,
        dropout_rate=0.1,
        l2_normalize=False,
        **kwargs
    ):
        super(RepresentationLearner, self).__init__(**kwargs)
        self.encoder = encoder
        # Create projection head.
        self.projector = keras.Sequential(
            [
                layers.Dropout(dropout_rate),
                layers.Dense(units=projection_units, use_bias=False),
                layers.BatchNormalization(),
                layers.ReLU(),
            ]
        )
        self.temperature = temperature
        self.l2_normalize = l2_normalize
        self.loss_tracker = keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        return [self.loss_tracker]

    def compute_contrastive_loss(self, feature_vectors, batch_size):
        num_augmentations = tf.shape(feature_vectors)[0] // batch_size
        if self.l2_normalize:
            feature_vectors = tf.math.l2_normalize(feature_vectors, -1)
        # The logits shape is [num_augmentations * batch_size, num_augmentations * batch_size].
        logits = (
            tf.linalg.matmul(feature_vectors, feature_vectors, transpose_b=True)
            / self.temperature
        )
        # Apply log-max trick for numerical stability.
        logits_max = tf.math.reduce_max(logits, axis=1)
        logits = logits - logits_max
        # The shape of targets is [num_augmentations * batch_size, num_augmentations * batch_size].
        # targets is a matrix consisting of num_augmentations x num_augmentations submatrices of shape [batch_size, batch_size].
        # Each [batch_size, batch_size] submatrix is an identity matrix (diagonal entries are ones).
        targets = tf.tile(tf.eye(batch_size), [num_augmentations, num_augmentations])
        # Compute cross entropy loss
        return keras.losses.categorical_crossentropy(
            y_true=targets, y_pred=logits, from_logits=True
        )

    def call(self, inputs):
        features = self.encoder(inputs[0])[0]
        # Apply projection head.
        return self.projector(features[0])

    def train_step(self, inputs):
        batch_size = tf.shape(inputs)[0]
        # Run the forward pass and compute the contrastive loss
        with tf.GradientTape() as tape:
            feature_vectors = self(inputs, training=True)
            loss = self.compute_contrastive_loss(feature_vectors, batch_size)
        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Update loss tracker metric
        self.loss_tracker.update_state(loss)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}

    def test_step(self, inputs):
        batch_size = tf.shape(inputs)[0]
        feature_vectors = self(inputs, training=False)
        loss = self.compute_contrastive_loss(feature_vectors, batch_size)
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

To train it, I use a Colab TPU and train it as follows:

with strategy.scope():
  encoder = create_encoder(rep_dim)
  training_model = RepresentationLearner(encoder=encoder, projection_units=128, temperature=0.1)


  lr_scheduler = keras.experimental.CosineDecay(initial_learning_rate=0.001, decay_steps=500, alpha=0.1)
  training_model.compile(optimizer=tfa.optimizers.AdamW(learning_rate=lr_scheduler, weight_decay=0.0001))
  history = training_model.fit(x=[original_train, augmented_train], batch_size=32*8, epochs=10)
  
  training_model.save_weights('representation_learner.h5', overwrite=True)
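
The strategy used above is assumed to be a Colab TPU strategy created roughly like this (the exact setup is not shown in the post):

import tensorflow as tf

# Typical Colab TPU setup; assumed, since the strategy creation is not shown above.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver)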

Note that I give my model two inputs. When I run predictions on the input data, I get all zeros, and I can't figure out why. I predict as follows:

training_model.load_weights('representation_learner.h5')
feature_vectors = training_model.predict([[original_train, augmented_train]], verbose=1)
The output is:

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)


with a much too large shape of (1000000, 128).
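
To narrow down where the zeros come from, one hypothetical check (not part of the code above) is to run a small batch through the underlying encoder alone, before the projection head:

import tensorflow as tf

# Hypothetical diagnostic: if the raw encoder features are already zero, the
# problem is in the encoder itself; if they are non-zero, the projection head
# (Dense/ReLU layers) is collapsing the output to zeros.
sample_ids = original_train[:8]                 # (8, 40) token ids, assumed tensor
hidden = training_model.encoder(sample_ids)[0]  # (8, 40, 512) for t5-small
print(tf.reduce_max(tf.abs(hidden)).numpy())    # 0.0 would implicate the encoder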

Please let me know if more details are needed! :)

Comment: Did your model train successfully? As expected?
Reply: @M.Innat Yes, the model trained successfully, with the loss going from 10000 down to 145 over 5 epochs.