TensorFlow T5 编码器模型输出全零?
标签:tensorflow, machine-learning, keras, huggingface-transformers

我正在做一个项目,在这个项目中,我使用 HuggingFace 的 T5EncoderModel 来获得输入句子的隐藏表示。我有 10 万个句子,按如下方式对它们进行分词(tokenize)和填充:
# Tokenize each sentence to a fixed length of 40 tokens (padding short ones,
# truncating long ones) and collect the token ids and attention masks.
tokenizer_kwargs = dict(
    max_length=40,
    padding='max_length',
    return_tensors='tf',
    truncation=True,
)
for sentence in dataset[original]:
    # The loop variable is rebound to the tokenizer output, as in the original.
    sentence = tokenizer(sentence, **tokenizer_kwargs)
    original_sentences.append(sentence.input_ids)
    org_mask.append(sentence.attention_mask)
def create_encoder(rep_dim):
    """Load the pretrained ``t5-small`` encoder, mark it trainable and return it.

    Args:
        rep_dim: Number of units of the Dense layer applied to the encoder
            output inside this function.

    Returns:
        The ``TFT5EncoderModel`` instance itself.

    NOTE(review): the Input -> Concatenate -> encoder -> Dense graph built
    below is never wrapped in a ``keras.Model`` nor returned, so ``rep_dim``
    has no effect on the returned object. The return value is left unchanged
    because callers use the bare encoder; confirm whether a functional model
    was intended instead.
    """
    encoder = TFT5EncoderModel.from_pretrained('t5-small', output_hidden_states=True)
    encoder.trainable = True

    # `shape` is written as an explicit 1-tuple; `(max_length)` is just an int.
    original_input = Input(shape=(max_length,), name='originalIn', dtype=tf.int32)
    # Each layer gets its own name: the original reused 'originalIn' for both
    # inputs, which Keras rejects once the layers end up in the same model.
    augmented_input = Input(shape=(max_length,), name='augmentedIn', dtype=tf.int32)

    # Join the two token-id sequences along the sequence axis.
    concat = keras.layers.Concatenate(axis=1)([original_input, augmented_input])

    # The encoder returns an output object; index 0 selects the tensor that is
    # fed to the Dense layer.
    encoded = encoder(input_ids=concat)[0]
    # Presumably [sentences, 2 * max_length, encoded_dims] after the
    # concatenation above -- TODO confirm.
    output = Dense(rep_dim, activation='relu')(encoded)
    return encoder
# Restore the weights written by the training run, then compute feature
# vectors for the original/augmented inputs.
training_model.load_weights('representation_learner.h5')
feature_vectors = training_model.predict(
    [[original_train, augmented_train]],
    verbose=1,
)
这给出了正确的输出,所有句子都被正确地分词。我遇到的问题出现在实际训练模型的时候。设置有点复杂,我正试图把原帖此处所链接示例中的方法应用到文本上。
训练设置如下:
# Tokenize each sentence to a fixed length of 40 tokens (padding short ones,
# truncating long ones) and collect the token ids and attention masks.
tokenizer_kwargs = dict(
    max_length=40,
    padding='max_length',
    return_tensors='tf',
    truncation=True,
)
for sentence in dataset[original]:
    # The loop variable is rebound to the tokenizer output, as in the original.
    sentence = tokenizer(sentence, **tokenizer_kwargs)
    original_sentences.append(sentence.input_ids)
    org_mask.append(sentence.attention_mask)
def create_encoder(rep_dim):
    """Load the pretrained ``t5-small`` encoder, mark it trainable and return it.

    Args:
        rep_dim: Number of units of the Dense layer applied to the encoder
            output inside this function.

    Returns:
        The ``TFT5EncoderModel`` instance itself.

    NOTE(review): the Input -> Concatenate -> encoder -> Dense graph built
    below is never wrapped in a ``keras.Model`` nor returned, so ``rep_dim``
    has no effect on the returned object. The return value is left unchanged
    because callers use the bare encoder; confirm whether a functional model
    was intended instead.
    """
    encoder = TFT5EncoderModel.from_pretrained('t5-small', output_hidden_states=True)
    encoder.trainable = True

    # `shape` is written as an explicit 1-tuple; `(max_length)` is just an int.
    original_input = Input(shape=(max_length,), name='originalIn', dtype=tf.int32)
    # Each layer gets its own name: the original reused 'originalIn' for both
    # inputs, which Keras rejects once the layers end up in the same model.
    augmented_input = Input(shape=(max_length,), name='augmentedIn', dtype=tf.int32)

    # Join the two token-id sequences along the sequence axis.
    concat = keras.layers.Concatenate(axis=1)([original_input, augmented_input])

    # The encoder returns an output object; index 0 selects the tensor that is
    # fed to the Dense layer.
    encoded = encoder(input_ids=concat)[0]
    # Presumably [sentences, 2 * max_length, encoded_dims] after the
    # concatenation above -- TODO confirm.
    output = Dense(rep_dim, activation='relu')(encoded)
    return encoder
# Restore the weights written by the training run, then compute feature
# vectors for the original/augmented inputs.
training_model.load_weights('representation_learner.h5')
feature_vectors = training_model.predict(
    [[original_train, augmented_train]],
    verbose=1,
)
该函数的返回值被传入上述链接中的 RepresentationLearner 类,如下所示:
class RepresentationLearner(keras.Model):
    """Contrastive representation learner: an encoder plus a projection head.

    Args:
        encoder: Callable model; ``call`` invokes it and indexes the result
            with ``[0]``.
        projection_units: Number of units of the projection head's Dense layer.
        temperature: Divisor applied to the pairwise similarity logits.
        dropout_rate: Rate of the Dropout layer at the start of the
            projection head.
        l2_normalize: When true, feature vectors are L2-normalized along the
            last axis before similarities are computed.
        **kwargs: Forwarded to ``keras.Model.__init__``.
    """

    def __init__(
        self,
        encoder,
        projection_units,
        temperature=0.8,
        dropout_rate=0.1,
        l2_normalize=False,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.encoder = encoder
        # Create projection head.
        self.projector = keras.Sequential(
            [
                layers.Dropout(dropout_rate),
                layers.Dense(units=projection_units, use_bias=False),
                layers.BatchNormalization(),
                layers.ReLU(),
            ]
        )
        self.temperature = temperature
        self.l2_normalize = l2_normalize
        self.loss_tracker = keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        """Return the metrics tracked by this model (only the mean loss)."""
        return [self.loss_tracker]

    def compute_contrastive_loss(self, feature_vectors, batch_size):
        """Compute the contrastive cross-entropy loss over pairwise similarities.

        Args:
            feature_vectors: Projected features; its first dimension is
                expected to be a multiple of ``batch_size``.
            batch_size: Number of distinct samples per batch.

        Returns:
            The per-row categorical cross-entropy between the tiled identity
            targets and the similarity logits.
        """
        num_augmentations = tf.shape(feature_vectors)[0] // batch_size
        if self.l2_normalize:
            feature_vectors = tf.math.l2_normalize(feature_vectors, -1)
        # The logits shape is [num_augmentations * batch_size, num_augmentations * batch_size].
        logits = (
            tf.linalg.matmul(feature_vectors, feature_vectors, transpose_b=True)
            / self.temperature
        )
        # Apply log-max trick for numerical stability.
        # keepdims=True keeps the maxima as a column of shape [N, 1], so each
        # ROW is shifted by its own maximum. Without it the [N] vector is
        # broadcast across columns, which changes the softmax instead of
        # merely stabilising it.
        logits_max = tf.math.reduce_max(logits, axis=1, keepdims=True)
        logits = logits - logits_max
        # The shape of targets is [num_augmentations * batch_size, num_augmentations * batch_size].
        # targets is a matrix consisting of num_augmentations submatrices of shape [batch_size * batch_size].
        # Each [batch_size * batch_size] submatrix is an identity matrix (diagonal entries are ones).
        targets = tf.tile(tf.eye(batch_size), [num_augmentations, num_augmentations])
        # Compute cross entropy loss
        return keras.losses.categorical_crossentropy(
            y_true=targets, y_pred=logits, from_logits=True
        )

    def call(self, inputs):
        """Encode ``inputs[0]`` and pass the result through the projection head."""
        features = self.encoder(inputs[0])[0]
        # Apply projection head.
        # NOTE(review): `features[0]` selects only the first entry along the
        # leading axis of the encoder output. If that axis is the batch axis,
        # every other sample is dropped -- confirm whether pooling over tokens
        # (e.g. `features[:, 0]` or a mean) was intended.
        return self.projector(features[0])

    def train_step(self, inputs):
        """Run one optimisation step and return the current metric values."""
        # NOTE(review): assumes `inputs` is a single tensor whose first axis is
        # the batch; verify this holds when `fit` is given a list of inputs.
        batch_size = tf.shape(inputs)[0]
        # Run the forward pass and compute the contrastive loss
        with tf.GradientTape() as tape:
            feature_vectors = self(inputs, training=True)
            loss = self.compute_contrastive_loss(feature_vectors, batch_size)
        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Update loss tracker metric
        self.loss_tracker.update_state(loss)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}

    def test_step(self, inputs):
        """Compute the contrastive loss without updating weights."""
        batch_size = tf.shape(inputs)[0]
        feature_vectors = self(inputs, training=False)
        loss = self.compute_contrastive_loss(feature_vectors, batch_size)
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}
为了训练该模型,我使用 Colab TPU,训练代码如下:
# Build and compile the model under the distribution strategy's scope.
# NOTE(review): the original indentation was lost; this assumes only model
# construction and compilation belong inside the scope -- confirm.
with strategy.scope():
    encoder = create_encoder(rep_dim)
    training_model = RepresentationLearner(encoder=encoder, projection_units=128, temperature=0.1)
    lr_scheduler = keras.experimental.CosineDecay(initial_learning_rate=0.001, decay_steps=500, alpha=0.1)
    training_model.compile(optimizer=tfa.optimizers.AdamW(learning_rate=lr_scheduler, weight_decay=0.0001))

# The keyword is `epochs`; the original `epocs` is not a parameter of
# `Model.fit` and raises a TypeError.
history = training_model.fit(
    x=[original_train, augmented_train],
    batch_size=32 * 8,
    epochs=10,
)
# Persist the trained weights so they can be reloaded for prediction.
training_model.save_weights('representation_learner.h5', overwrite=True)
请注意,我给模型提供了两个输入。当我对输入数据进行预测时,得到的结果全为零,我不明白为什么。我的预测代码如下:
# Tokenize each sentence to a fixed length of 40 tokens (padding short ones,
# truncating long ones) and collect the token ids and attention masks.
tokenizer_kwargs = dict(
    max_length=40,
    padding='max_length',
    return_tensors='tf',
    truncation=True,
)
for sentence in dataset[original]:
    # The loop variable is rebound to the tokenizer output, as in the original.
    sentence = tokenizer(sentence, **tokenizer_kwargs)
    original_sentences.append(sentence.input_ids)
    org_mask.append(sentence.attention_mask)
def create_encoder(rep_dim):
    """Load the pretrained ``t5-small`` encoder, mark it trainable and return it.

    Args:
        rep_dim: Number of units of the Dense layer applied to the encoder
            output inside this function.

    Returns:
        The ``TFT5EncoderModel`` instance itself.

    NOTE(review): the Input -> Concatenate -> encoder -> Dense graph built
    below is never wrapped in a ``keras.Model`` nor returned, so ``rep_dim``
    has no effect on the returned object. The return value is left unchanged
    because callers use the bare encoder; confirm whether a functional model
    was intended instead.
    """
    encoder = TFT5EncoderModel.from_pretrained('t5-small', output_hidden_states=True)
    encoder.trainable = True

    # `shape` is written as an explicit 1-tuple; `(max_length)` is just an int.
    original_input = Input(shape=(max_length,), name='originalIn', dtype=tf.int32)
    # Each layer gets its own name: the original reused 'originalIn' for both
    # inputs, which Keras rejects once the layers end up in the same model.
    augmented_input = Input(shape=(max_length,), name='augmentedIn', dtype=tf.int32)

    # Join the two token-id sequences along the sequence axis.
    concat = keras.layers.Concatenate(axis=1)([original_input, augmented_input])

    # The encoder returns an output object; index 0 selects the tensor that is
    # fed to the Dense layer.
    encoded = encoder(input_ids=concat)[0]
    # Presumably [sentences, 2 * max_length, encoded_dims] after the
    # concatenation above -- TODO confirm.
    output = Dense(rep_dim, activation='relu')(encoded)
    return encoder
# Restore the weights written by the training run, then compute feature
# vectors for the original/augmented inputs.
training_model.load_weights('representation_learner.h5')
feature_vectors = training_model.predict(
    [[original_train, augmented_train]],
    verbose=1,
)
输出为:
array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]], dtype=float32)
输出的形状很大,为 (1000000, 128)。如果需要更多细节,请告诉我!:)

评论:你的模型训练成功了吗?结果符合预期吗?

回复:@M.Innat 是的,模型训练成功,损失在 5 个 epoch 内从 10000 降到了 145。