Python: F-score of 0 when combining RoBERTa and BiLSTM


I am trying to stack an LSTM on top of a RoBERTa model for a binary classification problem. I have tried:

- freezing the RoBERTa embeddings
- fine-tuning the embeddings

In the frozen case I get an F-score of around 57%, which is relatively low compared to regular RoBERTa sequence classification, which reaches around 81% on the same data.

In the fine-tuning case I get an F-score of 0%, and the validation loss does not converge.

Most likely I am doing something wrong, but I really cannot see what.
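For reference, the 81% baseline mentioned above corresponds to a plain RobertaForSequenceClassification setup fine-tuned on the same data, roughly like this (just a sketch; the exact hyperparameters may differ):

import torch
from transformers import RobertaForSequenceClassification

# Plain RoBERTa with the built-in classification head, fine-tuned end to end.
baseline = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
baseline.cuda()
baseline_optimizer = torch.optim.Adam(baseline.parameters(), lr=2e-5)

# Inside the training loop (same dataloaders as below), the model computes
# the cross-entropy loss itself when labels are passed in:
#   outputs = baseline(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
#   loss = outputs[0]
#   loss.backward(); baseline_optimizer.step(); baseline_optimizer.zero_grad()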

Here is the model part:


import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaConfig

class RoBERTaLSTMClassifier(nn.Module):
    def __init__(self, bert_config, num_classes, hidden_size=None, dropout=0.5):
        """
        bert: pretrained bert model
        num_classes: the number of num_classes
        hidden_size: the number of hiddens which will be used by LSTM layer
        dropout: dropout rate
        """
        super(RoBERTaLSTMClassifier, self).__init__()
        self.num_classes = num_classes
        self.model = RobertaModel(bert_config)
        if hidden_size is None: self.hidden_size = bert_config.hidden_size
        else: self.hidden_size = hidden_size
        self.lstm = nn.LSTM(bert_config.hidden_size, self.hidden_size, bidirectional=True,batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.hidden_size * 2, 1)
        self.softmax = nn.Softmax()
        ## sigmoid non-linearity for binary classification
        self.sig = nn.Sigmoid()

    def forward(self, input_ids, attention_mask, current_batch_size, hidden):
        """
        all_layers: whether or not to return all encoded_layers
        return: logits in the following format (batch_size, num_classes)
        """
        with torch.no_grad():
            ## freeze the embeddings from RoBERTa
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # the last hidden state of RoBERTa is the input to the LSTM
        output, (hidden_h, hidden_c) = self.lstm(outputs[0], hidden)

        output_hidden = torch.cat((hidden_h[0], hidden_h[1]), dim=1) #[B, H*2]

        logits = self.classifier(self.dropout(output_hidden)) #[B, C]
        sig_out = self.sig(logits).view(current_batch_size, -1)

        ## the classifier outputs a single value per example, so take the last (only) column
        sig_out = sig_out[:, -1]
        hidden = (hidden_h, hidden_c)

        return sig_out, hidden

    def init_bilstm_hidden(self, batch_size):
        h0 = torch.zeros(2, batch_size, self.hidden_size).to(device) # 2 directions for the BiLSTM
        c0 = torch.zeros(2, batch_size, self.hidden_size).to(device)

        return (h0, c0)
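A quick shape check of the forward pass with dummy inputs (just a sanity-check sketch assuming roberta-base dimensions; the real input ids and attention masks come from the RoBERTa tokenizer):

import torch
from transformers import RobertaConfig

config = RobertaConfig.from_pretrained('roberta-base')
clf = RoBERTaLSTMClassifier(config, num_classes=2, hidden_size=256, dropout=0.5)

dummy_ids = torch.randint(0, config.vocab_size, (4, 32))   # [batch, seq_len]
dummy_mask = torch.ones_like(dummy_ids)
h0c0 = (torch.zeros(2, 4, 256), torch.zeros(2, 4, 256))    # (h0, c0) for the BiLSTM, on CPU

probs, h0c0 = clf(dummy_ids, dummy_mask, current_batch_size=4, hidden=h0c0)
print(probs.shape)   # torch.Size([4]) -- one sigmoid output per example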
**Here is the training loop part**

from sklearn.metrics import f1_score
from tqdm import tqdm, trange
import numpy as np
lr=0.0001
roberta_conf = RobertaConfig.from_pretrained('roberta-base')

num_classes = 2
hidden_size = 256

LSTMRoBERTaModel = RoBERTaLSTMClassifier(roberta_conf, num_classes=num_classes, hidden_size=hidden_size, dropout=0.5)
criterion = nn.BCELoss() ## binary cross entropy
optimizer = torch.optim.Adam(LSTMRoBERTaModel.parameters(), lr=lr)

epochs = 5
counter = 0
max_grad_norm = 1.0
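# bs, device, train_dataloader and dev_dataloader come from the data-preparation code (not shown here)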

nb_tr_examples, nb_tr_steps = 0, 0
for _ in trange(epochs, desc="Epoch"):
    LSTMRoBERTaModel.cuda()
    LSTMRoBERTaModel.train()
    tr_loss = 0
    y_preds = []
    y_true = []
    hidden_init = LSTMRoBERTaModel.init_bilstm_hidden(batch_size=bs)
    h = hidden_init
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        current_batch_size = b_input_ids.size()[0]
        ## detach the hidden state from the previous batch so gradients do not propagate across batches

        h = tuple([each.data for each in h])

        ## forward pass
        preds, h = LSTMRoBERTaModel.forward(b_input_ids, b_input_mask, current_batch_size,h)

        loss = criterion(preds.squeeze(),b_labels.float())

        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=LSTMRoBERTaModel.parameters(), max_norm=max_grad_norm)

        loss.backward()
        optimizer.step()
        LSTMRoBERTaModel.zero_grad()
    # print train loss per epoch
    print("\nTrain loss: {}".format(tr_loss/nb_tr_steps))     

    LSTMRoBERTaModel.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0


    val_h = LSTMRoBERTaModel.init_bilstm_hidden(bs)

    for batch in dev_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        current_batch_size = b_input_ids.size()[0]
        with torch.no_grad():

             preds, val_h = LSTMRoBERTaModel.forward(b_input_ids, b_input_mask, current_batch_size, val_h)
             loss = criterion(preds.squeeze(),b_labels.float())

        eval_loss += loss
        y_preds.extend(np.round(preds.data.cpu()))
        y_true.extend(b_labels.data.cpu())
        #print(preds[2], b_labels[2] )
        #eval_accuracy += f1_score(torch.tensor.numpy(b_labels.float), toch.tensor.numpy(preds))
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps    
    print("Validation loss: {}".format(eval_loss))
    print("F1 - Score: {}".format(f1_score(y_true,y_preds)))
    #print("F1- Score: {}".format(eval_accuracy/nb_eval_steps))     



**Comments:**

Comment: Why do you call LSTMRoBERTaModel.zero_grad() at the beginning of the loop instead of optimizer.zero_grad()?

Reply (asker): I thought they were the same, since the optimizer is defined before the loop as optimizer = torch.optim.Adam(LSTMRoBERTaModel.parameters(), lr=lr). To make sure, I replaced it with optimizer.zero_grad() anyway, and nothing changed.

Comment: You are not masking the loss. It gets averaged over all positions in the batch, including the positions that should be masked. I am not sure whether that alone can have such a dramatic effect, though.
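Regarding the zero_grad question in the comments: since the optimizer was constructed from all of LSTMRoBERTaModel.parameters(), optimizer.zero_grad() and LSTMRoBERTaModel.zero_grad() clear the same set of gradients, so swapping one for the other indeed changes nothing. A minimal standalone illustration:

import torch
import torch.nn as nn

model = nn.Linear(4, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def grads_cleared(m):
    # gradients are either zeroed or set to None, depending on the PyTorch version
    return all(p.grad is None or torch.all(p.grad == 0) for p in m.parameters())

model(torch.randn(2, 4)).sum().backward()
model.zero_grad()        # clears the gradients of every parameter of the module
assert grads_cleared(model)

model(torch.randn(2, 4)).sum().backward()
optimizer.zero_grad()    # clears the gradients of every parameter given to the optimizer
assert grads_cleared(model)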