PyTorch BERT fine-tuning: high loss and low accuracy in multi-class classification

Tags: pytorch, huggingface-transformers, pytorch-lightning

Binary classification with a fine-tuned BERT works fine, but I am stuck on multi-class classification. My dataset (German news articles with 10 classes) contains about 10,000 samples. Nevertheless, both the training loss and the average validation loss stay at around 2.2.
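
For context: with 10 classes, a cross-entropy loss of roughly ln(10) ≈ 2.30 is exactly what a model that predicts a uniform distribution over the classes produces, so a loss stuck around 2.2 means the model is doing barely better than random guessing. A minimal sanity check of that baseline (not part of the original code):

import math
import torch as th

num_classes = 10
print(math.log(num_classes))  # ~2.3026, the loss of a uniform 10-class classifier

# the same baseline with torch: identical logits -> uniform softmax
logits = th.zeros(16, num_classes)
targets = th.randint(0, num_classes, (16,))
print(th.nn.functional.cross_entropy(logits, targets))  # tensor(2.3026)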

Some NLP configuration variables:

DEBUG=True
VERSION = 1
MAX_LEN = 200 #Set the maximum length according to the diagram above
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-4
MOMENTUM = 0.9
TRAIN_SIZE = 0.7
NUM_LABELS = len(df_data.Labels.unique())
MODEL_NAME = "dbmdz/bert-base-german-cased"
params = {"debug": DEBUG, "max_len": MAX_LEN, "batch_size": BATCH_SIZE,
          "epochs": EPOCHS, "lr": LEARNING_RATE, "momentum": MOMENTUM,
          "model": MODEL_NAME, "loss": "BCEWithLogitsLoss", "optimizer": "SGD"}
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
The dataloader:

class NLPDataset(th.utils.data.Dataset):

  def __init__(self, dataframe, tokenizer, max_len):
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.data = dataframe
    self.text = dataframe.Text
    self.targets = dataframe.Labels
    self.len = len(self.text)

  def __getitem__(self,idx):
    text = str(self.text[idx])
    text = " ".join(text.split())

    inputs = self.tokenizer(
        text,
        None,
        add_special_tokens=True,
        max_length=self.max_len,
        pad_to_max_length=True,
        return_token_type_ids=True,
        truncation=True,
        padding='max_length',
    )
    input_ids = inputs['input_ids']
    mask = inputs['attention_mask']
    token_type_ids = inputs["token_type_ids"]

    return {
        'input_ids': th.tensor(input_ids, dtype=th.long),
        'mask': th.tensor(mask, dtype=th.float),
        'token_type_ids': th.tensor(token_type_ids, dtype=th.long),
        'targets': th.tensor(self.targets[idx], dtype=th.long)
    }
  
  def __len__(self):
    return self.len
The PyTorch Lightning module:

class NLPClassifier(pl.LightningModule):

  def __init__(self):
      super().__init__()

      # change the configuration to 10 labels instead of 2
      config = transformers.AutoConfig.from_pretrained(MODEL_NAME)
      config.num_labels = NUM_LABELS
      self.model = transformers.AutoModelForSequenceClassification.from_config(config)
      self.loss = th.nn.CrossEntropyLoss(reduction="none")

  def prepare_data(self):
      # train/val split
      train_dataset = df_data.sample(frac=TRAIN_SIZE)
      val_dataset=df_data.drop(train_dataset.index).sample(frac=1).reset_index(drop=True)
      train_dataset = train_dataset.reset_index(drop=True)

      # Assign CustomDataset Class
      train_set = NLPDataset(train_dataset, tokenizer, MAX_LEN)
      val_set = NLPDataset(val_dataset, tokenizer, MAX_LEN)

      print("FULL Dataset: {}".format(df_data.shape))
      print("TRAIN Dataset: {}".format(train_dataset.shape))
      print("VAL Dataset: {}".format(val_dataset.shape))

      # assign to use in dataloaders
      self.train_ds = train_set
      self.val_ds = val_set
      #self.test_dataset = mnist_test  # TODO
  
  def forward(self, input_ids, mask):
      logits, = self.model(input_ids, mask)
      # logits.shape: (16, 10)
      return logits
  
  def training_step(self, batch, batch_idx):
    logits = self.forward(batch['input_ids'], batch['mask']).squeeze()
    loss = self.loss(logits, batch['targets']).mean()
    run.log(name='train_loss', value=loss.tolist())
    return {'loss': loss, 'log': {'train_loss': loss}}
  
  def validation_step(self,batch, batch_idx):
    logits = self.forward(batch['input_ids'], batch['mask']).squeeze()
    print(logits.shape)
    acc = (logits.argmax(-1) == batch['targets']).float()
    loss = self.loss(logits, batch['targets'])
    run.log_list('loss', loss.tolist())
    run.log_list('acc', acc.tolist())
    return {'loss': loss, 'acc': acc}

  def validation_epoch_end(self, outputs):
    loss = th.cat([o['loss'] for o in outputs], 0).mean()
    acc = th.cat([o['acc'] for o in outputs], 0).mean()
    out = {'val_loss': loss, 'val_acc': acc}
    run.log('val_loss', loss.tolist())
    run.log('val_acc', acc.tolist())
    return {**out, 'log': {'val_loss': loss, 'val_acc': acc}}
    
  def train_dataloader(self):
      return th.utils.data.DataLoader(
          self.train_ds,
          batch_size=BATCH_SIZE,
          num_workers=8,
          drop_last=True,
          shuffle=False,
      )
  
  def val_dataloader(self):
      return th.utils.data.DataLoader(
          self.val_ds,
          batch_size=BATCH_SIZE,
          num_workers=8,
          drop_last=False,
          shuffle=False,
      )

  
  def configure_optimizers(self):
      return transformers.AdamW(
            self.model.parameters(),
            lr=LEARNING_RATE,
            #momentum=MOMENTUM,
        )
The trainer:

model = NLPClassifier()
trainer = pl.Trainer(
   gpus=(1 if th.cuda.is_available() else 0),
   default_root_dir = f"./models/version_{VERSION}",
   max_epochs=EPOCHS,
   fast_dev_run=DEBUG,
   limit_train_batches=1.0,
   val_check_interval=0.5,
   limit_val_batches=1.0,
   profiler=True,
   #logger=wandb_logger
   )
trainer.fit(model)

Here is a sample loss curve (the plot itself is not reproduced here).

My main questions are:

  • Is CrossEntropyLoss being used correctly here? (see the sketch after this list)
  • Is the optimizer actually doing anything? The predictions quickly become identical for every sample.
  • Adjusting the learning rate did not solve the problem; I tried values ranging from 1e-2 down to 1e-6.

Thanks for any help. :)
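
Regarding the first question, here is a minimal sketch (not from the original code; it only assumes 10 classes and batch size 16) of the shapes th.nn.CrossEntropyLoss expects: raw logits of shape (batch_size, num_classes) and integer class indices of shape (batch_size,), dtype long, not one-hot vectors:

import torch as th

loss_fn = th.nn.CrossEntropyLoss(reduction="none")  # same setting as in the LightningModule above

logits = th.randn(16, 10)                    # (batch_size, num_classes): raw, unnormalized scores
targets = th.randint(0, 10, (16,))           # class indices 0..9, dtype long, NOT one-hot
per_sample_loss = loss_fn(logits, targets)   # shape (16,), one loss value per sample
print(per_sample_loss.mean())

Note that the NLPDataset above already returns the targets as dtype=th.long, which matches this interface.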

Follow-up comments:

Did you build the NLPDataset class following some example code? If so, which one?

Initially I used one of the Hugging Face example notebooks, e.g. the multiclass DistilBERT fine-tuning notebook.

Are you sure this line is correct: inputs = self.tokenizer(text, …)? I am only familiar with the BERT tokenizer, but my educated guess is that the line should be inputs = self.tokenizer.encode_plus(…).

That's right, transformers used to have the encode_plus and batch_encode_plus methods, but since version 3.0.0 you can call the tokenizer directly, and it returns the same result as encode_plus, i.e. input_ids, attention_mask and token_type_ids.
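
A quick way to check that equivalence yourself (a minimal sketch using the same model name as above; not part of the original post):

import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
text = "Ein kurzer Beispielsatz."

# transformers >= 3.0.0: calling the tokenizer directly ...
direct = tokenizer(text, add_special_tokens=True, max_length=200,
                   truncation=True, padding='max_length', return_token_type_ids=True)

# ... returns the same fields as the older encode_plus API
legacy = tokenizer.encode_plus(text, add_special_tokens=True, max_length=200,
                               truncation=True, padding='max_length', return_token_type_ids=True)

print(direct.keys())                                          # input_ids, token_type_ids, attention_mask
print(direct['input_ids'] == legacy['input_ids'])             # True
print(direct['attention_mask'] == legacy['attention_mask'])   # True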