NLP: Train GPT2 and Reformer from scratch

I am looking for scripts/notebooks to train GPT2 and Reformer models from scratch on German text. Something similar to:

I tried to adapt that same notebook, but GPT2 does not seem to accept a LineByLineTextDataset or padding.

The error I get is:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<timed eval> in <module>

~/anaconda3/envs/thesis_p1/lib/python3.6/site-packages/transformers/trainer.py in train(self, model_path)
    490                 self._past = None
    491 
--> 492             for step, inputs in enumerate(epoch_iterator):
    493 
    494                 # Skip past any already trained steps if resuming training

~/anaconda3/envs/thesis_p1/lib/python3.6/site-packages/tqdm/notebook.py in __iter__(self, *args, **kwargs)
    226     def __iter__(self, *args, **kwargs):
    227         try:
--> 228             for obj in super(tqdm_notebook, self).__iter__(*args, **kwargs):
    229                 # return super(tqdm...) will not catch exception
    230                 yield obj

~/anaconda3/envs/thesis_p1/lib/python3.6/site-packages/tqdm/std.py in __iter__(self)
   1128 
   1129         try:
-> 1130             for obj in iterable:
   1131                 yield obj
   1132                 # Update and possibly print the progressbar.

~/.local/lib/python3.6/site-packages/torch/utils/data/dataloader.py in __next__(self)
    344     def __next__(self):
    345         index = self._next_index()  # may raise StopIteration
--> 346         data = self.dataset_fetcher.fetch(index)  # may raise StopIteration
    347         if self.pin_memory:
    348             data = _utils.pin_memory.pin_memory(data)

~/.local/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     45         else:
     46             data = self.dataset[possibly_batched_index]
---> 47         return self.collate_fn(data)

~/anaconda3/envs/thesis_p1/lib/python3.6/site-packages/transformers/data/data_collator.py in __call__(self, examples)
     79 
     80     def __call__(self, examples: List[torch.Tensor]) -> Dict[str, torch.Tensor]:
---> 81         batch = self._tensorize_batch(examples)
     82         if self.mlm:
     83             inputs, labels = self.mask_tokens(batch)

~/anaconda3/envs/thesis_p1/lib/python3.6/site-packages/transformers/data/data_collator.py in _tensorize_batch(self, examples)
     96             if self.tokenizer._pad_token is None:
     97                 raise ValueError(
---> 98                     "You are attempting to pad samples but the tokenizer you are using"
     99                     f" ({self.tokenizer.__class__.__name__}) does not have one."
    100                 )

ValueError: You are attempting to pad samples but the tokenizer you are using (GPT2Tokenizer) does not have one.
First, I train a SentencePiece tokenizer:

from pathlib import Path
import sentencepiece as spm

# Collect corpus files (note: `paths` is not used below; the input file is passed explicitly)
paths = [str(x) for x in Path(".").glob("**/*.txt")]

# Train a SentencePiece model; this writes m_test.model and m_test.vocab
arg = '--input=deu-de_web-public_2019_1M-sentences.txt --model_prefix=m_test --vocab_size=52000'
spm.SentencePieceTrainer.train(arg)
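As a sanity check, the trained model can be loaded and used to segment a sample sentence (a minimal sketch, not part of the original post; the file name m_test.model comes from the --model_prefix above):

import sentencepiece as spm

# Load the freshly trained SentencePiece model and segment a sample German sentence
sp = spm.SentencePieceProcessor()
sp.load("m_test.model")
print(sp.encode_as_pieces("Das ist ein Beispielsatz."))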
Then I load my GPT2 tokenizer like this:

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("./German", additional_special_tokens=["<s>","<pad>","</s>","<unk>","<mask>"], max_len=512)
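Note that passing "<pad>" via additional_special_tokens adds it to the vocabulary but does not register it as the pad token, so tokenizer.pad_token stays None, which is exactly what the data collator complains about later. A quick illustrative check (not part of the original post):

print(tokenizer.pad_token)   # None: additional_special_tokens does not set the pad token
# Hypothetical workaround: assign the pad token explicitly
# tokenizer.pad_token = "<pad>"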
The dataset preparation logic:

from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./deu-de_web-public_2019_1M-sentences.txt",
    block_size=128,
)
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)
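For illustration only (not from the original post): with this tokenizer, batching two examples of different lengths through the collator is what triggers the error, because padding would be required but no pad token is defined.

import torch

# Two "examples" of different lengths force the collator to pad,
# which raises the same ValueError as in the traceback above.
data_collator([torch.tensor([1, 2, 3]), torch.tensor([4, 5])])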
The training logic:

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)
trainer.train()
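Note that the Trainer above references a model that the post never defines; for training from scratch it would presumably be a GPT2 model initialized from a fresh config, along these lines (an assumed sketch, not part of the original post):

from transformers import GPT2Config, GPT2LMHeadModel

# Fresh GPT2 configuration sized to the custom tokenizer (assumption: default small architecture)
config = GPT2Config(vocab_size=len(tokenizer), n_positions=512)
model = GPT2LMHeadModel(config)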

@cronoik I have added my current implementation and the stack trace above. I hope it is clear now.

You cannot use the LineByLineTextDataset class with GPT2 as shown above: it produces examples of varying lengths, so the data collator has to pad them, and GPT2Tokenizer has no pad token. Use TextDataset instead, which concatenates the corpus and splits it into fixed block_size chunks, so no padding is needed:
from transformers import TextDataset

dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="./deu-de_web-public_2019_1M-sentences.txt",
    block_size=128,
)
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)
trainer.train()
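After training finishes, the model and tokenizer would typically be written out for later use (a sketch assuming the ./output directory from the TrainingArguments above; not part of the original answer):

# Save the trained weights/config and the tokenizer files to the output directory
trainer.save_model("./output")
tokenizer.save_pretrained("./output")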