Python Transformer PyTorch error - ValueError: too many values to unpack (expected 2)
I am having trouble running my model. I am not sure which model to use in the translate_sentence function; I have tried model.transformer, model.encoder_de, etc. I believe it should be based on the Transformer class and its forward() function, but I am getting the ValueError shown in the title. Here are the directions:

Same as in the forward(self, src, tgt) function of the TransformerModel class, you need to create the appropriate masks and encode the source sentence (only once). You also need to create the appropriate masks for, and encode, the successively predicted output sentence. Unlike the source, for each iteration you need to re-encode the previous outputs and ...
import math
import torch
import torch.nn as nn
from torch.nn import Transformer

class TransformerModel(nn.Module):
    def __init__(self, ntoken_in, ntoken_out, ninp, nhead, npf_dim, nlayers, src_pad_idx, trg_pad_idx, dropout=0.5):
        super(TransformerModel, self).__init__()
        # --------------- params -----------------
        # ntoken_in: size of the source vocabulary after tokenization
        # ntoken_out: size of the target vocabulary after tokenization
        # ninp: the number of expected features in the encoder/decoder inputs
        # nhead: the number of attention heads
        # npf_dim: the dimension of the feedforward layer
        # src_pad_idx: the index of the padding token in the source language
        # trg_pad_idx: the index of the padding token in the target language
        # -----------------------------------------
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        self.transformer = Transformer(d_model=ninp, nhead=nhead, num_encoder_layers=nlayers, num_decoder_layers=nlayers,
                                       dim_feedforward=npf_dim, dropout=dropout, activation='relu')
        self.encoder_en = nn.Embedding(ntoken_in, ninp)   # token embedding for the input (source)
        self.encoder_de = nn.Embedding(ntoken_out, ninp)  # token embedding for the output (target)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken_out)
        self.src_pad_idx = src_pad_idx
        self.tgt_pad_idx = trg_pad_idx
        self.init_weights()
    def _generate_src_key_mask(self, src):
        # key_padding_mask for the transformer:
        # positions with the value True are ignored, while positions with the
        # value False are left unchanged. We mask all padding tokens.
        # The output dim is b*s
        src_mask = (src == self.src_pad_idx)
        return src_mask.T

    def _generate_tgt_mask(self, tgt, sz):
        # Besides the key_padding_mask, the output (teacher input) must be
        # masked causally to prevent the model from getting any information
        # about the future words it is going to predict.
        tgt_key_mask = (tgt == self.tgt_pad_idx)
        # We provide a FloatTensor attn_mask; it is added to the attention weights.
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        attn_mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)).to(tgt.device)
        return attn_mask, tgt_key_mask.T
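
    # For intuition: for sz=4 the additive attn_mask built above is
    #   tensor([[0., -inf, -inf, -inf],
    #           [0., 0., -inf, -inf],
    #           [0., 0., 0., -inf],
    #           [0., 0., 0., 0.]])
    # 0.0 lets a position attend; -inf blocks attention to future positions.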
    def init_weights(self):
        initrange = 0.1
        self.encoder_en.weight.data.uniform_(-initrange, initrange)
        self.encoder_de.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)
    def forward(self, src, tgt):
        # src
        src_key_mask = self._generate_src_key_mask(src)
        src = self.encoder_en(src) * math.sqrt(self.ninp)  # embed the stoi indices into feature space: s*b --> s*b*e
        src = self.pos_encoder(src)  # add the positional features
        # tgt
        tgt_mask, tgt_key_mask = self._generate_tgt_mask(tgt, tgt.size(0))
        tgt = self.encoder_de(tgt) * math.sqrt(self.ninp)
        tgt = self.pos_encoder(tgt)
        output = self.transformer(src, tgt, tgt_mask=tgt_mask,
                                  src_key_padding_mask=src_key_mask,
                                  tgt_key_padding_mask=tgt_key_mask)
        output = self.decoder(output)
        return output
class PositionalEncoding(nn.Module):
    # The positional encoding as described in the paper
    # https://arxiv.org/pdf/1706.03762.pdf
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
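
    # For reference, the pe buffer implements the paper's sinusoidal encoding:
    #   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
    #   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
    # div_term above equals 10000^(-2i / d_model), computed in log space for stability.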
# Here we initialize our model
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
print(INPUT_DIM, OUTPUT_DIM)

HID_DIM = 256
N_LAYERS = 3
N_HEADS = 8
N_PF_DIM = 512
DROPOUT = 0.1
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

model = TransformerModel(ntoken_in=INPUT_DIM, ntoken_out=OUTPUT_DIM, ninp=HID_DIM,
                         nhead=N_HEADS, npf_dim=N_PF_DIM, nlayers=N_LAYERS,
                         src_pad_idx=SRC_PAD_IDX, trg_pad_idx=TRG_PAD_IDX, dropout=DROPOUT).to(device)

def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

model.apply(initialize_weights)
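
As a quick sanity check, here is a hypothetical smoke test (the sizes are made up; it assumes the cells above have run). nn.Transformer defaults to sequence-first tensors, so the model consumes (seq_len, batch) LongTensors of token indices and returns per-position logits over the target vocabulary:

src_batch = torch.randint(0, INPUT_DIM, (7, 2)).to(device)   # (src_len=7, batch=2) token indices
trg_batch = torch.randint(0, OUTPUT_DIM, (5, 2)).to(device)  # (trg_len=5, batch=2) token indices
with torch.no_grad():
    logits = model(src_batch, trg_batch)
print(logits.shape)  # expected: torch.Size([5, 2, OUTPUT_DIM])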
----------
import spacy

def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50):
    model.eval()
    if isinstance(sentence, str):
        nlp = spacy.load('de')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]
    # tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)
    with torch.no_grad():
        # model.?
        hidden, cell = model.encoder_en(src_tensor)
    # create a list to hold the output sentence, initialized with an <sos> token
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(1).to(device)
        with torch.no_grad():
            # model.?
            output, hidden, cell = model.encoder_de(trg_tensor, hidden, cell)
        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)
        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    return trg_tokens[1:]
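
The unpacking error comes from hidden, cell = model.encoder_en(src_tensor): encoder_en is an nn.Embedding, so the call returns a single tensor of shape (src_len, 1, ninp), and unpacking it into two names iterates over dim 0 and fails with "too many values to unpack (expected 2)" whenever src_len != 2. The hidden/cell loop above is the pattern for an LSTM seq2seq model, not this Transformer. Below is a minimal greedy-decoding sketch that follows the directions, assuming the TransformerModel defined above: it encodes the source once with model.transformer.encoder, then on every iteration re-embeds all previously predicted tokens and runs model.transformer.decoder over them. (Note also that the call at the bottom of this post passes the fields swapped; with this signature it should be translate_sentence(src, SRC, TRG, model, device).)

def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50):
    model.eval()
    if isinstance(sentence, str):
        nlp = spacy.load('de')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)  # (src_len, 1)
    with torch.no_grad():
        # encode the source only once: mask, embed, add positions, run the encoder stack
        src_key_mask = model._generate_src_key_mask(src_tensor)
        src_emb = model.pos_encoder(model.encoder_en(src_tensor) * math.sqrt(model.ninp))
        memory = model.transformer.encoder(src_emb, src_key_padding_mask=src_key_mask)
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(1).to(device)  # (trg_len, 1)
        with torch.no_grad():
            # unlike the source, re-encode all previous outputs on every iteration
            tgt_mask, tgt_key_mask = model._generate_tgt_mask(trg_tensor, trg_tensor.size(0))
            tgt_emb = model.pos_encoder(model.encoder_de(trg_tensor) * math.sqrt(model.ninp))
            output = model.transformer.decoder(tgt_emb, memory, tgt_mask=tgt_mask,
                                               tgt_key_padding_mask=tgt_key_mask,
                                               memory_key_padding_mask=src_key_mask)
            output = model.decoder(output)           # (trg_len, 1, ntoken_out)
        pred_token = output[-1, 0].argmax().item()   # greedy pick from the last position
        trg_indexes.append(pred_token)
        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break
    return [trg_field.vocab.itos[i] for i in trg_indexes][1:]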
----------
# getting the error here
example_idx = 18
src = vars(train_data.examples[example_idx])['src']
trg = vars(train_data.examples[example_idx])['trg']
print(f'src = {src}')
print(f'trg = {trg}')
translation = translate_sentence(src, TRG, SRC, model, device)
print(f'predicted trg = {translation}')