Python Transformer PyTorch error - ValueError: too many values to unpack (expected 2)


I am having trouble running my model. I am not sure which model to use in the translate_sentence function; I have tried model.transformer, model.encoder_de, etc. I believe it should be based on the Transformer class and its forward() function, but I get a type error. Here are the directions:

  • As in the forward(self, src, tgt) function of the TransformerModel class, you need to create the appropriate masks and encode the source sentence (only once)

  • You also need to create the appropriate masks for the successively predicted output sentence. Unlike the source, on every iteration you need to re-encode the previous outputs and pass both the source sentence and the previous outputs to the transformer (a small mask illustration follows this list)
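For reference, here is a minimal illustration of the two masks the directions ask for; it mirrors what _generate_tgt_mask in the code below produces, and the shapes assume sequence-first (S, N) batches as in the rest of the post:

    import torch

    # Additive causal ("subsequent") mask for a target of length 4:
    # 0.0 where attending is allowed, -inf where a position would see the future.
    sz = 4
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    attn_mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, 0.0)
    print(attn_mask)
    # tensor([[0., -inf, -inf, -inf],
    #         [0., 0., -inf, -inf],
    #         [0., 0., 0., -inf],
    #         [0., 0., 0., 0.]])

    # Boolean key_padding_mask: True marks padding positions that should be ignored.
    # nn.Transformer expects it as (N, S), hence the transpose of an (S, N) batch.
    PAD_IDX = 1                                   # hypothetical padding index
    tgt = torch.tensor([[5, 7], [9, 1], [1, 1]])  # (S=3, N=2), 1 is padding
    tgt_key_padding_mask = (tgt == PAD_IDX).T     # (N, S)
    print(tgt_key_padding_mask)
    # tensor([[False, False,  True],
    #         [False,  True,  True]])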

    
    from torch.nn import Transformer
    class TransformerModel(nn.Module):
    
        def __init__(self, ntoken_in, ntoken_out, ninp, nhead, npf_dim, nlayers, src_pad_idx, trg_pad_idx, dropout=0.5):
            super(TransformerModel, self).__init__()
    
            # --------------- param -----------------
            # ntoken_in: the idx of the input word after tokenization 
            # ntoken_out: the idx of the input word w.r.t. the tokenization 
            # ninp: the number of expected features in the encoder/decoder inputs 
            # nhead: the number of multiAttention heads 
            # npf_dim: the dimension of the feedforward layer 
            # src_pad_idx: the token for padding in source language
            # trg_pad_idx: the token for padding in target language 
            # ----------------------------------------
    
            self.model_type = 'Transformer'
            self.pos_encoder = PositionalEncoding(ninp, dropout)
            self.transformer = Transformer(d_model=ninp, nhead=nhead, num_encoder_layers=nlayers, num_decoder_layers=nlayers,
                                           dim_feedforward=npf_dim, dropout=dropout, activation='relu')
          
            self.encoder_en = nn.Embedding(ntoken_in, ninp)  # tok_embedding for input 
            self.encoder_de = nn.Embedding(ntoken_out, ninp) # tok_embedding for output 
            self.ninp = ninp
            self.decoder = nn.Linear(ninp, ntoken_out)
    
            self.src_pad_idx = src_pad_idx
            self.tgt_pad_idx = trg_pad_idx
    
            self.init_weights()
    
        def _generate_src_key_mask(self, src):
            # for key_padding_mask in transformer
            # the positions with the value of True will be ignored while the position
            # with the value of False will be unchanged. We mask all padding words. 
            # The output dim is b*s
            src_mask = (src == self.src_pad_idx)
            return src_mask.T
    
        def _generate_tgt_mask(self, tgt, sz):
            # Beside key_padding_mask in transformer, the output or teacher input 
            # should be masked sequentially to prevent the model get any information 
            # from the future words it is going to predict 
            tgt_key_mask = tgt == self.tgt_pad_idx
    
            # We provide FloatTensor attn_mask. It will be added to the attention weight.
            mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
            attn_mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)).to(tgt.device)
            return attn_mask, tgt_key_mask.T
    
        def init_weights(self):
            initrange = 0.1
            self.encoder_en.weight.data.uniform_(-initrange, initrange)
            self.encoder_de.weight.data.uniform_(-initrange, initrange)
            self.decoder.bias.data.zero_()
            self.decoder.weight.data.uniform_(-initrange, initrange)
    
        def forward(self, src, tgt):
            # src
            src_key_mask = self._generate_src_key_mask(src)
            src = self.encoder_en(src) * math.sqrt(self.ninp)  # use a learned encoder put stoi index to a feature space s*b --> s*b*e
            src = self.pos_encoder(src)  # add the pos feature toward feature space
    
            # tgt
            tgt_mask, tgt_key_mask = self._generate_tgt_mask(tgt, tgt.size(0))
            tgt = self.encoder_de(tgt) * math.sqrt(self.ninp) 
            tgt = self.pos_encoder(tgt)
    
            output = self.transformer(src, tgt, tgt_mask=tgt_mask, 
                                      src_key_padding_mask = src_key_mask, 
                                      tgt_key_padding_mask = tgt_key_mask)
            output = self.decoder(output)
            return output
    
    class PositionalEncoding(nn.Module):
        # The positional encoding as described in the paper 
        # https://arxiv.org/pdf/1706.03762.pdf
        def __init__(self, d_model, dropout=0.1, max_len=5000):
            super(PositionalEncoding, self).__init__()
            self.dropout = nn.Dropout(p=dropout)
    
            pe = torch.zeros(max_len, d_model)
            position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
            div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
            pe[:, 0::2] = torch.sin(position * div_term)
            pe[:, 1::2] = torch.cos(position * div_term)
            pe = pe.unsqueeze(0).transpose(0, 1)
            self.register_buffer('pe', pe)
    
        def forward(self, x):
            x = x + self.pe[:x.size(0), :]
            return self.dropout(x)
    
    # Here we intialize our model
    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)
    print(INPUT_DIM, OUTPUT_DIM)
    
    HID_DIM = 256
    N_LAYERS = 3
    N_HEADS = 8
    N_PF_DIM = 512
    DROPOUT = 0.1
    
    SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
    TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
    
    model = TransformerModel(ntoken_in=INPUT_DIM, ntoken_out=OUTPUT_DIM, ninp=HID_DIM,
                            nhead=N_HEADS, npf_dim=N_PF_DIM, nlayers=N_LAYERS,
                            src_pad_idx=SRC_PAD_IDX, trg_pad_idx=TRG_PAD_IDX, dropout=DROPOUT).to(device)
    
    def count_parameters(model: nn.Module):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    print(f'The model has {count_parameters(model):,} trainable parameters')
    
    def initialize_weights(m):
        if hasattr(m, 'weight') and m.weight.dim() > 1:
            nn.init.xavier_uniform_(m.weight.data)
    
    model.apply(initialize_weights)
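    
To make the expected call signature clearer before the translation step, here is a hypothetical smoke test (not part of the original post), assuming sequence-first (S, N) LongTensor batches:

    # Sanity-check the forward(src, tgt) signature with dummy data.
    src_dummy = torch.randint(0, INPUT_DIM, (12, 4)).to(device)    # (S_src, N)
    trg_dummy = torch.randint(0, OUTPUT_DIM, (9, 4)).to(device)    # (S_trg, N)
    with torch.no_grad():
        out = model(src_dummy, trg_dummy)
    print(out.shape)  # expected: (9, 4, OUTPUT_DIM), one logit vector per target position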
    
    
    ----------
    def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):
    
        model.eval()
            
        if isinstance(sentence, str):
            nlp = spacy.load('de')
            tokens = [token.text.lower() for token in nlp(sentence)]
        else:
            tokens = [token.lower() for token in sentence]
    
        #tokens = [src_field.init_token] + tokens + [src_field.eos_token]  
        src_indexes = [src_field.vocab.stoi[token] for token in tokens]
        src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)
    
        with torch.no_grad():
            #model.?
            hidden, cell = model.encoder_en(src_tensor)
    
        # create a list to hold the output sentence, initialized with an <sos> token   
        
        trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
        
        for i in range(max_len):
    
            trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(1).to(device)
            with torch.no_grad():
                #model.?
                output, hidden, cell = model.encoder_de(trg_tensor, hidden, cell)
                
            pred_token = output.argmax(1).item()
            
            trg_indexes.append(pred_token)
    
            if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
                break
        
        trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
        
        return trg_tokens[1:]
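
For comparison, here is a rough sketch of the decoding loop the directions describe, assuming the forward(src, tgt) defined above and sequence-first (S, N) tensors; it calls the full model (not a sub-module such as model.encoder_de) and re-feeds the growing target sequence on every step. It reuses the names from the function above (src_tensor, trg_indexes, trg_field, max_len) and is meant as an illustration, not a verified drop-in fix:

    # Sketch only: pass src plus all tokens predicted so far through the whole
    # model each iteration, then take the argmax at the last time step.
    with torch.no_grad():
        for i in range(max_len):
            trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(1).to(device)  # (T, 1)
            output = model(src_tensor, trg_tensor)        # (T, 1, ntoken_out)
            pred_token = output[-1, 0].argmax().item()    # next-token prediction
            trg_indexes.append(pred_token)
            if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
                break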
    
    
    ----------
    #getting error here
    
    example_idx = 18
    
    src = vars(train_data.examples[example_idx])['src']
    trg = vars(train_data.examples[example_idx])['trg']
    
    print(f'src = {src}')
    print(f'trg = {trg}')
    
    translation = translate_sentence(src, SRC, TRG, model, device)
    
    print(f'predicted trg = {translation}')