Python Transformer PyTorch error - ValueError: too many values to unpack (expected 2)


I am having trouble running my model. I am not sure which model to use in the translate_sentence function; I have tried model.transformer, model.encoder_de, etc. I believe it should be based on the Transformer class and its forward() function, but I get a type error. Here are the directions:

  • As in the forward(self, src, tgt) function of the TransformerModel class, you need to create the appropriate masks and encode the source sentence (only once)

  • You also need to create the appropriate masks for the successively predicted output sentence. Unlike the source, on every iteration you need to re-encode the previous outputs and pass both the source sentence and the previous outputs to the transformer (a small mask illustration follows this list)
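For reference, here is a minimal illustration of the two masks the directions ask for; it mirrors what _generate_tgt_mask in the code below produces, and the shapes assume sequence-first (S, N) batches as in the rest of the post:

    import torch

    # Additive causal ("subsequent") mask for a target of length 4:
    # 0.0 where attending is allowed, -inf where a position would see the future.
    sz = 4
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    attn_mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, 0.0)
    print(attn_mask)
    # tensor([[0., -inf, -inf, -inf],
    #         [0., 0., -inf, -inf],
    #         [0., 0., 0., -inf],
    #         [0., 0., 0., 0.]])

    # Boolean key_padding_mask: True marks padding positions that should be ignored.
    # nn.Transformer expects it as (N, S), hence the transpose of an (S, N) batch.
    PAD_IDX = 1                                   # hypothetical padding index
    tgt = torch.tensor([[5, 7], [9, 1], [1, 1]])  # (S=3, N=2), 1 is padding
    tgt_key_padding_mask = (tgt == PAD_IDX).T     # (N, S)
    print(tgt_key_padding_mask)
    # tensor([[False, False,  True],
    #         [False,  True,  True]])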

    
    from torch.nn import Transformer
    class TransformerModel(nn.Module):
    
        def __init__(self, ntoken_in, ntoken_out, ninp, nhead, npf_dim, nlayers, src_pad_idx, trg_pad_idx, dropout=0.5):
            super(TransformerModel, self).__init__()
    
            # --------------- param -----------------
            # ntoken_in: the idx of the input word after tokenization 
            # ntoken_out: the idx of the input word w.r.t. the tokenization 
            # ninp: the number of expected features in the encoder/decoder inputs 
            # nhead: the number of multiAttention heads 
            # npf_dim: the dimension of the feedforward layer 
            # src_pad_idx: the token for padding in source language
            # trg_pad_idx: the token for padding in target language 
            # ----------------------------------------
    
            self.model_type = 'Transformer'
            self.pos_encoder = PositionalEncoding(ninp, dropout)
            self.transformer = Transformer(d_model=ninp, nhead=nhead, num_encoder_layers=nlayers, num_decoder_layers=nlayers,
                                           dim_feedforward=npf_dim, dropout=dropout, activation='relu')
          
            self.encoder_en = nn.Embedding(ntoken_in, ninp)  # tok_embedding for input 
            self.encoder_de = nn.Embedding(ntoken_out, ninp) # tok_embedding for output 
            self.ninp = ninp
            self.decoder = nn.Linear(ninp, ntoken_out)
    
            self.src_pad_idx = src_pad_idx
            self.tgt_pad_idx = trg_pad_idx
    
            self.init_weights()
    
        def _generate_src_key_mask(self, src):
            # for key_padding_mask in transformer
            # the positions with the value of True will be ignored while the position
            # with the value of False will be unchanged. We mask all padding words. 
            # The output dim is b*s
            src_mask = (src == self.src_pad_idx)
            return src_mask.T
    
        def _generate_tgt_mask(self, tgt, sz):
            # Beside key_padding_mask in transformer, the output or teacher input 
            # should be masked sequentially to prevent the model get any information 
            # from the future words it is going to predict 
            tgt_key_mask = tgt == self.tgt_pad_idx
    
            # We provide FloatTensor attn_mask. It will be added to the attention weight.
            mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
            attn_mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)).to(tgt.device)
            return attn_mask, tgt_key_mask.T
    
        def init_weights(self):
            initrange = 0.1
            self.encoder_en.weight.data.uniform_(-initrange, initrange)
            self.encoder_de.weight.data.uniform_(-initrange, initrange)
            self.decoder.bias.data.zero_()
            self.decoder.weight.data.uniform_(-initrange, initrange)
    
        def forward(self, src, tgt):
            # src
            src_key_mask = self._generate_src_key_mask(src)
            src = self.encoder_en(src) * math.sqrt(self.ninp)  # use a learned encoder put stoi index to a feature space s*b --> s*b*e
            src = self.pos_encoder(src)  # add the pos feature toward feature space
    
            # tgt
            tgt_mask, tgt_key_mask = self._generate_tgt_mask(tgt, tgt.size(0))
            tgt = self.encoder_de(tgt) * math.sqrt(self.ninp) 
            tgt = self.pos_encoder(tgt)
    
            output = self.transformer(src, tgt, tgt_mask=tgt_mask, 
                                      src_key_padding_mask = src_key_mask, 
                                      tgt_key_padding_mask = tgt_key_mask)
            output = self.decoder(output)
            return output
    
    class PositionalEncoding(nn.Module):
        # The positional encoding as described in the paper 
        # https://arxiv.org/pdf/1706.03762.pdf
        def __init__(self, d_model, dropout=0.1, max_len=5000):
            super(PositionalEncoding, self).__init__()
            self.dropout = nn.Dropout(p=dropout)
    
            pe = torch.zeros(max_len, d_model)
            position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
            div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
            pe[:, 0::2] = torch.sin(position * div_term)
            pe[:, 1::2] = torch.cos(position * div_term)
            pe = pe.unsqueeze(0).transpose(0, 1)
            self.register_buffer('pe', pe)
    
        def forward(self, x):
            x = x + self.pe[:x.size(0), :]
            return self.dropout(x)
    
    # Here we intialize our model
    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)
    print(INPUT_DIM, OUTPUT_DIM)
    
    HID_DIM = 256
    N_LAYERS = 3
    N_HEADS = 8
    N_PF_DIM = 512
    DROPOUT = 0.1
    
    SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
    TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
    
    model = TransformerModel(ntoken_in=INPUT_DIM, ntoken_out=OUTPUT_DIM, ninp=HID_DIM,
                            nhead=N_HEADS, npf_dim=N_PF_DIM, nlayers=N_LAYERS,
                            src_pad_idx=SRC_PAD_IDX, trg_pad_idx=TRG_PAD_IDX, dropout=DROPOUT).to(device)
    
    def count_parameters(model: nn.Module):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    print(f'The model has {count_parameters(model):,} trainable parameters')
    
    def initialize_weights(m):
        if hasattr(m, 'weight') and m.weight.dim() > 1:
            nn.init.xavier_uniform_(m.weight.data)
    
    model.apply(initialize_weights)
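    
To make the expected call signature clearer before the translation step, here is a hypothetical smoke test (not part of the original post), assuming sequence-first (S, N) LongTensor batches:

    # Sanity-check the forward(src, tgt) signature with dummy data.
    src_dummy = torch.randint(0, INPUT_DIM, (12, 4)).to(device)    # (S_src, N)
    trg_dummy = torch.randint(0, OUTPUT_DIM, (9, 4)).to(device)    # (S_trg, N)
    with torch.no_grad():
        out = model(src_dummy, trg_dummy)
    print(out.shape)  # expected: (9, 4, OUTPUT_DIM), one logit vector per target position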
    
    
    ----------
    def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):
    
        model.eval()
            
        if isinstance(sentence, str):
            nlp = spacy.load('de')
            tokens = [token.text.lower() for token in nlp(sentence)]
        else:
            tokens = [token.lower() for token in sentence]
    
        #tokens = [src_field.init_token] + tokens + [src_field.eos_token]  
        src_indexes = [src_field.vocab.stoi[token] for token in tokens]
        src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)
    
        with torch.no_grad():
            #model.?
            hidden, cell = model.encoder_en(src_tensor)
    
        # create a list to hold the output sentence, initialized with an <sos> token   
        
        trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
        
        for i in range(max_len):
    
            trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(1).to(device)
            with torch.no_grad():
                #model.?
                output, hidden, cell = model.encoder_de(trg_tensor, hidden, cell)
                
            pred_token = output.argmax(1).item()
            
            trg_indexes.append(pred_token)
    
            if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
                break
        
        trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
        
        return trg_tokens[1:]
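
For comparison, here is a rough sketch of the decoding loop the directions describe, assuming the forward(src, tgt) defined above and sequence-first (S, N) tensors; it calls the full model (not a sub-module such as model.encoder_de) and re-feeds the growing target sequence on every step. It reuses the names from the function above (src_tensor, trg_indexes, trg_field, max_len) and is meant as an illustration, not a verified drop-in fix:

    # Sketch only: pass src plus all tokens predicted so far through the whole
    # model each iteration, then take the argmax at the last time step.
    with torch.no_grad():
        for i in range(max_len):
            trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(1).to(device)  # (T, 1)
            output = model(src_tensor, trg_tensor)        # (T, 1, ntoken_out)
            pred_token = output[-1, 0].argmax().item()    # next-token prediction
            trg_indexes.append(pred_token)
            if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
                break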
    
    
    ----------
    #getting error here
    
    example_idx = 18
    
    src = vars(train_data.examples[example_idx])['src']
    trg = vars(train_data.examples[example_idx])['trg']
    
    print(f'src = {src}')
    print(f'trg = {trg}')
    
    translation = translate_sentence(src, SRC, TRG, model, device)
    
    print(f'predicted trg = {translation}')