PyTorch 的 nn.Embedding 中 from_pretrained 和 weight.data.copy_ 有什么区别

PyTorch 的 nn.Embedding 中 from_pretrained 和 weight.data.copy_ 有什么区别,pytorch,embedding,glove,Pytorch,Embedding,Glove,最近,我使用 GloVe 来初始化嵌入层的权重,我发现 from_pretrained 和 weight.data.copy_ 的区别在于:from_pretrained = {weight.data.copy_(); Embedding.weight.requires_grad = False}。我想知道我的理解是否正确?使用 GloVe 比不使用 GloVe 的准确率低 10%,但当我统计 GloVe 没有覆盖的单词时,我发现只有 10% 的单词不在 GloVe 中 import numpy as np import pandas as pd import unicode

最近,我使用 GloVe 来初始化嵌入层的权重,我发现 from_pretrained 和 weight.data.copy_ 的区别在于:from_pretrained = {weight.data.copy_(); Embedding.weight.requires_grad = False}
我想知道我的理解是否正确?
使用 GloVe 比不使用 GloVe 的准确率低 10%,但当我统计 GloVe 没有覆盖的单词时,我发现只有 10% 的单词不在 GloVe 中

import numpy as np
import pandas as pd

import unicodedata, re, string
import nltk

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

import seaborn as sns

# Apply seaborn's theme, remapping matplotlib's shorthand color codes.
sns.set(color_codes=True)

# Raw strings keep the backslashes of the Windows paths literal.
# Both files are tab-separated.
df_train = pd.read_csv(r'E:\Mice\train.tsv', sep='\t')
df_test = pd.read_csv(r'E:\Mice\test.tsv', sep='\t')


def remove_non_ascii(words):
    """Strip every non-ASCII character from each token.

    Each token is NFKD-normalized first, so an accented letter decomposes
    into its base letter plus combining marks and only the marks are lost.
    Tokens are never dropped: one made up solely of non-ASCII characters
    comes back as an empty string.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with one ASCII-only string per input token.
    """
    def _to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_to_ascii(token) for token in words]


def to_lowercase(words):
    """Return a new list holding the lowercase form of every token.

    Args:
        words: iterable of string tokens.

    Returns:
        A list of the same length with ``str.lower()`` applied to each token.
    """
    return [token.lower() for token in words]


def remove_punctuation(words):
    """Delete punctuation from each token and drop tokens left empty.

    Any character that is neither a word character (``\\w``, which includes
    the underscore) nor whitespace is removed.

    Args:
        words: iterable of string tokens.

    Returns:
        A list of the cleaned, non-empty tokens in their original order.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']


def remove_numbers(words):
    """Delete every run of digits from each token and drop tokens left empty.

    Digits are removed outright; they are not replaced by anything.

    Args:
        words: iterable of string tokens.

    Returns:
        A list of the digit-free, non-empty tokens in their original order.
    """
    without_digits = (re.sub(r"\d+", "", token) for token in words)
    return [token for token in without_digits if token != '']


def normalize(words):
    """Run the token-cleaning pipeline over a list of tokens.

    The steps are applied in this order: non-ASCII stripping, lowercasing,
    punctuation removal, digit removal. Stop-word removal is not part of
    the pipeline.

    Args:
        words: list of string tokens.

    Returns:
        The list of cleaned tokens.
    """
    pipeline = (remove_non_ascii, to_lowercase, remove_punctuation, remove_numbers)
    for step in pipeline:
        words = step(words)
    return words


# Tokenize every phrase with NLTK, then clean the tokens with normalize().
df_train['Words'] = df_train['Phrase'].apply(nltk.word_tokenize).apply(normalize)

# Vocabulary: every distinct token that survives normalization.
word_set = set()
for tokens in df_train['Words']:
    word_set.update(tokens)

# Build the word -> integer index mapping. Indices start at 1 because 0 is
# used as the padding value below. The vocabulary is sorted first so the
# mapping is the same on every run (set iteration order for strings changes
# between interpreter runs).
word_to_int = {word: idx for idx, word in enumerate(sorted(word_set), 1)}
df_train['Tokens'] = df_train['Words'].apply(lambda tokens: [word_to_int[word] for word in tokens])
max_len = df_train['Tokens'].str.len().max()

# The token lists have different lengths, so keep them as a plain list of
# lists: recent NumPy versions refuse to build an array from ragged input.
all_tokens = df_train['Tokens'].tolist()
encoded_labels = np.array(df_train['Sentiment'])

# One row per phrase, right-padded with zeros up to the longest phrase.
features = np.zeros((len(all_tokens), max_len), dtype=int)
for i, row in enumerate(all_tokens):
    features[i, :len(row)] = row

# Alternative initialization: load the pre-trained GloVe vectors.
# Each line of the file is "<word> <v1> ... <v100>".
embeddings_index = {}
with open('E:/Mice/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Build the embedding matrix for our vocabulary from the GloVe vectors.
num = 0  # how many vocabulary words have no GloVe vector
tem_word = 0  # index of the first such word (stays 0 if every word is covered)

# 100 columns because the loaded GloVe vectors are 100-dimensional; row 0 is
# the padding row.
embedding_matrix = np.zeros((len(word_to_int) + 1, 100))
for word, i in word_to_int.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        # Words not found in the GloVe index keep an all-zero row.
        if num == 0:
            # Remember the first uncovered word so its row can be inspected
            # here and again after training.
            tem_word = i
        num += 1

# Row that will be copied into the embedding weight for the tracked word.
print(embedding_matrix[tem_word])
print("未统计的数字:", num)
print("glove length", len(embeddings_index))
print("word length", len(word_to_int))

# Split in file order: the first 80% is the training set, and the remainder
# is halved into validation and test sets.
split_frac = 0.8
split_idx = int(len(features) * split_frac)
train_x, train_y = features[:split_idx], encoded_labels[:split_idx]
remaining_x, remaining_y = features[split_idx:], encoded_labels[split_idx:]

test_idx = int(len(remaining_x) * 0.5)
val_x, val_y = remaining_x[:test_idx], remaining_y[:test_idx]
test_x, test_y = remaining_x[test_idx:], remaining_y[test_idx:]

# Wrap each (features, labels) pair in a TensorDataset.
train_data, valid_data, test_data = (
    TensorDataset(torch.from_numpy(xs), torch.from_numpy(ys))
    for xs, ys in ((train_x, train_y), (val_x, val_y), (test_x, test_y))
)

# Every loader shuffles and uses the same batch size.
batch_size = 54
train_loader, valid_loader, test_loader = (
    DataLoader(dataset, shuffle=True, batch_size=batch_size)
    for dataset in (train_data, valid_data, test_data)
)

# Use the GPU whenever CUDA is available.
train_on_gpu = torch.cuda.is_available()
print('Training on GPU.' if train_on_gpu else 'No GPU available, training on CPU.')


class SentimentRNN(nn.Module):
    """LSTM classifier: frozen embedding -> stacked LSTM -> dropout -> linear.

    The embedding weight is created with ``requires_grad = False``, so it is
    never updated by training; whatever values are loaded into it stay fixed.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """Set up the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            output_size: number of values produced per sample by the final
                linear layer.
            embedding_dim: size of each embedding vector (LSTM input size).
            hidden_dim: size of the LSTM hidden state.
            n_layers: number of stacked LSTM layers.
            drop_prob: dropout probability passed to ``nn.LSTM``.
        """
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Embedding (frozen) and LSTM layers.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)

        # Dropout applied to the LSTM output before the classifier.
        self.dropout = nn.Dropout(0.3)

        # Final linear classifier.
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x, hidden):
        """Run one forward pass.

        Args:
            x: integer index tensor; assumed to be (batch, seq_len) since the
                LSTM is built with ``batch_first=True``.
            hidden: ``(h_0, c_0)`` tuple as returned by ``init_hidden``.

        Returns:
            ``(out, hidden)`` where ``out`` is the linear layer's output for
            the last time step of every sequence and ``hidden`` is the
            updated LSTM state.
        """
        # nn.Embedding requires integer (long) indices.
        x = x.long()
        embeds = self.embedding(x)  # (batch, seq_len, embedding_dim)
        lstm_out, hidden = self.lstm(embeds, hidden)  # (batch, seq_len, hidden_dim)

        # Keep only the output of the last time step: (batch, hidden_dim).
        # NOTE(review): if the caller pads sequences on the right, this last
        # step is usually a padding position - confirm that is intended.
        lstm_out = lstm_out[:, -1, :]

        out = self.dropout(lstm_out)
        out = self.fc(out)

        return out, hidden

    def init_hidden(self, batch_size):
        """Return zeroed ``(h_0, c_0)`` states for the LSTM.

        Both tensors have shape ``(n_layers, batch_size, hidden_dim)`` and are
        created with the dtype and device of the model's first parameter, so
        they always live on the same device as the model itself.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))


# Model hyperparameters.
output_size = 5  # number of values the classifier emits per sample
embedding_dim = 100  # must match the width of embedding_matrix
hidden_dim = 256
n_layers = 2
vocab_size = len(word_set) + 1  # +1 for the 0 padding index

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

# Overwrite the randomly initialized embedding table with the pre-built
# matrix; the copy is done in place and is not tracked by autograd.
with torch.no_grad():
    net.embedding.weight.copy_(torch.from_numpy(embedding_matrix))

# Loss and optimizer.
criterion = nn.CrossEntropyLoss()
lr = 0.003
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

# Training parameters.
epochs = 3  # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0  # number of training batches processed so far
print_every = 100  # run validation and report every this many batches
clip = 5  # max gradient norm for clipping

# Move the model to the GPU, if available.
if train_on_gpu:
    net.cuda()

net.train()
for e in range(epochs):
    # Fresh hidden state at the start of every epoch; it is then carried
    # from batch to batch, so every batch must contain exactly batch_size rows.
    h = net.init_hidden(batch_size)

    for inputs, labels in train_loader:
        counter += 1

        if train_on_gpu:
            inputs, labels = inputs.cuda(), labels.cuda()

        # Detach the hidden state from the previous batch's graph, otherwise
        # backprop would run through the entire training history.
        h = tuple(each.detach() for each in h)

        # Zero accumulated gradients.
        net.zero_grad()

        output, h = net(inputs, h)
        # CrossEntropyLoss needs integer (long) class labels.
        loss = criterion(output.squeeze(), labels.long())
        loss.backward()
        # Gradient-norm clipping helps prevent exploding gradients in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        if counter % print_every == 0:
            # Periodic validation pass. Separate loop variable names are used
            # so the current training batch is not overwritten.
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            # No gradients are needed for validation; disabling autograd
            # avoids building (and holding on to) a graph per batch.
            with torch.no_grad():
                for val_inputs, val_labels in valid_loader:
                    val_h = tuple(each.detach() for each in val_h)

                    if train_on_gpu:
                        val_inputs, val_labels = val_inputs.cuda(), val_labels.cuda()

                    val_output, val_h = net(val_inputs, val_h)
                    val_loss = criterion(val_output.squeeze(), val_labels.long())

                    val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e + 1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

# Evaluate on the held-out test set.
test_losses = []  # per-batch loss
num_correct = 0

# Initial hidden state; it is carried from batch to batch below.
h = net.init_hidden(batch_size)
net.eval()

# Gradients are not needed for evaluation.
with torch.no_grad():
    for inputs, labels in test_loader:
        # Keep the hidden state values but drop any graph history.
        h = tuple(each.detach() for each in h)

        if train_on_gpu:
            inputs, labels = inputs.cuda(), labels.cuda()

        output, h = net(inputs, h)

        test_loss = criterion(output.squeeze(), labels.long())
        test_losses.append(test_loss.item())

        # The predicted class is the index of the largest output score.
        _, pred = torch.max(output, 1)

        # Count the predictions that match the true labels.
        correct_tensor = pred.eq(labels.view_as(pred))
        num_correct += correct_tensor.sum().item()

# Report once, after all batches: average loss and overall accuracy.
print("Test loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct / len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

# Embedding row of the tracked word after training, for comparison with the
# row printed when the embedding matrix was built.
print(net.embedding.weight[tem_word])
数据集为(原文此处的链接缺失),GloVe 词向量文件为 glove.6B.100d.txt