TensorFlow: accuracy of a skip-gram (word2vec) word-similarity model trained on the Brown corpus (NLTK)


I want to build a similarity matrix based on the Brown corpus from the NLTK library. The problem is that the loss

tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = softmax_weight, biases = softmax_bias, inputs = embed,
                  labels = y, num_sampled = num_sampled, num_classes = num_words))
drops from 4.2 to 2.0 and then starts going up and down. The question is: how can I improve the model's accuracy?

Here is my complete code:

import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding,Layer
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from numpy.random import choice
import random
from itertools import repeat
import tensorflow as tf
import nltk
import re
from nltk.corpus import stopwords
from nltk.corpus import brown
import string
nltk.download('brown')
nltk.download('stopwords')


#Dataset loading and preparation:
dataset = brown.sents()

punct = list(string.punctuation)
punct.append("``")
punct.append("''")
punct.append("--")
stops = set(stopwords.words("english")) 

dataset = [[word.lower() for word in sentence if word not in punct and word.lower() not in stops] for sentence in dataset] 


#tokenization
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(dataset)

word2index = tokenizer.word_index
index_word = tokenizer.index_word

total_words = 5000

data_prep = tokenizer.texts_to_sequences(dataset) 
data_prep = [sentence for sentence in data_prep if len(sentence) >2] 

#word2vec
def word2vec_preparation(data,window_size,num_skips):
    grams = []
    context = []
    target = []

    assert window_size >= 1, 'window_size argument is < 1!'
    assert num_skips >= 1, 'num_skips argument is < 1!'
    for sentence in data:
        if len(sentence) - window_size > 1:
            #print(sentence)

            for i in range(len(sentence)):
                if i - window_size < 0:
                    gram = sentence[i+1:i+window_size + 1]
                    check = num_skips - len(set(gram))
                    #print(gram)
                    grams.append(gram)
                    if check > 0:
                        context.extend(random.sample(list(set(gram)), len(set(gram))))
                        target.extend(repeat(sentence[i], len(set(gram))))
                    else:
                        context.extend(random.sample(list(set(gram)), num_skips))
                        target.extend(repeat(sentence[i], num_skips))

                elif i + window_size > len(sentence) -1:
                    gram = sentence[i-window_size:i]
                    check = num_skips - len(set(gram))
                    #print(gram)
                    grams.append(gram)
                    if check > 0:
                        context.extend(random.sample(list(set(gram)), len(set(gram))))
                        target.extend(repeat(sentence[i], len(set(gram))))
                    else:
                        context.extend(random.sample(list(set(gram)), num_skips))
                        target.extend(repeat(sentence[i], num_skips))

                else:
                    gram = sentence[i-window_size:i] + sentence[i+1:i+window_size + 1]
                    check = num_skips - len(set(gram))
                    #print(gram)
                    grams.append(gram)
                    if check > 0:
                        context.extend(random.sample(list(set(gram)), len(set(gram))))
                        target.extend(repeat(sentence[i], len(set(gram))))
                    else:
                        context.extend(random.sample(list(set(gram)), num_skips))
                        target.extend(repeat(sentence[i], num_skips))

        #print('----------------------')

    return grams, context, target

grams,context,target = word2vec_preparation(data_prep,window_size = 2,num_skips = 3)

target = np.array(target,dtype= np.int64)
context = np.array(context,dtype= np.int64)


context = context.reshape(len(context),1)
dataset_train = tf.data.Dataset.from_tensor_slices((target, context))
dataset_train = dataset_train.shuffle(buffer_size=1024).batch(64)

#Parameters:
num_words = 5000
embed_size = 300
num_sampled = 64
initializer_softmax = tf.keras.initializers.GlorotUniform()
#Variables:
embeddings_weight = tf.Variable(tf.random.uniform([num_words,embed_size],-1.0,1.0))
softmax_weight = tf.Variable(initializer_softmax([num_words,embed_size]))
softmax_bias = tf.Variable(initializer_softmax([num_words]))

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

@tf.function
def training(X,y):
  with tf.GradientTape() as tape:
    embed = tf.nn.embedding_lookup(embeddings_weight, X)  # embeddings_weight is the trainable table; X is a batch of indices to look up in it
    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = softmax_weight, biases = softmax_bias, inputs = embed,
                  labels = y, num_sampled = num_sampled, num_classes = num_words))
  variables = [embeddings_weight,softmax_weight,softmax_bias]  
  gradients = tape.gradient(loss,variables)
  optimizer.apply_gradients(zip(gradients,variables))
  return loss
  #tf.print('Loss:',loss)



EPOCHS = 100

for epoch in range(EPOCHS):
  for step, (X,y) in enumerate(dataset_train):
    loss = training(X,y)
  tf.print('Epoch:',epoch + 1, 'loss:',loss)

The reported loss is not the gold standard of a model's usefulness – real testing of the resulting word vectors, for your actual purposes, is.
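
For example, a quick qualitative check is to look at the nearest neighbours of a few familiar words in the trained embedding table and see whether they are plausible. This is only a minimal sketch, assuming the embeddings_weight and tokenizer variables from the question are still in scope after training; the probe words are arbitrary examples:

import numpy as np
import tensorflow as tf

def nearest_neighbors(word, k=5):
    # Find the k rows of the trained embedding table with the highest
    # cosine similarity to the given word's row.
    idx = tokenizer.word_index.get(word)
    if idx is None or idx >= embeddings_weight.shape[0]:
        return []
    normed = tf.math.l2_normalize(embeddings_weight, axis=1)
    sims = tf.linalg.matvec(normed, normed[idx])   # cosine similarity to every word
    order = np.argsort(-sims.numpy())[1:k + 1]     # skip rank 0 (the word itself)
    return [(tokenizer.index_word[i], round(float(sims[i]), 3))
            for i in order if i in tokenizer.index_word]

for probe in ["government", "water", "money"]:     # arbitrary probe words
    print(probe, nearest_neighbors(probe))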

Also, having the loss plateau (and then jitter up and down) is natural and expected behavior for this kind of optimization. (The model will never become perfect at predicting the training data, unless it is oversized for that data – in which case it would be overfitting the training data and would likely perform worse on real tasks.) Given a specific algorithm and model parameters, you want the per-example loss to be as low as it can be, not 0.0.
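
Part of the visible jitter also comes from the question's loop printing only the last batch's loss of each epoch. A minimal tweak (my sketch, reusing training, dataset_train and EPOCHS from the question) averages the loss over the whole epoch, which gives a less noisy curve to judge the plateau by:

# Average the per-batch loss over each epoch instead of printing only the
# last batch's loss, so the plateau and remaining jitter are easier to read.
for epoch in range(EPOCHS):
    epoch_loss = tf.keras.metrics.Mean()
    for X, y in dataset_train:
        epoch_loss.update_state(training(X, y))
    tf.print('Epoch:', epoch + 1, 'mean loss:', epoch_loss.result())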

There may also be other errors in your code that I haven't checked. I'd suggest using an off-the-shelf, already-debugged Word2Vec implementation – either directly, or, if using your own implementation really matters to you (perhaps for learning purposes), as a baseline against which to judge whether your code is working.
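
As one possible baseline (gensim is my assumption here; any well-tested implementation would do), a minimal sketch trains gensim's Word2Vec on the same preprocessed Brown sentences and looks at its neighbours for the same probe words:

from gensim.models import Word2Vec   # gensim >= 4.0 parameter names

# `dataset` is the cleaned, lower-cased list of token lists built at the top
# of the question's code (punctuation and stopwords already removed).
baseline = Word2Vec(
    sentences=dataset,
    vector_size=300,   # same dimensionality as the question's embed_size
    window=2,          # same context window as window_size in the question
    sg=1,              # skip-gram, matching the question's setup
    negative=64,       # roughly comparable to num_sampled
    min_count=5,
    epochs=20,
)

# Neighbours from a known-good implementation, for side-by-side comparison.
print(baseline.wv.most_similar("government", topn=5))

If the neighbours from the custom TensorFlow model look much worse than the baseline's for the same words, that points to a bug or undertraining in the custom code rather than to a limit of the data.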