Keras word2vec model - averaging word vectors - best method
I followed the word2vec tutorial with keras in R and successfully obtained the word embeddings for each word. Now I want to pass my word embeddings on to each document and put the documents into some classifier model, such as logistic regression. Using the code below, I get as far as this line:

row.names(embedding_matrix) <- c("UNK", words$word)

which gives me an embedding matrix like this:
> embedding_matrix[1:5, 1:5]
                  [,1]         [,2]         [,3]         [,4]        [,5]
UNK         0.03696522  0.040100370  0.005650103 -0.015125941 -0.01914053
text        0.02520180  0.030634869 -0.003202755 -0.008647407  0.04315760
to          0.00604762  0.010533611  0.035768602 -0.027628880  0.01015684
the         0.01279589 -0.048598755  0.044444822  0.046858221  0.01908986
additional -0.02655490 -0.005862723 -0.002875999 -0.022304043  0.01380521
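Concretely, what I want is one averaged vector per document, something like this rough sketch (average_doc_vector is just a made-up helper to illustrate the idea, not part of the tutorial):

# Illustrative sketch only: average the embedding vectors of the words that
# appear in a document (average_doc_vector is a made-up helper)
average_doc_vector <- function(doc, embedding_matrix) {
  tokens <- tolower(unlist(strsplit(doc, "\\W+")))
  tokens <- tokens[tokens %in% rownames(embedding_matrix)]
  if (length(tokens) == 0) return(rep(NA_real_, ncol(embedding_matrix)))
  colMeans(embedding_matrix[tokens, , drop = FALSE])
}
doc_vectors <- t(sapply(df$text, average_doc_vector, embedding_matrix = embedding_matrix))

The full code I have so far: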
# Generate some sample data
text <- c("Because I could not stop for Death I add additional text-",
"He kindly stopped for me some additional text to act as a filler -",
"The Carriage held but just Ourselves more additional text to add to the body of the text-",
"and Immortality plus some more words to fill the text a little")
ID <- c(1,2,3,4)
output <- c(1,0,0,1)
df <- data.frame(cbind(ID, text, output))
df$text <- as.character(df$text)
# Initialise keras parameters
library(keras)
tokenizer <- text_tokenizer(num_words = 20)
tokenizer %>%
  fit_text_tokenizer(df$text)
embedding_size <- 128 # Dimension of the embedding vector.
skip_window <- 5 # How many words to consider left and right.
num_sampled <- 1 # Number of negative examples to sample for each word.
input_target <- layer_input(shape = 1)
input_context <- layer_input(shape = 1)
embedding <- layer_embedding(
  input_dim = tokenizer$num_words + 1,
  output_dim = embedding_size,
  input_length = 1,
  name = "embedding"
)
target_vector <- input_target %>%
  embedding() %>%
  layer_flatten()
context_vector <- input_context %>%
  embedding() %>%
  layer_flatten()
# Dot product of target and context vectors, squashed to a probability
dot_product <- layer_dot(list(target_vector, context_vector), axes = 1)
output <- layer_dense(dot_product, units = 1, activation = "sigmoid")
model <- keras_model(list(input_target, input_context), output)
model %>% compile(loss = "binary_crossentropy", optimizer = "adam")
summary(model)
######################## The built-in skipgrams generator is broken here, so define our own #######################
library(purrr) # for transpose() and map() below
skipgrams_generator <- function(text, tokenizer, window_size, negative_samples) {
  gen <- texts_to_sequences_generator(tokenizer, sample(text))
  function() {
    # Skip sequences with fewer than two tokens; skipgrams() needs word pairs
    while (TRUE) {
      nxt <- generator_next(gen)
      if (length(nxt) > 1)
        break
    }
    skip <- nxt %>%
      skipgrams(
        vocabulary_size = tokenizer$num_words,
        window_size = window_size,
        negative_samples = negative_samples
      )
    # Reshape the (target, context) couples into two single-column matrices
    x <- transpose(skip$couples) %>% map(. %>% unlist %>% as.matrix(ncol = 1))
    y <- skip$labels %>% as.matrix(ncol = 1)
    list(x, y)
  }
}
########################################################
# Run the word2vec model
model %>%
  fit_generator(
    skipgrams_generator(df$text, tokenizer, skip_window, num_sampled),
    steps_per_epoch = 2, epochs = 2
  )
library(dplyr)
# Get the word embeddings
embedding_matrix <- get_weights(model)[[1]]
words <- tibble(
  word = names(tokenizer$word_index),
  id = as.integer(unlist(tokenizer$word_index))
)
words <- words %>%
  filter(id <= tokenizer$num_words) %>%
  arrange(id)
row.names(embedding_matrix) <- c("UNK", words$word)
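As a quick sanity check on the embeddings, I can look up a word's nearest neighbours by cosine similarity (this uses sim2 from the text2vec package, which is not loaded above):

# Optional sanity check: cosine similarity of one word against all embeddings
library(text2vec) # for sim2()
find_similar_words <- function(word, embedding_matrix, n = 5) {
  sims <- sim2(embedding_matrix,
               embedding_matrix[word, , drop = FALSE],
               method = "cosine")
  head(sort(sims[, 1], decreasing = TRUE), n)
}
find_similar_words("text", embedding_matrix)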
text <- c("Because I could not stop for Death I add additional text-",
"He kindly stopped for me some additional text to act as a filler -",
"The Carriage held but just Ourselves more additional text to add to the body of the text-",
"and Immortality plus some more words to fill the text a little")
ID <- c(1,2,3,4)
output <- c(1,0,0,1)
df <- data.frame(cbind(ID, text, output))
df$text <- as.character(df$text)
library(h2o)
h2o.init()
df <- as.h2o(df)
words <- h2o.tokenize(df$text, "\\\\W+") # Tokenize the words in the text
# Fit the h2o word2vec model
w2v.model <- h2o.word2vec(words,
                          model_id = "w2v_model",
                          vec_size = 100,
                          word_model = "SkipGram",
                          min_word_freq = 1,
                          window_size = 5,
                          init_learning_rate = 0.025,
                          sent_sample_rate = 0,
                          epochs = 1)
# Average the word embeddings per document using the h2o package
averaged_docs <- h2o.transform(w2v.model, words, aggregate_method = "AVERAGE")
# join the Y variable and X variables together
ML_data <- h2o.cbind(as.factor(df["output"]), averaged_docs)
averaged_docs <- as.data.frame(averaged_docs)
dim(ML_data)
# NOTE: this call fails here because I give the model too few observations
# (it does work on larger datasets; I include it for completeness)
gbm.model <- h2o.gbm(x = names(averaged_docs), y = "output",
                     training_frame = ML_data, validation_frame = NULL)
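Since I mentioned logistic regression, here is a sketch of how the classifier step could look with h2o.glm (again illustrative only, with the same small-sample caveat):

# Illustrative only: logistic regression on the averaged document vectors
glm.model <- h2o.glm(x = names(averaged_docs), y = "output",
                     training_frame = ML_data, family = "binomial")

Is simply averaging the word vectors per document, as h2o.transform does above, the best method here, or is there a better way to get from the keras word embeddings to document-level features for a classifier?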