Keras word2vec model - averaging word vectors - best approach

I followed the word2vec tutorial with keras in R and successfully obtained an embedding for each word. Now I want to combine the word embeddings per document (by averaging them) and feed the documents into a classifier model such as logistic regression. Using the code below, I get as far as this line:

row.names(embedding_matrix) <- c("UNK", words$word)
> embedding_matrix[1:5, 1:5]
                  [,1]         [,2]         [,3]         [,4]        [,5]
UNK         0.03696522  0.040100370  0.005650103 -0.015125941 -0.01914053
text        0.02520180  0.030634869 -0.003202755 -0.008647407  0.04315760
to          0.00604762  0.010533611  0.035768602 -0.027628880  0.01015684
the         0.01279589 -0.048598755  0.044444822  0.046858221  0.01908986
additional -0.02655490 -0.005862723 -0.002875999 -0.022304043  0.01380521
# Generate some sample data

text <- c("Because I could not stop for Death I add additional text-",
          "He kindly stopped for me some additional text to act as a filler -",
          "The Carriage held but just Ourselves more additional text to add to the body of the text-",
          "and Immortality plus some more words to fill the text a little")

ID <- c(1,2,3,4)
output <- c(1,0,0,1)
df <- data.frame(cbind(ID, text, output))
df$text <- as.character(df$text)

# Initialise keras parameters
library(keras)
tokenizer <- text_tokenizer(num_words = 20)
tokenizer %>% 
  fit_text_tokenizer(df$text)


embedding_size <- 128  # Dimension of the embedding vector.
skip_window <- 5       # How many words to consider left and right.
num_sampled <- 1       # Number of negative examples to sample for each word.

input_target <- layer_input(shape = 1)
input_context <- layer_input(shape = 1)

embedding <- layer_embedding(
  input_dim = tokenizer$num_words + 1, 
  output_dim = embedding_size, 
  input_length = 1, 
  name = "embedding"
)

target_vector <- input_target %>% 
  embedding() %>% 
  layer_flatten()

context_vector <- input_context %>%
  embedding() %>%
  layer_flatten()

dot_product <- layer_dot(list(target_vector, context_vector), axes = 1)
output <- layer_dense(dot_product, units = 1, activation = "sigmoid")

model <- keras_model(list(input_target, input_context), output)
model %>% compile(loss = "binary_crossentropy", optimizer = "adam")

summary(model)



######################## Broken skipgrams_generator, so we define our own #######################

library(purrr)  # transpose() and map() are used below

skipgrams_generator <- function(text, tokenizer, window_size, negative_samples) {
  gen <- texts_to_sequences_generator(tokenizer, sample(text))
  function() {

    while(TRUE) {
      nxt <- generator_next(gen)
      if (length(nxt) > 1)
        break
    }

    skip <- nxt %>%
      skipgrams(
        vocabulary_size = tokenizer$num_words, 
        window_size = window_size, 
        negative_samples = negative_samples
      )
    x <- transpose(skip$couples) %>% map(. %>% unlist %>% as.matrix(ncol = 1))
    y <- skip$labels %>% as.matrix(ncol = 1)

    list(x, y)
  }
}

########################################################

# Run the word2vec model
model %>%
  fit_generator(
    skipgrams_generator(df$text, tokenizer, skip_window, num_sampled), 
    steps_per_epoch = 2, epochs = 2
  )

library(dplyr)

# Get the word embeddings
embedding_matrix <- get_weights(model)[[1]]

words <- data_frame(
  word = names(tokenizer$word_index), 
  id = as.integer(unlist(tokenizer$word_index))
)

words <- words %>%
  filter(id <= tokenizer$num_words) %>%
  arrange(id)

row.names(embedding_matrix) <- c("UNK", words$word)
text <- c("Because I could not stop for Death I add additional text-",
          "He kindly stopped for me some additional text to act as a filler -",
          "The Carriage held but just Ourselves more additional text to add to the body of the text-",
          "and Immortality plus some more words to fill the text a little")

ID <- c(1,2,3,4)
output <- c(1,0,0,1)
df <- data.frame(cbind(ID, text, output))
df$text <- as.character(df$text)

library(h2o)

h2o.init()
df <- as.h2o(df)
words <- h2o.tokenize(df$text, "\\\\W+")  # Tokenize the words in the text
# apply the h2o word to vec model
w2v.model <- h2o.word2vec(words,
                          model_id = "w2v_model",
                          vec_size = 100,
                          word_model = c("SkipGram"),
                          min_word_freq = 1,
                          window_size = 5,
                          init_learning_rate = 0.025,
                          sent_sample_rate = 0,
                          epochs = 1)
# Average the word embeddings using the h2o package
averaged_docs <- h2o.transform(w2v.model, words, aggregate_method = "AVERAGE")
# join the Y variable and X variables together
ML_data <- h2o.cbind(as.factor(df["output"]), averaged_docs)

averaged_docs <- as.data.frame(averaged_docs)

dim(ML_data)
# NOTE: this call fails here because the sample data has too few observations
# (it does work on larger datasets; included for completeness)
gbm.model <- h2o.gbm(x = names(averaged_docs), y = "output",
                     training_frame = ML_data[1:101], validation_frame = NULL)
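
# The question mentions logistic regression as the classifier; a minimal sketch of that
# alternative (my assumption, not part of the original code) using h2o.glm with a
# binomial family on the same averaged document vectors. Like the GBM above, it needs
# more observations than this toy sample to fit sensibly.
glm.model <- h2o.glm(x = names(averaged_docs), y = "output",
                     training_frame = ML_data,
                     family = "binomial")
h2o.performance(glm.model)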