R 如何将`TF-IDF`与`LDA`函数一起使用而不出错_R_Nlp_Tf Idf_Lda

R 如何将`TF-IDF`与`LDA`函数一起使用而不出错

r nlp

R 如何将`TF-IDF`与`LDA`函数一起使用而不出错,r,nlp,tf-idf,lda,R,Nlp,Tf Idf,Lda,我有一个文档术语矩阵，我计划对其进行NLP分析。在使用的原始代码中，我为tf术语频率和DF文档频率定义了阈值，以删除一些不必要的单词提高计算速度所以，我定义了这样的东西： #create DTM dtm <- CreateDtm(tokens$clean_remark, doc_names = tokens$ML.., ngram_window = c(1, 2)) #explore the basic freq

我有一个文档-词项矩阵（document-term matrix），计划对其进行 NLP 分析。在原始代码中，我为词频（TF，term frequency）和文档频率（DF，document frequency）设定了阈值，以删除一些不必要的词并提高计算速度。因此，我定义了如下内容：

# Create the document-term matrix (DTM); ngram_window = c(1, 2) is presumably
# the min/max n-gram length (unigrams and bigrams) -- confirm in textmineR docs
dtm <- CreateDtm(
  tokens$clean_remark,
  doc_names = tokens$ML..,
  ngram_window = c(1, 2)
)

# Explore the basic frequencies: per term, its total count (term_freq) and
# the number of documents it occurs in (doc_freq)
tf <- TermDocFreq(dtm = dtm)
original_tf <- tf %>% select(term, term_freq, doc_freq)
# seq_len() stays valid for an empty table, where 1:nrow() would give c(1, 0)
rownames(original_tf) <- seq_len(nrow(original_tf))

# Keep only the terms that occur more than 350 times in total and appear in
# fewer than a quarter of the documents
inds_vocabs <- which(tf$term_freq > 350 & tf$doc_freq < nrow(dtm) / 4)
vocabulary <- tf$term[inds_vocabs]
# drop = FALSE keeps the result a matrix even if only one term survives
dtm <- dtm[, inds_vocabs, drop = FALSE]

当我运行这段代码时，我得到了以下错误消息：`Error in LDA(text_matrix, k = 5, method = "Gibbs", control = list(seed = 12345)) : The DocumentTermMatrix needs to have a term frequency weighting`（即 DocumentTermMatrix 必须使用词频权重）。

我搜索发现LDA函数不能与TF-IDF一起使用。以下是链接：

我知道我不能使用LDA，但如何使用TF-IDF进行主题建模？有其他解决方案吗？

我认为这段代码（参考了这个 URL）同时使用了 tf-idf 和 LDA。抱歉，我是日本人，英语不太好。

# Count each word per document (ML..) and cast the counts into a
# DocumentTermMatrix that uses plain term-frequency weighting
text_matrix <- cast_dtm(
  count(text_cleaning_tokens, ML.., word),
  document = ML..,
  term = word,
  value = n,
  weighting = tm::weightTf
)
#removeSparseTerms(text_matrix, sparse = 0.999)

# Fit a 5-topic LDA model with Gibbs sampling; the seed is fixed
lda_model <- LDA(
  text_matrix,
  k = 5,
  method = "Gibbs",
  control = list(seed = 12345)
)

library(tidyverse)
library(tidytext)
library(janeaustenr)
library(topicmodels)
library(LDAvis)

# Read data
# The CSV holds text that has already been split into words
# (presumably one row per word occurrence -- confirm against the file)
df <- read.csv(
  "tests/テキストマイニング分解後処理済_2106181431.csv",
  fileEncoding = "cp932",
  # spelled out: `F` is an ordinary variable that can be reassigned
  stringsAsFactors = FALSE
)

# Tally how many times every word occurs within every text
mybook_words <- df %>%
  select(book = テキスト, word = 単語) %>%
  group_by(book, word) %>%
  summarise(n = n()) %>%
  ungroup()

# Attach the tf-idf statistics, then keep only the columns used downstream
mybook_words2 <- select(
  bind_tf_idf(mybook_words, term = word, document = book, n = n),
  book, word, tf_idf, n
)

# Alternative (disabled): drop low tf-idf words and build the DTM from the
# raw counts
# mybook_words3 <- mybook_words2 %>% filter(tf_idf > 0.5) 
# dtm_long <- mybook_words3 %>% cast_dtm(book, word, n)

# Scale every count by its tf-idf (times 10) and round to a whole number;
# entries that round down to zero are discarded
mybook_words4 <- filter(
  mutate(mybook_words2, n2 = round(tf_idf * n * 10)),
  n2 > 0
)

# Build the DTM from the tf-idf-weighted counts
dtm_long <- cast_dtm(mybook_words4, document = book, term = word, value = n2)

# Fit a 3-topic LDA model with a fixed seed
lda_model_long_1 <- LDA(x = dtm_long, k = 3, control = list(seed = 1234))
# Per-topic term probabilities (beta) in tidy, long form
result <- tidytext::tidy(lda_model_long_1, matrix = "beta")

# The six top-ranked terms of every topic
ldaOut.terms <- as.matrix(terms(lda_model_long_1, k = 6))
ldaOut.terms[seq_len(6), ]

# View result in shinyapp
#' Convert a fitted topicmodels model into the JSON consumed by LDAvis
#'
#' @param x A fitted topic model that `topicmodels::posterior()` accepts and
#'   that has a `wordassignments` slot.
#' @param ... Further arguments forwarded to `LDAvis::createJSON()`.
#' @return The value returned by `LDAvis::createJSON()`.
topicmodels2LDAvis <- function(x, ...) {
  post <- topicmodels::posterior(x)
  if (ncol(post[["topics"]]) < 3) stop("The model must contain > 2 topics")
  # NOTE(review): the row/column sums of `wordassignments` are used as the
  # document lengths and term frequencies; this assumes its entries behave
  # like counts -- TODO confirm against the topicmodels documentation
  mat <- x@wordassignments
  LDAvis::createJSON(
    phi = post[["terms"]],
    theta = post[["topics"]],
    vocab = colnames(post[["terms"]]),
    doc.length = slam::row_sums(mat, na.rm = TRUE),
    term.frequency = slam::col_sums(mat, na.rm = TRUE),
    # pass caller-supplied options through instead of silently dropping them
    ...
  )
}
# Open the interactive LDAvis view of the fitted model
serVis(topicmodels2LDAvis(lda_model_long_1))

# Visualise result: the five highest-beta terms of each topic, one facet per
# topic
result %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  # reorder_within() sorts terms inside each topic; a plain reorder() would
  # rank a term shared by several topics by its mean beta across them, so the
  # bars would not be sorted within every facet
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free", ncol = 4) +
  coord_flip() +
  # strip the topic suffix that reorder_within() appends to the axis labels
  scale_x_reordered()