R：将TIBLES转换为术语文档矩阵_R_Text_Nlp_Text Mining_Term Document Matrix

R：将 tibble 转换为词项-文档矩阵（term-document matrix）

r text nlp

R：将TIBLES转换为术语文档矩阵,r,text,nlp,text-mining,term-document-matrix,R,Text,Nlp,Text Mining,Term Document Matrix,我正在使用R编程语言。我学习了如何从互联网上获取pdf文件并将其加载到R中。例如，下面我将莎士比亚的3本不同的书加载到R中： library(pdftools) library(tidytext) library(textrank) library(tm) #1st document url <- "https://shakespeare.folger.edu/downloads/pdf/hamlet_PDF_FolgerShakespeare.pdf" artic

我正在使用R编程语言。我学习了如何从互联网上获取pdf文件并将其加载到R中。例如，下面我将莎士比亚的3本不同的书加载到R中：

library(dplyr)
library(pdftools)
library(tidytext)
library(textrank)
library(tm)

#' Read a PDF and return its words as a tidy tibble without stop words
#'
#' The text is first split into sentences (numbered consecutively across the
#' whole document), then every sentence is split into words, and finally the
#' words listed in `stop_words` are removed.
#'
#' @param url URL of the PDF file to read.
#' @return A tibble with one row per remaining word and the columns
#'   `sentence_id` (number of the sentence the word came from) and `word`.
read_pdf_words <- function(url) {
  # pdf_text() yields a character vector (one element per page, per the
  # pdftools documentation); each element becomes one row of the tibble.
  pages <- pdf_text(url)

  tibble(text = pages) %>%
    unnest_tokens(sentence, text, token = "sentences") %>%
    mutate(sentence_id = row_number()) %>%
    select(sentence_id, sentence) %>%
    unnest_tokens(word, sentence) %>%
    anti_join(stop_words, by = "word")
}

#1st document
article_words_1 <- read_pdf_words(
  "https://shakespeare.folger.edu/downloads/pdf/hamlet_PDF_FolgerShakespeare.pdf"
)

#2nd document
article_words_2 <- read_pdf_words(
  "https://shakespeare.folger.edu/downloads/pdf/macbeth_PDF_FolgerShakespeare.pdf"
)

#3rd document
article_words_3 <- read_pdf_words(
  "https://shakespeare.folger.edu/downloads/pdf/othello_PDF_FolgerShakespeare.pdf"
)

有人能告诉我我做错了什么吗？

谢谢

正如错误消息所示，`VectorSource` 只接受一个参数。您可以先用 `rbind` 将这些数据集合并在一起，再传递给 `VectorSource` 函数：

library(tm)

# VectorSource() takes a single vector and treats each of its elements as one
# document. Passing rbind(article_words_1, article_words_2, article_words_3)
# hands it a two-column tibble, so the columns (sentence_id, word) become the
# "documents" instead of the plays. Collapse every play's words into one
# string so that the corpus holds exactly one document per play.
play_texts <- c(
  hamlet = paste(article_words_1$word, collapse = " "),
  macbeth = paste(article_words_2$word, collapse = " "),
  othello = paste(article_words_3$word, collapse = " ")
)

tdm <- TermDocumentMatrix(Corpus(VectorSource(play_texts)))
inspect(tdm)

# The matrix built above has 3 documents (one per play).
#
# For comparison, this is what the rbind() version printed: only 2
# "documents", a first column that is all zeros in the sample shown, and
# terms polluted with quotes and commas.
#
#<<TermDocumentMatrix (terms: 14952, documents: 2)>>
#Non-/sparse entries: 14952/14952
#Sparsity           : 50%
#Maximal term length: 25
#Weighting          : term frequency (tf)
#Sample             :
#            Docs
#Terms        1     2
#  "act",     0   397
#  "cassio",  0   258
#  "ftln",    0 10303
#  "hamlet",  0   617
#  "iago",    0   371
#  "lord",    0   355
#  "macbeth", 0   386
#  "othello", 0   462
#  "sc",      0   337
#  "thou",    0   346

library(tm)
tdm <- TermDocumentMatrix(Corpus(VectorSource(article_words_1, article_words_2, article_words_3)))

感谢您的回答，@RonakShah！你能看一下这个问题吗？如果你有时间，你能看一下这个问题吗？谢谢

你好，Ronak Shah，如果你有时间，你能看看这个问题吗？谢谢

Error in VectorSource(article_words_1, article_words_2, article_words_3) : 
  unused arguments (article_words_2, article_words_3)

library(tm)

# VectorSource() takes a single vector and treats each of its elements as one
# document. Passing rbind(article_words_1, article_words_2, article_words_3)
# hands it a two-column tibble, so the columns (sentence_id, word) become the
# "documents" instead of the plays. Collapse every play's words into one
# string so that the corpus holds exactly one document per play.
play_texts <- c(
  hamlet = paste(article_words_1$word, collapse = " "),
  macbeth = paste(article_words_2$word, collapse = " "),
  othello = paste(article_words_3$word, collapse = " ")
)

tdm <- TermDocumentMatrix(Corpus(VectorSource(play_texts)))
inspect(tdm)

# The matrix built above has 3 documents (one per play).
#
# For comparison, this is what the rbind() version printed: only 2
# "documents", a first column that is all zeros in the sample shown, and
# terms polluted with quotes and commas.
#
#<<TermDocumentMatrix (terms: 14952, documents: 2)>>
#Non-/sparse entries: 14952/14952
#Sparsity           : 50%
#Maximal term length: 25
#Weighting          : term frequency (tf)
#Sample             :
#            Docs
#Terms        1     2
#  "act",     0   397
#  "cassio",  0   258
#  "ftln",    0 10303
#  "hamlet",  0   617
#  "iago",    0   371
#  "lord",    0   355
#  "macbeth", 0   386
#  "othello", 0   462
#  "sc",      0   337
#  "thou",    0   346