Error when using lexicalize() and lda.collapsed.gibbs.sampler() in R

Tags: r, tm, lda, topic-modeling

I am new to topic modeling and was testing the lda.collapsed.gibbs.sampler() method, trying to "characterize" some 98 CVs. My first attempt used a tm corpus (since it is easier to filter, etc.), but this produced unexpected results, possibly because the lexicalize() function first converts it into an object with only 3 documents/objects.

# method 1
library(tm)   # corpus handling
library(lda)  # lexicalize() and lda.collapsed.gibbs.sampler()

a <- Corpus(DirSource(doc.folder, pattern = ".txt$"),
            readerControl = list(language = "eng"))
a <- tm_map(a, content_transformer(removeNumbers))
a <- tm_map(a, content_transformer(removePunctuation))
a <- tm_map(a, content_transformer(stripWhitespace))
a <- tm_map(a, content_transformer(tolower))

lex <- lexicalize(a)
result <- lda.collapsed.gibbs.sampler(lex$documents, 8, lex$vocab, 30, 0.1, 0.1,
                                      initial = NULL, burnin = NULL,
                                      compute.log.likelihood = TRUE)

length(a)                 # output: [1] 98
length(lex$documents)     # output: [1] 3, even though I expect 98
dim(result$document_sums) # output: [1] 8 3, even though I expect 8 98
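For what it is worth, lexicalize() expects a plain character vector with one element per document, so a likely workaround (a minimal sketch, untested against this corpus, assuming the corpus `a` built in method 1 above) is to flatten each document to a single string before lexicalizing:

# extract one character string per document so lexicalize() sees 98
# documents instead of the corpus's internal structure
texts <- sapply(a, function(d) paste(content(d), collapse = " "))
lex <- lexicalize(texts)
length(lex$documents) # should now match length(a), i.e. 98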
# method 2
library(tools) # file_path_sans_ext()
library(lda)

filenames <- list.files(path = doc.folder, pattern = ".txt$", full.names = TRUE)

df <- data.frame(stringsAsFactors = FALSE)
for (filename in filenames) {
  myfile <- file(filename)
  df <- rbind(df, cbind(name = file_path_sans_ext(basename(filename)),
                        text = paste(readLines(myfile), collapse = " ")))
  close(myfile)
}
# the following avoids an error due to French words etc. being used
df[, "text"] <- sapply(df[, "text"], iconv, "WINDOWS-1252", "UTF-8")

lex <- lexicalize(df[, "text"])
result <- lda.collapsed.gibbs.sampler(lex$documents, 8, lex$vocab, 30, 0.1, 0.1,
                                      initial = NULL, burnin = NULL,
                                      compute.log.likelihood = TRUE)

NROW(df)                  # output: [1] 98
length(lex$documents)     # output: [1] 98, as expected
dim(result$document_sums) # output: [1] 8 98, as expected
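Since method 2 yields the expected dimensions, one quick sanity check on the fit (a sketch using the lda package's top.topic.words() helper on the `result` object above) is to list the most probable words per topic:

# list the 5 highest-weight words in each of the 8 topics;
# result$topics is the topic-by-word count matrix from the sampler
top.topic.words(result$topics, num.words = 5, by.score = TRUE)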