如何通过R将目录中的所有PDF转换为txt格式?

如何通过R将目录中的所有PDF转换为txt格式?,pdf,text,transform,mining,Pdf,Text,Transform,Mining,我正在尝试将位于我的计算机目录中的PDF文件列表转换为txt格式,以便R可以读取它并开始文本挖掘。你知道这个代码有什么问题吗 library(tm) #load text mining library setwd('D:/Directory') #sets R's working directory to near where my files are ae.corpus<-Corpus(DirSource("D:/Directory/NewsArticles"),readerContro

我正在尝试将计算机上某个目录中的一批PDF文件转换为txt格式,以便R可以读取它们并开始文本挖掘。你知道下面这段代码有什么问题吗?

# Question script, as posted: it tries to turn the PDFs under
# D:/Directory/NewsArticles into text with pdftotext and then runs a tm
# text-mining pipeline on the resulting corpus. The asker wants to know why it
# fails; the NOTE(review) comments mark the lines that look responsible.
library(tm) #load text mining library
setwd('D:/Directory') #sets R's working directory to near where my files are
# NOTE(review): the corpus is built with readPlain straight from the
# NewsArticles directory, i.e. before any PDF has been converted to text.
ae.corpus<-Corpus(DirSource("D:/Directory/NewsArticles"),readerControl=list(reader=readPlain))
exe <- "C:\\Program Files\\xpdfbin-win-3.03\\bin32\\pdftotext.exe"
# NOTE(review): ae.corpus is the Corpus object created above, not a file path,
# so the command assembled here does not name a PDF for pdftotext to convert.
# wait = F also returns without waiting for the command to finish.
system(paste("\"", exe, "\" \"", ae.corpus, "\"", sep = ""), wait = F)
# NOTE(review): `dest` is never assigned anywhere above this line. The pattern
# ".pdf" is also an unanchored regex in which "." matches any character.
filetxt <- sub(".pdf", ".txt", dest)
# shell.exec() opens the file with its associated Windows application; the call
# is issued twice because, per the author's remark, the first attempt errors.
shell.exec(filetxt); shell.exec(filetxt)    # strangely the first try always throws an error..

# Text-mining pipeline: normalise the corpus, then build and query the matrix.
summary(ae.corpus) #check what went in
# NOTE(review): newer tm releases may require wrapping base functions such as
# tolower in content_transformer() -- confirm against the installed version.
ae.corpus <- tm_map(ae.corpus, tolower)
ae.corpus <- tm_map(ae.corpus, removePunctuation)
ae.corpus <- tm_map(ae.corpus, removeNumbers)
# English stop words plus two extra words to drop.
myStopwords <- c(stopwords('english'), "available", "via")
ae.corpus <- tm_map(ae.corpus, removeWords, myStopwords) # this stopword file is at C:\Users\[username]\Documents\R\win-library\2.13\tm\stopwords 


# Despite the "tdm" name this is a document-term matrix (documents as rows).
# NOTE(review): minWordLength and Dictionary() look like options from an older
# tm release -- confirm they are still honoured by the installed version.
ae.tdm <- DocumentTermMatrix(ae.corpus, control = list(minWordLength = 3))
inspect(ae.tdm)
# Terms that occur at least twice.
findFreqTerms(ae.tdm, lowfreq=2)
# Terms whose correlation with "economic" is at least 0.7.
findAssocs(ae.tdm, "economic",.7)
# Restrict the matrix to a fixed vocabulary of three terms.
d<- Dictionary (c("economic", "uncertainty", "policy"))
inspect(DocumentTermMatrix(ae.corpus, list(dictionary = d)))
library(tm) # load the text mining library
setwd('D:/Directory') # set R's working directory to somewhere near where my files are

ae.corpus…

试着用下面的代码来代替:

# Convert every PDF in `dest` to a text file with pdftotext, merge the
# converted text files into output.txt, and read that file back into R.

# Directory that holds the PDFs. "." is the current working directory, i.e.
# the directory selected with setwd(); change it if the PDFs live elsewhere.
dest <- "."

# Full path to the pdftotext executable. This is the xpdf location used in the
# question; adjust it to your own installation.
exe <- "C:\\Program Files\\xpdfbin-win-3.03\\bin32\\pdftotext.exe"
if (!file.exists(exe)) {
  stop("pdftotext executable not found: ", exe, call. = FALSE)
}

# Only files whose name ends in .pdf (any case), not every name containing
# the letters "pdf".
myfiles <- list.files(
  path = dest, pattern = "[.]pdf$", full.names = TRUE, ignore.case = TRUE
)

# Convert each PDF; pdftotext writes the .txt file next to the PDF.
# wait = TRUE so that every text file exists before the merge step below, and
# shQuote() so that paths containing spaces reach the program as one argument.
status <- vapply(
  myfiles,
  function(pdf) system(paste(shQuote(exe), shQuote(pdf)), wait = TRUE),
  numeric(1)
)
if (any(status != 0)) {
  warning(
    "pdftotext failed for: ", paste(myfiles[status != 0], collapse = ", "),
    call. = FALSE
  )
}

# Combine files. The text files are looked up in `dest`, where they were
# created; the merged file itself is excluded so that a re-run does not fold
# the previous output.txt back into the new one.
out_path <- "output.txt"
files <- list.files(path = dest, pattern = "[.]txt$", full.names = TRUE)
files <- files[
  normalizePath(files, mustWork = FALSE) !=
    normalizePath(out_path, mustWork = FALSE)
]

outFile <- file(out_path, "w")
tryCatch(
  for (i in files) {
    x <- readLines(i)
    # As in the original, the first and the last line of each file are
    # dropped. Files with two or fewer lines contribute nothing; indexing
    # them with 2:(length(x) - 1) would count backwards and emit reversed
    # lines, NA, or an error.
    if (length(x) > 2) {
      writeLines(x[-c(1, length(x))], outFile)
    }
  },
  # Close the connection even when reading or writing fails.
  finally = close(outFile)
)

# Read data: one tab-separated record per line. comment.char = "" keeps text
# after a "#" instead of discarding it; an empty output file yields an empty
# data frame rather than a read.table() error.
txt <- if (file.size(out_path) > 0) {
  read.table(out_path, sep = "\t", quote = "", comment.char = "")
} else {
  data.frame()
}