使用PDF工具/tesseract加速关键字查找器的代码
我是论坛的热心读者，正在学习 R 编程。我想听听大家对这段用 tesseract 在一组 PDF 文件中搜索关键字的代码的反馈。我把完整代码贴出来，方便有人觉得有用或想参与改进。目前代码相当慢，扫描一个 7 页的 PDF 需要 3–4 分钟，我想知道能做些什么来加速。我注意到的一些瓶颈是：
- 必须创建每页的图像并将其保存在文件夹中,以便magick能够读取
- 有多个嵌套循环用于生成摘要（遍历 keywords、页面和文档的循环）
################ 必需的包 #######################
library(plyr)
library(dplyr)
library(officer)
library(stringr)
library(tesseract)
library(pdftools)
library(readOffice)
library(magick)
################# 工作目录 #######################
setwd(folder)
################# 关键词搜索选项 #################
# 使用 csv 关键字列表
keywords <- as.character(read.csv("keywords.csv")[, 1])
################ REQUIRED PACKAGES #######################
library(plyr)
library(dplyr)
library(officer)
library(stringr)
library(tesseract)
library(pdftools)
library(readOffice)
library(magick)
library(tokenizers) # provides tokenize_sentences(), called during OCR text processing
################# WORKING DIRECTORY #######################
# NOTE(review): `folder` must be defined before sourcing this script — confirm caller sets it.
setwd(folder)
################# KEY WORD SEARCH OPTIONS #################
# Read the keyword list from the first column of keywords.csv (one keyword per row).
keywords <- as.character(read.csv("keywords.csv")[, 1]) # lists of keywords
################# PDF CONVERSION AND TEXT EXTRACTION ####################################
# List the PDF files in the search folder.
# The original pattern "?.pdf" is not a valid extension match (list.files patterns are
# regexes); anchor on the literal ".pdf" extension instead.
pdf_files <- list.files(path = folder, full.names = TRUE, pattern = "\\.pdf$")
# For each pdf: render one image per page, load it with magick, and pre-process it so
# tesseract reads it more reliably (preprocessing suggestions from
# https://shiring.github.io/text_analysis/2017/07/17/ocr_tesseract).
# The three original passes are merged into one loop; lists are preallocated so they
# do not grow on every iteration.
pdf_pages_l <- vector("list", length(pdf_files))  # image file paths, one element per pdf
pdf_images_l <- vector("list", length(pdf_files)) # processed magick images, one element per pdf
setwd("./images")
for (pdf in seq_along(pdf_files)) {
  # Creates one image per pdf page and records the image paths.
  # PERF NOTE(review): dpi = 600 is a major cost; dpi = 300 is usually sufficient for
  # OCR and roughly 4x faster — worth testing before changing.
  pdf_pages_l[[pdf]] <- pdftools::pdf_convert(pdf_files[pdf], dpi = 600)
  pdf_images_l[[pdf]] <- lapply(pdf_pages_l[[pdf]], FUN = function(path) {
    image_read(path) %>%                        # load the page image
      image_scale("x2000") %>%                  # rescale
      image_background("white", flatten = TRUE) %>% # set background to white
      image_trim() %>%                          # trim background-colored edges
      image_noise() %>%                         # noise peak elimination filter
      image_enhance() %>%                       # enhance image (minimize noise)
      image_normalize() %>%                     # stretch pixel values to full range
      image_contrast(sharpen = 1)               # increase contrast
  })
}
# Iterate through documents and pages to produce the summary report.
# Matches are accumulated in preallocated lists and bound once with bind_rows():
# growing a data frame with rbind.fill() inside three nested loops is O(n^2) and was
# one of the main bottlenecks of the original version.
doc_results <- vector("list", length(pdf_files))
for (pdf in seq_along(pdf_files)) {
  page_results <- vector("list", length(pdf_images_l[[pdf]]))
  for (image in seq_along(pdf_images_l[[pdf]])) {
    # OCR the page, then split the recognized text into one sentence per row.
    sentences <- ocr(pdf_images_l[[pdf]][[image]],
                     engine = tesseract(language = "eng")) %>%
      tokenize_sentences()
    content_df <- data.frame(text = as.character(sentences[[1]]),
                             stringsAsFactors = FALSE)
    # Look for each keyword in the sentences. fixed() does a literal (non-regex)
    # match; ignore_case must be a logical — the original passed the string "T".
    keyword_hits <- lapply(keywords, function(k) {
      hits <- filter(content_df, str_detect(text, fixed(k, ignore_case = TRUE)))
      if (nrow(hits) > 0) {
        hits$keyword <- k # record which keyword matched
      }
      hits
    })
    page_df <- bind_rows(keyword_hits)
    if (nrow(page_df) > 0) {
      page_df$page <- image # page number where the keywords were found
    }
    page_results[[image]] <- page_df
  }
  doc_df <- bind_rows(page_results)
  if (nrow(doc_df) > 0) {
    # Derive the document name from the first page image name (strips "_1.png" etc.).
    doc_df$file_name <- sub("_1.*", "", pdf_pages_l[[pdf]][1])
  }
  doc_results[[pdf]] <- doc_df
}
final_output <- bind_rows(doc_results)
# Guarantee the expected columns (in order) even when nothing matched.
if (nrow(final_output) == 0) {
  final_output <- data.frame(file_name = character(), text = character(),
                             page = integer(), keyword = character(),
                             stringsAsFactors = FALSE)
}
final_output <- final_output[, c("file_name", "text", "page", "keyword")]
# Remove all the intermediate page images created above.
file.remove(list.files(getwd(), full.names = TRUE))
################# OUTPUT PRINT OUT ##########################
setwd(folder) # go back to the parent folder
print(final_output) # show the summary in the console
# row.names = FALSE avoids writing a spurious unnamed row-number column to the csv.
write.csv(final_output, "pdf_output.csv", row.names = FALSE) # overwrites existing csv file