Code to speed up a keyword finder using pdftools / tesseract

I am an avid reader of the forum, and I am learning to program in R.

I would like your feedback on some code that uses tesseract to search a list of PDF files for keywords. I am posting the complete code in case anyone finds it useful or wants to contribute to it.

At the moment the code is quite slow: scanning a 7-page PDF takes 3-4 minutes. I would like to know if there is anything I can do to speed it up. Some bottlenecks I have noticed are:

  • An image of every page has to be created and saved to a folder so that magick can read it (a possible in-memory alternative is sketched after this list)
  • Several nested loops are used to produce the summary (looping over keywords, pages, and documents); a vectorized sketch is included near the end of the code
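
One idea I have not benchmarked yet for the first bottleneck: pdftools can render a page into an in-memory bitmap with pdf_render_page(), and magick's image_read() accepts that bitmap directly, so the round trip through the images folder could be dropped entirely. A minimal sketch (render_pages is just a name I made up):

library(pdftools)
library(magick)

render_pages <- function(pdf_path, dpi = 600) { #render all pages of one PDF without touching disk
  n_pages <- pdf_info(pdf_path)$pages #page count from the PDF metadata
  lapply(seq_len(n_pages), function(p) {
    image_read(pdf_render_page(pdf_path, page = p, dpi = dpi)) #bitmap straight into magick
  })
}

And if some PDFs already contain a text layer, pdftools::pdf_text() returns that text directly, so OCR could be skipped for those files altogether.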
Thank you all!

################ REQUIRED PACKAGES #######################

library(plyr)       #rbind.fill()
library(dplyr)      #filter() and the %>% pipe
library(officer)    #(not used in this script)
library(stringr)    #str_detect(), fixed()
library(tesseract)  #OCR engine
library(pdftools)   #pdf_convert()
library(readOffice) #(not used in this script)
library(magick)     #image_read() and image preprocessing
library(tokenizers) #tokenize_sentences(), required below

################# WORKING DIRECTORY #######################

folder <- "path/to/pdf/folder" #placeholder path: set this to the directory that holds the PDFs
setwd(folder)


################# KEY WORD SEARCH OPTIONS #################

#Use csv list of keywords

keywords <- as.character(read.csv("keywords.csv")[,1]) #first column of keywords.csv: one keyword per row


################# PDF CONVERSION AND TEXT EXTRACTION ####################################

#List the files in search folder

pdf_files <- list.files(path = folder, full.names = TRUE, pattern = "\\.pdf$") #paths of all the pdf files saved in folder; "\\.pdf$" anchors the extension
                          
#Converts the pdf to images using pdftools and saves all the images in a list 

pdf_pages_l <- list() #initialize the list of PDF pages per pdf
pdf_images_l <- list() #initialize the list of images  per pdf (1 image per pdf page)

setwd("./images")

for (pdf in seq_along(pdf_files)) {
  pdf_pages_l[[pdf]] <- pdftools::pdf_convert(pdf_files[pdf], dpi = 600) #creates one image per pdf page and stores the image paths in pdf_pages_l
  
}

for (pdf in seq_along(pdf_files)) {

  pdf_images_l[[pdf]] <- lapply(pdf_pages_l[[pdf]], FUN = image_read) #loads the images saved in the previous step using magick

}

for (pdf in seq_along(pdf_images_l)) { #applies formatting changes to make the images more readable by tesseract (using suggestions from https://shiring.github.io/text_analysis/2017/07/17/ocr_tesseract)
  
      pdf_images_l[[pdf]] <- lapply(pdf_images_l[[pdf]], FUN = function(image){
      image_scale(image, "x2000") %>%                 # rescale
      image_background("white", flatten = TRUE) %>%   # set background to white
      image_trim() %>%                                # Trim edges that are the background color from the image.
      image_noise() %>%                               # Reduce noise in image using a noise peak elimination filter
      image_enhance() %>%                             # Enhance image (minimize noise)
      image_normalize() %>%                           # Normalize image (increase contrast by normalizing the pixel values to span the full range of color values).
      image_contrast(sharpen = 1)                     # Increase contrast
    
  })
  
}

#iteration through documents and images to produce summary report

final_output = data.frame(matrix(ncol=4, nrow=0))
colnames(final_output) = c("file_name","text","page","keyword")
page_content = final_output #initialization of page_content 
doc_content = final_output  #initialization of doc_content
relevant_content = final_output #initialization of relevant_content
                                                     

for (pdf in seq_along(pdf_files)) {
  for (image in seq_along(pdf_images_l[[pdf]])) { 

    content <- ocr(pdf_images_l[[pdf]][[image]], #reads text in image and separates sentences (1 row per sentence)
                   engine = tesseract(language = "eng")) %>% tokenize_sentences()
  
    content_df <- data.frame(content[[1]]) #converts output to a data frame of 1 col
    colnames(content_df) = "text" #adds a name to the column
    
        for(k in keywords){
                k_match <- filter(content_df, str_detect(content_df[,1], fixed(k, ignore_case = TRUE))) #looks for the keyword in the data frame, ignoring case
                k_match$keyword <- rep(k, nrow(k_match)) #add a column with the keyword that was found
                k_match$text <- as.character(k_match$text) #converts the 'text' to a character vector (instead of factor)
                page_content <- rbind.fill(page_content, k_match) #add the records to the existing list of keywords found in that page
              
                }
          
      
    page_content$page <- rep(image, nrow(page_content)) #add the page number where the previous keywords were found
    doc_content <- rbind.fill(doc_content, page_content) #add page info into the document info
    page_content <- final_output #re-initializes page content  before moving to the next page (final_output is still an empty df)
        
    }
    
  
  doc_content$file_name <- rep(sub("_1.*", "", pdf_pages_l[[pdf]][1]), nrow(doc_content)) #derive the document's file name from its first image path (strips the "_1..." page suffix)
  relevant_content <- rbind.fill(doc_content, relevant_content)  #add the document content to the final output
  doc_content <- final_output #re-initializes  document content before moving to the next document (final_output is still an empty df)
  
}

final_output <- relevant_content #populate final_output with the results
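
#An alternative I have been considering for the nested keyword loop above (sketch only;
#match_keywords is my own helper name and is not called anywhere in this script):
#str_detect() is vectorized over the sentences, so the per-keyword filter() can be
#collapsed into one pass. Since the OCR call dominates the runtime, this probably
#tidies the code more than it speeds it up.
match_keywords <- function(sentences, keywords) {
  hits <- lapply(keywords, function(k) {
    data.frame(text = sentences[str_detect(sentences, fixed(k, ignore_case = TRUE))], #sentences containing k
               keyword = k,
               stringsAsFactors = FALSE)
  })
  do.call(rbind, hits) #one row per sentence/keyword match
}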

all_images <- list.files(getwd(), full.names = TRUE) #list all the images created
file.remove(all_images) #delete the intermediate images


################# OUTPUT PRINT OUT ##########################

setwd(folder) #goes back to parent folder
print(final_output) #prints output
write.csv(final_output, 'pdf_output.csv') #overwrites existing csv file