使用Python PDFMiner将多个PDF提取到文本文件的循环脚本_Python_Loops_Pdf_Pdfminer

使用Python PDFMiner将多个PDF提取到文本文件的循环脚本

python loops pdf

使用Python PDFMiner将多个PDF提取到文本文件的循环脚本,python,loops,pdf,pdfminer,Python,Loops,Pdf,Pdfminer,谢谢你的帮助。我发现这个示例脚本用于将PDF提取到文本文件：这是有效的，这可能是我发现的最准确的提取方法。我想对它进行编辑，使其在多个PDF中循环，并将它们写入多个文本文件，所有这些文件都与创建它们的PDF同名。我正在努力这样做，要么只写一个文本文件，要么覆盖我试图提取的PDF文件。有人能帮我做一个循环，在一个文件夹中循环所有PDF文件，并将它们提取到与PDF同名的单个文本文件中吗提前感谢您的帮助 import os from pdfminer.pdfparser impo

谢谢你的帮助。我找到了下面这个用于将PDF提取为文本文件的示例脚本：

这个脚本可以正常工作，而且可能是我找到的最准确的提取方法。我想对它进行修改，使其循环处理多个PDF，并把它们分别写入多个文本文件，每个文本文件都与生成它的PDF同名。但我一直没能做到：我的尝试要么只写出一个文本文件，要么把我想要提取的PDF文件本身覆盖掉。有人能帮我写一个循环，遍历某个文件夹中的所有PDF文件，并把每个PDF提取到与其同名的独立文本文件中吗？

提前感谢您的帮助

    # Extract the text of a single PDF (my_file) with PDFMiner and write it,
    # UTF-8 encoded, to a text file (log_file).
    import os
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    # PDFResourceManager and PDFPageInterpreter both come from pdfinterp
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice
    # Raised below when the document does not allow text extraction
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.layout import LAParams, LTTextBox, LTTextLine
    from pdfminer.converter import PDFPageAggregator

    # Folder that holds both the input PDF and the output text file
    base_path = "C://some_folder"

    # NOTE: os.path.join is given one already-concatenated string here, so it
    # returns that string unchanged; the "/" separator is hard-coded.
    my_file = os.path.join(base_path + "/" + "test_pdf.pdf")
    log_file = os.path.join(base_path + "/" + "pdf_log.txt")

    # Empty password for unencrypted PDFs; extracted_text accumulates the output
    password = ""
    extracted_text = ""

    # Open the PDF file for reading in binary mode
    fp = open(my_file, "rb")

    # Create a parser object bound to the open file
    parser = PDFParser(fp)

    # Build the PDFDocument from the parser, supplying the password
    document = PDFDocument(parser, password)

    # Abort if the document does not permit text extraction
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Create PDFResourceManager object that stores shared resources such as fonts or images
    rsrcmgr = PDFResourceManager()

    # Layout-analysis parameters (library defaults)
    laparams = LAParams()

    # Create a PDFDevice object which translates interpreted information into desired format
    # Device needs to be connected to resource manager to store shared resources
    # device = PDFDevice(rsrcmgr)
    # Use a page aggregator as the device so that LT layout objects can be retrieved
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)

    # Create interpreter object to process page content from PDFDocument
    # Interpreter needs to be connected to resource manager for shared resources and device
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Everything is set up; process the document page by page
    for page in PDFPage.create_pages(document):
         # The interpreter processes the page and feeds the result to the device
         interpreter.process_page(page)
         # Fetch the layout the device built for this page
         layout = device.get_result()
         # Of the LT objects in the layout, only LTTextBox and LTTextLine are kept
         for lt_obj in layout:
             if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                   extracted_text += lt_obj.get_text()

    # Close the PDF file (not reached if an exception was raised above)
    fp.close()

    # print (extracted_text.encode("utf-8"))

    # Write the collected text as UTF-8 bytes; "wb" overwrites any existing file
    with open(log_file, "wb") as my_log:
        my_log.write(extracted_text.encode("utf-8"))
    print("Done !!")

脚本作者在开始时使用两个参数指定输入和输出文件：

my_file

和

log_file

您可以将脚本转换为将这些作为输入并执行提取的函数，然后多次循环此函数

# import statements as in the original script (this snippet needs at least `os`)
base_path = "C://some_folder"

# Input PDFs and the text file each one is written to.  The two tuples are
# paired by position, so they must have the same length.
my_files = ("pdf1.pdf", "pdf2.pdf")
log_files = ("log1.txt", "log2.txt")

# Expand every bare file name into a full path under base_path.
my_files = [os.path.join(base_path, x) for x in my_files]
log_files = [os.path.join(base_path, x) for x in log_files]


def extract(my_file, log_file):
    """Extract the text of the PDF at my_file and write it to log_file.

    Placeholder: paste the extraction code from the original script here,
    using the my_file / log_file parameters in place of the module-level
    variables of the same names.  Until that is done this function does
    nothing and returns None.
    """


# zip() walks both lists in lockstep, pairing each PDF with its output file,
# so no index bookkeeping is needed.
for pdf_path, txt_path in zip(my_files, log_files):
    extract(pdf_path, txt_path)

您还应该查阅

os.path.join

的文档，因为您目前的用法并非最佳实践（更换操作系统时可能会出错）。

脚本作者在开始时使用两个参数指定输入和输出文件：

my_file

和

log_file

您可以将脚本转换为将这些作为输入并执行提取的函数，然后多次循环此函数

# import statements as in the original script (this snippet needs at least `os`)
base_path = "C://some_folder"

# Input PDFs and the text file each one is written to.  The two tuples are
# paired by position, so they must have the same length.
my_files = ("pdf1.pdf", "pdf2.pdf")
log_files = ("log1.txt", "log2.txt")

# Expand every bare file name into a full path under base_path.
my_files = [os.path.join(base_path, x) for x in my_files]
log_files = [os.path.join(base_path, x) for x in log_files]


def extract(my_file, log_file):
    """Extract the text of the PDF at my_file and write it to log_file.

    Placeholder: paste the extraction code from the original script here,
    using the my_file / log_file parameters in place of the module-level
    variables of the same names.  Until that is done this function does
    nothing and returns None.
    """


# zip() walks both lists in lockstep, pairing each PDF with its output file,
# so no index bookkeeping is needed.
for pdf_path, txt_path in zip(my_files, log_files):
    extract(pdf_path, txt_path)

您还应该查阅

os.path.join

的文档，因为您目前的用法并非最佳实践（更换操作系统时可能会出错）。

假设您具有以下目录结构：

script.py
pdfs
├─a.pdf
├─b.pdf
└─c.pdf
txts

其中

script.py

是Python脚本，

pdfs

是包含PDF文档的文件夹，

txts

是提取的文本文件应该放在的空文件夹

我们可以使用

pathlib.Path.glob

来发现给定目录中所有PDF文档的路径。我们对路径进行迭代，对于每个路径，我们打开相应的PDF文档，对其进行解析，提取文本，并将文本保存在

txts

文件夹中的文本文档（同名）中

def main():
    """Extract the text of every PDF in ``pdfs/`` into ``txts/<name>.txt``.

    Each ``*.pdf`` file directly inside the ``pdfs`` directory (relative to
    the current working directory) is parsed with PDFMiner.  The text of its
    LTTextBox / LTTextLine layout objects is concatenated in page order and
    written, UTF-8 encoded, to a text file in ``txts`` that has the same stem
    as the PDF.  Documents that do not allow text extraction are skipped and
    no text file is written for them.

    Returns:
        0, which the ``__main__`` guard passes to ``sys.exit`` as the
        process exit status.
    """
    from pathlib import Path

    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTTextBox, LTTextLine
    from pdfminer.converter import PDFPageAggregator

    # Create the output folder up front so a missing "txts" directory does
    # not abort the run on the first write.
    out_dir = Path("txts")
    out_dir.mkdir(exist_ok=True)

    for path in Path("pdfs").glob("*.pdf"):
        with path.open("rb") as file:
            parser = PDFParser(file)
            document = PDFDocument(parser, "")
            if not document.is_extractable:
                # Extraction is not permitted for this document: skip it.
                continue

            manager = PDFResourceManager()
            device = PDFPageAggregator(manager, laparams=LAParams())
            interpreter = PDFPageInterpreter(manager, device)

            # Collect the pieces and join once at the end instead of
            # growing a string with repeated "+=".
            chunks = []
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                chunks.extend(
                    obj.get_text()
                    for obj in device.get_result()
                    if isinstance(obj, (LTTextBox, LTTextLine))
                )

        # Write with an explicit encoding: the platform default may be unable
        # to represent characters found in the PDF text.
        out_path = out_dir / "{}.txt".format(path.stem)
        with out_path.open("w", encoding="utf-8") as file:
            file.write("".join(chunks))
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())