Warning: file_get_contents(/data/phpspider/zhask/data//catemap/7/arduino/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python 从pdf转换为文本:行和字已断开_Python_Python 3.x_Pypdf2_Pypdf - Fatal编程技术网

Python 从pdf转换为文本:行和字已断开

Python 从pdf转换为文本:行和字已断开,python,python-3.x,pypdf2,pypdf,Python,Python 3.x,Pypdf2,Pypdf,我想通过PyPDF2将pdf文件转换为文本,但转换后的文本看起来与pdf文件不同。具体来说,PDF中的一行在文本中被分成多行,单词也可能被打断。附件是PDF和文本文件,我得到的代码如下。有人能帮我解决这个问题吗 enter code here import PyPDF2 def extractPdfText(filePath=''): # Open the pdf file in read binary mode. fileObject = open(filePath, 'rb') # r

我想通过PyPDF2将pdf文件转换为文本,但转换后的文本看起来与pdf文件不同。具体来说,PDF中的一行在文本中被分成多行,单词也可能被打断。附件是PDF和文本文件,我得到的代码如下。有人能帮我解决这个问题吗

enter code here

import PyPDF2

def extractPdfText(filePath=''):
    """Extract the text of every page of a PDF file with PyPDF2.

    Prints the total page count, then collects the text of each page in
    page order.

    Args:
        filePath: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated into one string (an empty
        string when the document has no pages).
    """
    # Open in binary mode and keep the file open while the pages are read;
    # the ``with`` block guarantees it is closed afterwards, even on error.
    with open(filePath, 'rb') as fileObject:
        pdfFileReader = PyPDF2.PdfFileReader(fileObject)

        totalPageNumber = pdfFileReader.numPages
        print('This pdf file contains totally ' + str(totalPageNumber) + ' pages.')

        pageTexts = [
            pdfFileReader.getPage(pageNumber).extractText()
            for pageNumber in range(totalPageNumber)
        ]

    # Return only after the loop has visited every page; returning from
    # inside the loop would hand back the first page alone.
    return ''.join(pageTexts)

# Placeholder: replace with the path of the PDF file to convert.
pdfFilePath = 'PDF file path'

# Run the extraction and keep the resulting text in pdfText.
pdfText = extractPdfText(pdfFilePath)
在此处输入代码
导入PyPDF2
def extractPdfText(文件路径=“”):
#以读取二进制模式打开pdf文件。
fileObject=open(filePath,'rb')#rb
#创建一个pdf阅读器。
PdfileReader=PyPDF2.PdfileReader(文件对象)
#获取pdf总页码。
totalPageNumber=PdfileReader.numPages
#打印pdf总页码。
打印('此pdf文件总共包含'+str(totalPageNumber)+'页')
currentPageNumber=0
文本=“”
#循环所有pdf页面。
而(currentPageNumber

此答案使用encode('utf-8')将每页的输出保持在一起。我不知道您需要什么输出,因为您的问题中没有具体说明

from PyPDF2 import PdfFileReader

def pdf_text_extractor(path):
    """Print the text of each page of the PDF at *path* as UTF-8 bytes.

    Every page is emitted with its own ``print`` call, as the ``bytes``
    object produced by ``str.encode('utf-8')``. Nothing is returned.
    """
    with open(path, 'rb') as pdf_file:
        reader = PdfFileReader(pdf_file)

        # Visit the pages in order, from 0 up to the reader's page count.
        for page_index in range(reader.numPages):
            page_text = reader.getPage(page_index).extractText()

            # Printing a bytes object shows its repr (b'...'), so newlines
            # appear as escapes and each page ends up on a single line.
            # NOTE: splitting the text on '\n' instead would give a list of
            # lines, but that does not keep paragraphs together.
            print(page_text.encode('utf-8'))

# Example run: print the text of every page of 'mypdf.pdf'.
path = 'mypdf.pdf'
pdf_text_extractor(path)
这意味着提取与PDF格式文本完全相同的文本可能会有问题

您可以使用tika来完成此任务,但它也不会完全干净

from tika import parser

# Parse the whole PDF with Tika, requesting XML content, and keep only the
# 'content' entry of the parser's result before printing it.
parse_entire_pdf = parser.from_file('mypdf.pdf', xmlContent=True)['content']
print(parse_entire_pdf)

真正的问题是——您打算如何使用提取的文本?

这是我将如何做到的

from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt

#converts pdf, returns its text content as a string
def convert(fname, pages=None):
    """Convert a PDF file to text with pdfminer.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to extract. When it is
            empty or ``None`` an empty set is handed to
            ``PDFPage.get_pages`` (presumably meaning "all pages" --
            confirm against the pdfminer documentation).

    Returns:
        The extracted text as a single string.
    """
    pagenums = set(pages) if pages else set()

    # ``StringIO`` is imported by name at the top of the file; no ``io``
    # module name is in scope, so ``io.StringIO()`` would raise NameError.
    output = StringIO()
    try:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # The ``with`` block closes the input even if a page fails.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the outer ``finally`` closes it.
        return output.getvalue()
    finally:
        output.close()

#converts all pdfs in directory pdfDir, saves all resulting txt files to txtdir
def convertMultiple(pdfDir, txtDir):
    """Convert every ``.pdf`` file in *pdfDir* to a text file in *txtDir*.

    Args:
        pdfDir: Directory to scan for PDF files. An empty string means the
            current working directory.
        txtDir: Directory that receives the output. Each output file is
            named after its source with ``.txt`` appended
            (``a.pdf`` -> ``a.pdf.txt``).
    """
    if pdfDir == "":
        pdfDir = os.getcwd()

    for pdf in os.listdir(pdfDir):
        # splitext inspects the real extension, so an extensionless file
        # that is merely named "pdf" is not mistaken for a PDF.
        if os.path.splitext(pdf)[1] != ".pdf":
            continue

        # os.path.join works with or without a trailing separator on the
        # directory arguments and on every platform.
        text = convert(os.path.join(pdfDir, pdf))
        textFilename = os.path.join(txtDir, pdf + ".txt")

        # Close (and thereby flush) each output file; write UTF-8 so text
        # outside the platform's default encoding does not fail to encode.
        with open(textFilename, "w", encoding="utf-8") as textFile:
            textFile.write(text)

# Placeholder directories: point pdfDir at the folder holding the PDFs and
# txtDir at the folder that receives the .txt files, then run the batch.
pdfDir = "C:/your_path_here/"
txtDir = "C://your_path_here/"
convertMultiple(pdfDir, txtDir)

谢谢,生活是复杂的。我想得到与pdf类似的输出,也就是说,pdf中的一行在文本中也保存为一行,而不是被拆成几行。您可以使用此文件进行实验。我使用这些文本进行情感分析。换行并不重要,但有时一个单词会被拆成两部分,当该单词很重要时就会出问题。其他答案可以用于情感分析,但您必须做一些数据清理,以处理PyPDF2输出中被拆分的单词。P.S. 欢迎来到Stack Overflow!如果您需要与此问题相关的任何其他信息,请告知。祝编码愉快!谢谢,运行得很完美。
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt

#converts pdf, returns its text content as a string
def convert(fname, pages=None):
    """Convert a PDF file to text with pdfminer.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to extract. When it is
            empty or ``None`` an empty set is handed to
            ``PDFPage.get_pages`` (presumably meaning "all pages" --
            confirm against the pdfminer documentation).

    Returns:
        The extracted text as a single string.
    """
    pagenums = set(pages) if pages else set()

    # ``StringIO`` is imported by name at the top of the file; no ``io``
    # module name is in scope, so ``io.StringIO()`` would raise NameError.
    output = StringIO()
    try:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # The ``with`` block closes the input even if a page fails.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the outer ``finally`` closes it.
        return output.getvalue()
    finally:
        output.close()

#converts all pdfs in directory pdfDir, saves all resulting txt files to txtdir
def convertMultiple(pdfDir, txtDir):
    """Convert every ``.pdf`` file in *pdfDir* to a text file in *txtDir*.

    Args:
        pdfDir: Directory to scan for PDF files. An empty string means the
            current working directory.
        txtDir: Directory that receives the output. Each output file is
            named after its source with ``.txt`` appended
            (``a.pdf`` -> ``a.pdf.txt``).
    """
    if pdfDir == "":
        pdfDir = os.getcwd()

    for pdf in os.listdir(pdfDir):
        # splitext inspects the real extension, so an extensionless file
        # that is merely named "pdf" is not mistaken for a PDF.
        if os.path.splitext(pdf)[1] != ".pdf":
            continue

        # os.path.join works with or without a trailing separator on the
        # directory arguments and on every platform.
        text = convert(os.path.join(pdfDir, pdf))
        textFilename = os.path.join(txtDir, pdf + ".txt")

        # Close (and thereby flush) each output file; write UTF-8 so text
        # outside the platform's default encoding does not fail to encode.
        with open(textFilename, "w", encoding="utf-8") as textFile:
            textFile.write(text)

# Placeholder directories: point pdfDir at the folder holding the PDFs and
# txtDir at the folder that receives the .txt files, then run the batch.
pdfDir = "C:/your_path_here/"
txtDir = "C://your_path_here/"
convertMultiple(pdfDir, txtDir)