Python:将 PDF 转换为文本时,行和单词被断开
我想通过 PyPDF2 将 PDF 文件转换为文本,但转换后的文本看起来与 PDF 文件不同。具体来说,PDF 中的一行在文本中被拆分成多行,单词也可能被截断。附件是 PDF 和文本文件,我使用的代码如下。有人能帮我解决这个问题吗?(标签:python, python-3.x, pypdf2, pypdf)
enter code here
import PyPDF2
def extractPdfText(filePath=''):
    """Return the text of every page of a PDF, concatenated in page order.

    Prints the total page count as a side effect.

    Args:
        filePath: Path of the PDF file to read.

    Returns:
        A single string made of each page's ``extractText()`` result,
        joined without any separator.

    Raises:
        OSError: If ``filePath`` cannot be opened for reading.

    NOTE(review): ``PdfFileReader``/``numPages``/``getPage``/``extractText``
    are the camelCase PyPDF2 names this file already uses; confirm they
    exist in the installed PyPDF2 version.
    """
    # Open in binary mode; the context manager guarantees the handle is
    # closed even when parsing fails (it used to be left open).
    with open(filePath, 'rb') as fileObject:
        pdfFileReader = PyPDF2.PdfFileReader(fileObject)
        totalPageNumber = pdfFileReader.numPages
        print('This pdf file contains totally ' + str(totalPageNumber) + ' pages.')
        # Extract while the file is still open, since the reader pulls
        # page data from the underlying file object.
        pageTexts = [
            pdfFileReader.getPage(pageNumber).extractText()
            for pageNumber in range(totalPageNumber)
        ]
    return ''.join(pageTexts)
# Example usage: replace the placeholder below with the path of a real PDF.
pdfFilePath = 'PDF file path'
# Holds the string returned by extractPdfText for that file.
pdfText = extractPdfText(pdfFilePath)
在此处输入代码
导入PyPDF2
def extractPdfText(文件路径=“”):
#以读取二进制模式打开pdf文件。
fileObject=open(filePath,'rb')#rb
#创建一个pdf阅读器。
PdfileReader=PyPDF2.PdfileReader(文件对象)
#获取pdf总页码。
totalPageNumber=PdfileReader.numPages
#打印pdf总页码。
打印('此pdf文件总共包含'+str(totalPageNumber)+'页')
currentPageNumber=0
文本=“”
#循环所有pdf页面。
而(currentPageNumber
此答案使用encode('utf-8')将每页的输出保持在一起。我不知道您需要什么输出,因为您的问题中没有具体说明
from PyPDF2 import PdfFileReader
def pdf_text_extractor(path):
    """Print the text of each page of the PDF at ``path`` as UTF-8 bytes.

    One ``print`` call is made per page, in page order; nothing is returned.

    Args:
        path: Path of the PDF file to read.
    """
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        for page_index in range(pdf.numPages):
            page_text = pdf.getPage(page_index).extractText()
            # Printing the encoded bytes shows newlines as escaped "\n",
            # so each page's output stays on a single line.
            # Alternative: page_text.split('\n') yields a list of lines,
            # but it does not keep paragraphs together.
            print(page_text.encode('utf-8'))
# Example usage: print the extracted text of 'mypdf.pdf', one page per call.
path = 'mypdf.pdf'
pdf_text_extractor(path)
这意味着提取与PDF格式文本完全相同的文本可能会有问题
您可以使用tika来完成此任务,但它也不会完全干净
from tika import parser
# Parse the PDF with tika. xmlContent=True is passed through to from_file
# (presumably requests XHTML rather than plain text; confirm in tika docs).
parse_entire_pdf = parser.from_file('mypdf.pdf', xmlContent=True)
# Keep only the 'content' entry of the parse result.
parse_entire_pdf = parse_entire_pdf['content']
print (parse_entire_pdf)
真正的问题是——您打算如何使用提取的文本?这是我将如何做到的
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
#converts pdf, returns its text content as a string
def convert(fname, pages=None):
    """Convert a PDF file to text with pdfminer and return it as a string.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to extract. When it is
            ``None`` or empty, an empty set is passed to
            ``PDFPage.get_pages`` (presumably meaning "all pages";
            confirm against the pdfminer documentation).

    Returns:
        The text written by ``TextConverter`` for the processed pages.

    Raises:
        OSError: If ``fname`` cannot be opened for reading.
    """
    pagenums = set(pages) if pages else set()

    # Use the StringIO name the file actually imports; the previous
    # ``io.StringIO()`` raised NameError because ``io`` is never imported.
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # Context manager closes the input file even if a page fails.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer after the converter is closed and before the
        # ``with`` block closes the buffer itself (the old ``output.close``
        # lacked parentheses and never ran).
        return output.getvalue()
#converts all pdfs in directory pdfDir, saves all resulting txt files to txtdir
def convertMultiple(pdfDir, txtDir):
    """Convert every PDF in ``pdfDir`` to a text file in ``txtDir``.

    Each ``name.pdf`` produces ``name.pdf.txt`` holding the string returned
    by ``convert``.

    Args:
        pdfDir: Directory to scan; an empty string means the current
            working directory. A trailing separator is optional.
        txtDir: Directory that receives the ``.txt`` files. A trailing
            separator is optional.

    Raises:
        OSError: If ``pdfDir`` cannot be listed or an output file cannot
            be written.
    """
    if pdfDir == "":
        # No directory given: fall back to the current working directory.
        pdfDir = os.getcwd()

    for pdf in os.listdir(pdfDir):
        # Compare the real extension, case-insensitively. The old
        # ``split(".")[-1]`` test also matched an extension-less file
        # literally named "pdf".
        if os.path.splitext(pdf)[1].lower() != ".pdf":
            continue
        # os.path.join works with or without a trailing separator and
        # avoids hard-coding the Windows backslash.
        text = convert(os.path.join(pdfDir, pdf))
        textFilename = os.path.join(txtDir, pdf + ".txt")
        # Explicit UTF-8 so non-ASCII PDF text does not depend on the
        # locale encoding; ``with`` closes (and flushes) the file.
        with open(textFilename, "w", encoding="utf-8") as textFile:
            textFile.write(text)
# Set paths accordingly: both values are placeholders to edit before running.
pdfDir = "C:/your_path_here/"
# NOTE(review): this path has a doubled slash ("C://"); likely a typo, confirm.
txtDir = "C://your_path_here/"
# Convert every PDF found in pdfDir and write the text files into txtDir.
convertMultiple(pdfDir, txtDir)
谢谢,生活是复杂的。我想得到类似 PDF 的输出,也就是说 PDF 中的一行在文本中也保存为一行,而不是被拆分成几行。您可以使用此文件进行实验。我将这些文本用于情感分析。换行并不重要,但有时一个单词会被拆成两部分,当该单词很重要时就会导致问题。——其他答案也可以用于情感分析,但您必须做一些数据清理,以处理 PyPDF2 输出中被拆分的单词。P.S. 欢迎来到 Stack Overflow!如果您需要与此问题相关的任何其他信息,请告知。祝编码愉快!——谢谢,运行得很完美。
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
#converts pdf, returns its text content as a string
def convert(fname, pages=None):
    """Convert a PDF file to text with pdfminer and return it as a string.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to extract. When it is
            ``None`` or empty, an empty set is passed to
            ``PDFPage.get_pages`` (presumably meaning "all pages";
            confirm against the pdfminer documentation).

    Returns:
        The text written by ``TextConverter`` for the processed pages.

    Raises:
        OSError: If ``fname`` cannot be opened for reading.
    """
    pagenums = set(pages) if pages else set()

    # Use the StringIO name the file actually imports; the previous
    # ``io.StringIO()`` raised NameError because ``io`` is never imported.
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # Context manager closes the input file even if a page fails.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer after the converter is closed and before the
        # ``with`` block closes the buffer itself (the old ``output.close``
        # lacked parentheses and never ran).
        return output.getvalue()
#converts all pdfs in directory pdfDir, saves all resulting txt files to txtdir
def convertMultiple(pdfDir, txtDir):
    """Convert every PDF in ``pdfDir`` to a text file in ``txtDir``.

    Each ``name.pdf`` produces ``name.pdf.txt`` holding the string returned
    by ``convert``.

    Args:
        pdfDir: Directory to scan; an empty string means the current
            working directory. A trailing separator is optional.
        txtDir: Directory that receives the ``.txt`` files. A trailing
            separator is optional.

    Raises:
        OSError: If ``pdfDir`` cannot be listed or an output file cannot
            be written.
    """
    if pdfDir == "":
        # No directory given: fall back to the current working directory.
        pdfDir = os.getcwd()

    for pdf in os.listdir(pdfDir):
        # Compare the real extension, case-insensitively. The old
        # ``split(".")[-1]`` test also matched an extension-less file
        # literally named "pdf".
        if os.path.splitext(pdf)[1].lower() != ".pdf":
            continue
        # os.path.join works with or without a trailing separator and
        # avoids hard-coding the Windows backslash.
        text = convert(os.path.join(pdfDir, pdf))
        textFilename = os.path.join(txtDir, pdf + ".txt")
        # Explicit UTF-8 so non-ASCII PDF text does not depend on the
        # locale encoding; ``with`` closes (and flushes) the file.
        with open(textFilename, "w", encoding="utf-8") as textFile:
            textFile.write(text)
# Set paths accordingly: both values are placeholders to edit before running.
pdfDir = "C:/your_path_here/"
# NOTE(review): this path has a doubled slash ("C://"); likely a typo, confirm.
txtDir = "C://your_path_here/"
# Convert every PDF found in pdfDir and write the text files into txtDir.
convertMultiple(pdfDir, txtDir)