Python 如何通过OCR将扫描的PDF获取到Excel?
我使用的代码是:-Python 如何通过OCR将扫描的PDF获取到Excel?,python,python-3.x,ocr,tesseract,Python,Python 3.x,Ocr,Tesseract,我使用的代码是:- import os import tempfile import subprocess def ocr(path): process = subprocess.Popen( ['tesseract', path, 'stdout'], stdout=subprocess.PIPE, stderr=subprocess.PIPE ) stdout, stderr = process.communica
import os
import tempfile
import subprocess
def ocr(path):
    """Run the ``tesseract`` command on *path* and return the recognized text.

    Args:
        path: Path of the input file handed to tesseract.

    Returns:
        Everything tesseract wrote to standard output, decoded as UTF-8.

    Raises:
        RuntimeError: If tesseract exits with a non-zero status; the
            message carries whatever tesseract wrote to standard error.
        FileNotFoundError: If the ``tesseract`` executable itself cannot
            be started (for example because it is not on ``PATH``).
    """
    process = subprocess.Popen(
        ['tesseract', path, 'stdout'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stdout, stderr = process.communicate()
    # Judge success by the exit status rather than by stderr being
    # non-empty: tesseract may print warnings or informational lines to
    # stderr even when recognition succeeded.
    if process.returncode != 0:
        details = stderr.decode(errors='replace')
        raise RuntimeError(f'tesseract failed with: {details}')
    return stdout.decode()
# Bind the result to a name that does not shadow the built-in ``str``.
# NOTE(review): the input here is a PDF; tesseract presumably expects an
# image file, so the PDF may need converting to images first - confirm
# against the Tesseract input-format documentation.
text = ocr(r'C:/Users/renu.sharma/Documents/Ragini/File.pdf')
print(text)
错误是:-
FileNotFoundError Traceback (most recent call last)
<ipython-input-41-6838d49f4a48> in <module>
18 return stdout.decode()
19
---> 20 str = ocr(r'C:/Users/renu.sharma/Documents/Ragini/File.pdf')
21 #print(str)
22
<ipython-input-41-6838d49f4a48> in ocr(path)
8 ['tesseract', path, 'stdout'],
9 stdout=subprocess.PIPE,
---> 10 stderr=subprocess.PIPE
11 )
12
~\AppData\Local\Continuum\anaconda3\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
767 c2pread, c2pwrite,
768 errread, errwrite,
--> 769 restore_signals, start_new_session)
770 except:
771 # Cleanup if the child failed starting.
~\AppData\Local\Continuum\anaconda3\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
1170 env,
1171 os.fspath(cwd) if cwd is not None else None,
-> 1172 startupinfo)
1173 finally:
1174 # Child is launched. Close the parent's copy of those pipe
FileNotFoundError: [WinError 2] The system cannot find the file specified
这也给出了以下错误:
[WinError 267] The directory name is invalid: 'C:/Users/renu.sharma/Documents/Ragini/File.pdf'
我有一份各种名片的PDF扫描件,我正试图将这些详细信息输入Excel。
下面是我迄今为止尝试过的代码
import os
import tempfile
import subprocess
def ocr(path):
    """Run ``tesseract`` on *path* via a temporary output file; return its text.

    tesseract is given an output base name inside a private temporary
    directory and writes its result to ``<base>.txt``. The directory and
    everything in it is removed before returning, whether or not the
    read succeeds.

    Args:
        path: Path of the input file handed to tesseract.

    Returns:
        The contents of the text file tesseract produced.

    Raises:
        FileNotFoundError: If the ``tesseract`` executable cannot be
            started, or if tesseract did not produce its output file. In
            the latter case the message includes tesseract's captured
            output so the real cause is visible.
    """
    # A temporary directory (rather than an open NamedTemporaryFile)
    # avoids leaking a file handle and guarantees cleanup on every path.
    with tempfile.TemporaryDirectory() as work_dir:
        output_base = os.path.join(work_dir, 'ocr')
        process = subprocess.Popen(
            ['tesseract', path, output_base],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        # stderr is merged into stdout above, so `log` holds all output.
        log, _ = process.communicate()
        try:
            # Read as UTF-8 explicitly instead of the locale default.
            # NOTE(review): assumes tesseract writes UTF-8 text - confirm.
            with open(output_base + '.txt', 'r', encoding='utf-8') as handle:
                return handle.read()
        except FileNotFoundError as err:
            details = log.decode(errors='replace')
            raise FileNotFoundError(
                f'tesseract produced no output file: {details}'
            ) from err
# Bind the result to a name that does not shadow the built-in ``str``.
text = ocr(r'C:\Users\XXXX\Documents\XXXX\file_Name')
print(text)
下面是我得到的错误:
FileNotFoundError: [WinError 2] The system cannot find the file specified
这似乎是一个非常简单的问题,但我无法找出问题所在。
回答:我建议完全避免使用临时文件,您可以执行以下操作:
import os
import tempfile
import subprocess
def ocr(path):
    """Return the text that ``tesseract`` recognizes in the file at *path*.

    tesseract is asked to write its result to standard output, so no
    temporary file is involved.

    Args:
        path: Path of the input file handed to tesseract.

    Returns:
        tesseract's standard output, decoded as UTF-8.

    Raises:
        RuntimeError: If tesseract exits with a non-zero status; the
            message carries whatever tesseract wrote to standard error.
        FileNotFoundError: If the ``tesseract`` executable itself cannot
            be started (for example because it is not on ``PATH``).
    """
    completed = subprocess.run(
        ['tesseract', path, 'stdout'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # The exit status is the failure signal. Text on stderr alone is not:
    # tesseract may emit warnings there while still succeeding.
    if completed.returncode != 0:
        details = completed.stderr.decode(errors='replace')
        raise RuntimeError(f'tesseract failed with: {details}')
    return completed.stdout.decode()
是否需要使用临时文件?我相信在Windows上,其他进程可能无法从中读取数据。
还有其他方法使用tempfile吗?我有多个PDF,需要从中提取文本,是否有可能以上述以外的其他方式实现?我很高兴了解!
仍显示以下错误:FileNotFoundError: [WinError 2] 系统找不到指定的文件
@renu 请发布完整的错误消息,并显示您正在使用的代码。
我已经用我正在使用的整个代码以及整个错误消息编辑了问题。
import os
import tempfile
import subprocess
def ocr(path):
    """OCR the file at *path* with the ``tesseract`` CLI and return its text.

    Args:
        path: Path of the input file handed to tesseract.

    Returns:
        The bytes tesseract printed on standard output, decoded as UTF-8.

    Raises:
        RuntimeError: If tesseract exits with a non-zero status; the
            message carries whatever tesseract wrote to standard error.
        FileNotFoundError: If the ``tesseract`` executable itself cannot
            be started (for example because it is not on ``PATH``).
    """
    command = ['tesseract', path, 'stdout']
    # The context manager closes both pipes once the child has finished.
    with subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    ) as process:
        stdout, stderr = process.communicate()
        exit_status = process.returncode
    # A non-empty stderr is not an error by itself (tesseract may print
    # warnings there on success); only a non-zero exit status is.
    if exit_status != 0:
        details = stderr.decode(errors='replace')
        raise RuntimeError(f'tesseract failed with: {details}')
    return stdout.decode()