Python 如何通过OCR将扫描的PDF获取到Excel？_Python_Python 3.x_Ocr_Tesseract

Python 如何通过OCR将扫描的PDF获取到Excel？

python python-3.x

Python 如何通过OCR将扫描的PDF获取到Excel？,python,python-3.x,ocr,tesseract,Python,Python 3.x,Ocr,Tesseract,我使用的代码是：- import os import tempfile import subprocess def ocr(path): process = subprocess.Popen( ['tesseract', path, 'stdout'], stdout=subprocess.PIPE, stderr=subprocess.PIPE ) stdout, stderr = process.communica

我使用的代码是：-

import os
import tempfile
import subprocess

def ocr(path):

    process = subprocess.Popen(
        ['tesseract', path, 'stdout'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )

    stdout, stderr = process.communicate()

    if stderr: # handle errors anyway you want, I'll just raise exception as example
        raise RuntimeError(f'tesserect failed with: {stderr.decode()}')
    else: # success
        return stdout.decode()

str = ocr(r'C:/Users/renu.sharma/Documents/Ragini/File.pdf')
print(str)

错误是：-

FileNotFoundError                         Traceback (most recent call last)
<ipython-input-41-6838d49f4a48> in <module>
     18         return stdout.decode()
     19 
---> 20 str = ocr(r'C:/Users/renu.sharma/Documents/Ragini/File.pdf')
     21 #print(str)
     22 

<ipython-input-41-6838d49f4a48> in ocr(path)
      8         ['tesseract', path, 'stdout'],
      9         stdout=subprocess.PIPE,
---> 10         stderr=subprocess.PIPE
     11     )
     12 

~\AppData\Local\Continuum\anaconda3\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
    767                                 c2pread, c2pwrite,
    768                                 errread, errwrite,
--> 769                                 restore_signals, start_new_session)
    770         except:
    771             # Cleanup if the child failed starting.

~\AppData\Local\Continuum\anaconda3\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
   1170                                          env,
   1171                                          os.fspath(cwd) if cwd is not None else None,
-> 1172                                          startupinfo)
   1173             finally:
   1174                 # Child is launched. Close the parent's copy of those pipe

FileNotFoundError: [WinError 2] The system cannot find the file specified

这也给出了belwo错误

[WinError 267] The directory name is invalid: 'C:/Users/renu.sharma/Documents/Ragini/File.pdf'

我有一份各种名片的PDF扫描件&我正试图将这些详细信息输入Excel

下面是我迄今为止尝试过的代码

import os
import tempfile
import subprocess

def ocr(path):
    temp = tempfile.NamedTemporaryFile(delete=False)

    process = subprocess.Popen(['tesseract', path, temp.name], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    process.communicate()

    with open(temp.name + '.txt', 'r') as handle:
        contents = handle.read()

    os.remove(temp.name + '.txt')
    os.remove(temp.name)

    return contents

str = ocr(r'C:\Users\XXXX\Documents\XXXX\file_Name')
print(str)

下面是我得到的错误：

FileNotFoundError: [WinError 2] The system cannot find the file specified

这似乎是一个非常简单的问题，但我无法找出问题所在。

我只想完全避免使用临时文件，因此您可以执行以下操作：

import os
import tempfile
import subprocess

def ocr(path):

    process = subprocess.Popen(
        ['tesseract', path, 'stdout'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )

    stdout, stderr = process.communicate()

    if stderr: # handle errors anyway you want, I'll just raise exception as example
        raise RuntimeError(f'tesserect failed with: {stderr.decode()}')
    else: # success
        return stdout.decode()

是否需要使用

临时文件

？我相信在Windows上，其他进程可能无法从中读取数据。还有其他方法使用tempfile吗？我有多个PDF&需要从中提取文本，是否有可能实现上述以外的其他方式，我很高兴了解！！仍显示以下错误：FileNotFoundError:[WinError 2]系统找不到该文件specified@renu发布完整的错误消息，显示您正在使用的代码。我已经用我正在使用的整个代码以及整个错误消息编辑了问题

import os
import tempfile
import subprocess

def ocr(path):

    process = subprocess.Popen(
        ['tesseract', path, 'stdout'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )

    stdout, stderr = process.communicate()

    if stderr: # handle errors anyway you want, I'll just raise exception as example
        raise RuntimeError(f'tesserect failed with: {stderr.decode()}')
    else: # success
        return stdout.decode()