Warning: file_get_contents(/data/phpspider/zhask/data//catemap/1/php/246.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Tesseract与PHP性能_Php_Zend Framework_Tesseract - Fatal编程技术网

Tesseract与PHP性能

Tesseract与PHP性能,php,zend-framework,tesseract,Php,Zend Framework,Tesseract,我用Laminas和Mezzio(前Zend Expressive)实现了一个API。这里有一个处理程序,它使用thiagalesio\TesseractOCR库()从PHP调用Tesseract 在我的开发环境中,一切正常。通过调用API获取图像文本需要2-6秒 现在,我首先将API部署到Google云虚拟机上,现在将其部署到Raspberry Pi 4 4GB RAM模型上。两个都很慢!请求-响应需要25-30秒。Tesseract似乎不是问题所在。如果我从CLI调用它,它会非常快。但是简单

我用Laminas和Mezzio(前Zend Expressive)实现了一个API。其中有一个处理程序,它使用thiagoalessio\TesseractOCR库从PHP调用Tesseract

在我的开发环境中,一切正常。通过调用API获取图像文本需要2-6秒

现在,我先把API部署到了Google云虚拟机上,随后又部署到Raspberry Pi 4(4GB RAM型号)上。两者都很慢!一次请求-响应需要25-30秒。Tesseract本身似乎不是问题所在:如果我从CLI调用它,速度非常快。而简单的API调用也并不慢!似乎是Laminas/Mezzio与Tesseract的结合导致了非常缓慢。我实际上什么也没做,只是从图像中提取文本并将其作为JSON响应发送回去

我正在apache2服务器上运行PHP7.3。Pi位于通过LAN连接的本地网络中。我正在用Postman测试API调用

如何提高性能?是硬件问题吗

这是我的处理程序代码

<?php

declare(strict_types=1);

namespace App\Handler;

use Laminas\Diactoros\Response\JsonResponse;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\RequestHandlerInterface;
use thiagoalessio\TesseractOCR\TesseractOCR;

/**
 * HTTP handler that runs Tesseract OCR on the raw request body.
 *
 * The body is written to a temporary file under "<cwd>/data", passed to the
 * thiagoalessio TesseractOCR wrapper using the "deu" language data, and the
 * recognised text is returned as JSON together with per-step timings.
 */
class OcrHandler implements RequestHandlerInterface
{
    /**
     * Extracts text from the image sent as the request body.
     *
     * The JSON response has two keys: "result" (the recognised text, or an
     * empty string when the body is empty) and "measure" (elapsed seconds for
     * the steps "body_parse", "image_write", "image_parsing", "image_delete";
     * empty when the body is empty).
     *
     * @throws \RuntimeException when the temporary image cannot be written
     */
    public function handle(ServerRequestInterface $request): ResponseInterface
    {
        $measure = [];
        $result = '';
        $start = microtime(true);

        // Read the body exactly once. Calling getContents() twice returns only
        // the *remaining* bytes the second time, which is an empty string on a
        // generic PSR-7 stream; the string cast reads from the beginning.
        $contents = (string) $request->getBody();

        if ($contents === '') {
            return new JsonResponse(['result' => $result, 'measure' => $measure]);
        }

        $measure['body_parse'] = microtime(true) - $start;
        $start = microtime(true);

        $imagePath = sprintf('%s/data/%s', getcwd(), $this->GUID());

        if (file_put_contents($imagePath, $contents) === false) {
            throw new \RuntimeException(sprintf('Unable to write temporary image "%s".', $imagePath));
        }

        $measure['image_write'] = microtime(true) - $start;
        $start = microtime(true);

        try {
            $tesseractOcr = new TesseractOCR($imagePath);
            // NOTE(review): presumably makes the wrapper read Tesseract's output
            // directly instead of via temp files - confirm against the library docs.
            $tesseractOcr->withoutTempFiles();
            $result = $tesseractOcr->lang('deu')->run();

            $measure['image_parsing'] = microtime(true) - $start;
        } finally {
            // Always remove the temporary image, even when OCR throws, so that
            // failed requests do not accumulate files in the data directory.
            $start = microtime(true);

            if (is_file($imagePath)) {
                unlink($imagePath);
            }

            $measure['image_delete'] = microtime(true) - $start;
        }

        return new JsonResponse(['result' => $result, 'measure' => $measure]);
    }

    /**
     * Builds a GUID-shaped identifier used as the temporary file name.
     *
     * Uses com_create_guid() when that function exists and returns a string;
     * otherwise formats eight random 16-bit groups, with the fourth group in
     * 0x4000-0x4FFF and the fifth in 0x8000-0xBFFF.
     */
    private function GUID(): string
    {
        if (function_exists('com_create_guid')) {
            $guid = com_create_guid();

            // com_create_guid() may return false; trim(false) would raise a
            // TypeError under strict_types, so fall through to the generator.
            if (is_string($guid)) {
                return trim($guid, '{}');
            }
        }

        // random_int() draws from the system CSPRNG, which makes collisions and
        // guessable file names far less likely than with mt_rand().
        return sprintf(
            '%04X%04X-%04X-%04X-%04X-%04X%04X%04X',
            random_int(0, 65535),
            random_int(0, 65535),
            random_int(0, 65535),
            random_int(16384, 20479),
            random_int(32768, 49151),
            random_int(0, 65535),
            random_int(0, 65535),
            random_int(0, 65535)
        );
    }
}

为什么它在CLI上如此之快,但在PHP中调用它时却如此之慢?是否有任何可能的性能改进?

好的,正如我在编辑中已经提到的,瓶颈似乎是图像解析。更具体地说,瓶颈是库“thiagoalessio/tesseract-ocr-for-php”。下面的代码使用PHP的exec函数而不是该库,只需要5.82秒(相比之下,使用该库需要27.9秒)。这是一个巨大的差异。假设您的计算机上安装了tesseract,则以下代码可以正常工作:

<?php

declare(strict_types=1);

namespace App\Handler;

use Laminas\Diactoros\Response\JsonResponse;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\RequestHandlerInterface;

/**
 * HTTP handler that runs the `tesseract` command-line tool on the raw request body.
 *
 * The body is written to a temporary file under "<cwd>/data", `tesseract` is
 * invoked through exec() with an output base name of "<image>_out", and the
 * text read back from "<image>_out.txt" is returned as JSON together with
 * per-step timings.
 */
class OcrHandler implements RequestHandlerInterface
{
    /**
     * Extracts text from the image sent as the request body.
     *
     * The JSON response has two keys: "result" (the recognised text, or an
     * empty string when the body is empty) and "measure" (elapsed seconds for
     * the steps "body_parse", "image_write", "image_parsing", "image_delete";
     * empty when the body is empty).
     *
     * @throws \RuntimeException when the temporary image cannot be written,
     *                           tesseract exits with a non-zero status, or its
     *                           output file cannot be read
     */
    public function handle(ServerRequestInterface $request): ResponseInterface
    {
        $measure = [];
        $result = '';
        $start = microtime(true);

        // Read the body exactly once. Calling getContents() twice returns only
        // the *remaining* bytes the second time, which is an empty string on a
        // generic PSR-7 stream; the string cast reads from the beginning.
        $contents = (string) $request->getBody();

        if ($contents === '') {
            return new JsonResponse(['result' => $result, 'measure' => $measure]);
        }

        $measure['body_parse'] = microtime(true) - $start;
        $start = microtime(true);

        $imagePath = sprintf('%s/data/%s', getcwd(), $this->GUID());
        $outputPath = $imagePath . '_out';
        $textPath = $outputPath . '.txt';

        if (file_put_contents($imagePath, $contents) === false) {
            throw new \RuntimeException(sprintf('Unable to write temporary image "%s".', $imagePath));
        }

        $measure['image_write'] = microtime(true) - $start;
        $start = microtime(true);

        try {
            // Quote both paths: the working directory may contain spaces or
            // shell metacharacters that would otherwise break the command.
            $command = sprintf(
                'tesseract %s %s 2>&1',
                escapeshellarg($imagePath),
                escapeshellarg($outputPath)
            );

            $output = [];
            $exitCode = 0;
            exec($command, $output, $exitCode);

            if ($exitCode !== 0) {
                throw new \RuntimeException(
                    sprintf('tesseract exited with status %d: %s', $exitCode, implode("\n", $output))
                );
            }

            $text = file_get_contents($textPath);

            if ($text === false) {
                throw new \RuntimeException(sprintf('Unable to read tesseract output "%s".', $textPath));
            }

            $result = $text;
            $measure['image_parsing'] = microtime(true) - $start;
        } finally {
            // Always remove the temporary files, even when OCR fails, so that
            // failed requests do not accumulate files in the data directory.
            $start = microtime(true);

            foreach ([$imagePath, $textPath] as $path) {
                if (is_file($path)) {
                    unlink($path);
                }
            }

            $measure['image_delete'] = microtime(true) - $start;
        }

        return new JsonResponse(['result' => $result, 'measure' => $measure]);
    }

    /**
     * Builds a GUID-shaped identifier used as the temporary file name.
     *
     * Uses com_create_guid() when that function exists and returns a string;
     * otherwise formats eight random 16-bit groups, with the fourth group in
     * 0x4000-0x4FFF and the fifth in 0x8000-0xBFFF.
     */
    private function GUID(): string
    {
        if (function_exists('com_create_guid')) {
            $guid = com_create_guid();

            // com_create_guid() may return false; trim(false) would raise a
            // TypeError under strict_types, so fall through to the generator.
            if (is_string($guid)) {
                return trim($guid, '{}');
            }
        }

        // random_int() draws from the system CSPRNG, which makes collisions and
        // guessable file names far less likely than with mt_rand().
        return sprintf(
            '%04X%04X-%04X-%04X-%04X-%04X%04X%04X',
            random_int(0, 65535),
            random_int(0, 65535),
            random_int(0, 65535),
            random_int(16384, 20479),
            random_int(32768, 49151),
            random_int(0, 65535),
            random_int(0, 65535),
            random_int(0, 65535)
        );
    }
}

为什么不把这项工作从用户请求中剥离出来,改用调度程序(cron)每隔一段时间(比如1分钟)抓取数据呢?您是否立即需要最新的数据?还是即时数据的交付更重要?这一切都取决于你的规格。@Jaquarh 不幸的是,我需要尽快得到结果。API是从另一个web应用程序调用的,该应用程序允许客户上传发票,并自动识别其中的文本(总额、税额、税率等)。如果服务器成本太高,您的想法可能是一个很好的解决方案。
<?php

declare(strict_types=1);

namespace App\Handler;

use Laminas\Diactoros\Response\JsonResponse;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\RequestHandlerInterface;

/**
 * HTTP handler that runs the `tesseract` command-line tool on the raw request body.
 *
 * The body is written to a temporary file under "<cwd>/data", `tesseract` is
 * invoked through exec() with an output base name of "<image>_out", and the
 * text read back from "<image>_out.txt" is returned as JSON together with
 * per-step timings.
 */
class OcrHandler implements RequestHandlerInterface
{
    /**
     * Extracts text from the image sent as the request body.
     *
     * The JSON response has two keys: "result" (the recognised text, or an
     * empty string when the body is empty) and "measure" (elapsed seconds for
     * the steps "body_parse", "image_write", "image_parsing", "image_delete";
     * empty when the body is empty).
     *
     * @throws \RuntimeException when the temporary image cannot be written,
     *                           tesseract exits with a non-zero status, or its
     *                           output file cannot be read
     */
    public function handle(ServerRequestInterface $request): ResponseInterface
    {
        $measure = [];
        $result = '';
        $start = microtime(true);

        // Read the body exactly once. Calling getContents() twice returns only
        // the *remaining* bytes the second time, which is an empty string on a
        // generic PSR-7 stream; the string cast reads from the beginning.
        $contents = (string) $request->getBody();

        if ($contents === '') {
            return new JsonResponse(['result' => $result, 'measure' => $measure]);
        }

        $measure['body_parse'] = microtime(true) - $start;
        $start = microtime(true);

        $imagePath = sprintf('%s/data/%s', getcwd(), $this->GUID());
        $outputPath = $imagePath . '_out';
        $textPath = $outputPath . '.txt';

        if (file_put_contents($imagePath, $contents) === false) {
            throw new \RuntimeException(sprintf('Unable to write temporary image "%s".', $imagePath));
        }

        $measure['image_write'] = microtime(true) - $start;
        $start = microtime(true);

        try {
            // Quote both paths: the working directory may contain spaces or
            // shell metacharacters that would otherwise break the command.
            $command = sprintf(
                'tesseract %s %s 2>&1',
                escapeshellarg($imagePath),
                escapeshellarg($outputPath)
            );

            $output = [];
            $exitCode = 0;
            exec($command, $output, $exitCode);

            if ($exitCode !== 0) {
                throw new \RuntimeException(
                    sprintf('tesseract exited with status %d: %s', $exitCode, implode("\n", $output))
                );
            }

            $text = file_get_contents($textPath);

            if ($text === false) {
                throw new \RuntimeException(sprintf('Unable to read tesseract output "%s".', $textPath));
            }

            $result = $text;
            $measure['image_parsing'] = microtime(true) - $start;
        } finally {
            // Always remove the temporary files, even when OCR fails, so that
            // failed requests do not accumulate files in the data directory.
            $start = microtime(true);

            foreach ([$imagePath, $textPath] as $path) {
                if (is_file($path)) {
                    unlink($path);
                }
            }

            $measure['image_delete'] = microtime(true) - $start;
        }

        return new JsonResponse(['result' => $result, 'measure' => $measure]);
    }

    /**
     * Builds a GUID-shaped identifier used as the temporary file name.
     *
     * Uses com_create_guid() when that function exists and returns a string;
     * otherwise formats eight random 16-bit groups, with the fourth group in
     * 0x4000-0x4FFF and the fifth in 0x8000-0xBFFF.
     */
    private function GUID(): string
    {
        if (function_exists('com_create_guid')) {
            $guid = com_create_guid();

            // com_create_guid() may return false; trim(false) would raise a
            // TypeError under strict_types, so fall through to the generator.
            if (is_string($guid)) {
                return trim($guid, '{}');
            }
        }

        // random_int() draws from the system CSPRNG, which makes collisions and
        // guessable file names far less likely than with mt_rand().
        return sprintf(
            '%04X%04X-%04X-%04X-%04X-%04X%04X%04X',
            random_int(0, 65535),
            random_int(0, 65535),
            random_int(0, 65535),
            random_int(16384, 20479),
            random_int(32768, 49151),
            random_int(0, 65535),
            random_int(0, 65535),
            random_int(0, 65535)
        );
    }
}