JavaScript 获取PDF格式的tesseract.js结果时出现问题

JavaScript 获取PDF格式的tesseract.js结果时出现问题,javascript,node.js,express,multer,tesseract.js,Javascript,Node.js,Express,Multer,Tesseract.js,我正在使用tesseract.js构建一个简单的JavaScript OCR(光学字符识别)应用程序,并将{ tessjs_create_pdf: "1" }传递给.recognize()方法,以获得PDF格式的结果,但它不起作用。请问有人能告诉我哪里出了问题吗? const express = require('express'); const app = express(); const fs = require('fs'); const multer = require

我正在使用tesseract.js构建一个简单的JavaScript OCR(光学字符识别)应用程序,并将{ tessjs_create_pdf: "1" }传递给.recognize()方法,以获得PDF格式的结果,但它不起作用。请问有人能告诉我哪里出了问题吗?

const express  = require('express');
const app      = express();
const fs       = require('fs');
const multer   = require('multer');
const { createWorker } = require("tesseract.js");
const worker           = createWorker();

// Render views with the EJS template engine.
app.set("view engine", "ejs");

// Disk storage for multer: files are placed in ./uploads and keep the
// file name supplied by the client.
const storage = multer.diskStorage({
    destination(req, file, done) {
        done(null, './uploads');
    },
    filename(req, file, done) {
        done(null, file.originalname);
    },
});

// Upload middleware for the single-file form field named "avatar".
const upload = multer({ storage }).single('avatar');

// GET / — render the "index" view.
app.get('/', function (req, res) {
    res.render('index');
});

// POST /upload — OCR the uploaded image and reply with the recognized text.
// Replies 400 when the upload fails or carries no file, and 500 when the
// stored file cannot be read or recognition throws.
app.post('/upload', (req, res) => {
    upload(req, res, uploadErr => {
        // multer reports upload failures through this callback; check it
        // before touching req.file, which is absent on a failed upload.
        if (uploadErr) {
            console.log('this is your error', uploadErr);
            return res.status(400).send('Upload failed');
        }
        if (!req.file) {
            return res.status(400).send('No file uploaded');
        }

        // req.file.path is the location multer's disk storage wrote to.
        fs.readFile(req.file.path, (err, data) => {
            if (err) {
                console.log('this is your error', err);
                // Answer the client instead of leaving the request hanging.
                return res.status(500).send('Could not read the uploaded file');
            }

            (async () => {
                // A worker cannot be reused once terminate() has run, so each
                // request gets its own worker rather than a shared one.
                const worker = createWorker();
                try {
                    await worker.load();
                    await worker.loadLanguage('eng');
                    await worker.initialize('eng');
                    // NOTE(review): the options object is passed exactly as
                    // before; only `text` is read from the result.
                    const { data: { text } } = await worker.recognize(data, { tessjs_create_pdf: "1"});
                    res.send(text);
                } catch (ocrErr) {
                    console.log('this is your error', ocrErr);
                    if (!res.headersSent) {
                        res.status(500).send('OCR failed');
                    }
                } finally {
                    // Always release the worker, even when recognition failed.
                    await worker.terminate();
                }
            })().catch(cleanupErr => console.log('this is your error', cleanupErr));
        });
    });
});

// Prefer the port supplied by the environment and fall back to 3000.
// The operands were previously reversed (`3000 || process.env.PORT`), which
// always produced 3000 because 3000 is truthy.
const port = process.env.PORT || 3000;
app.listen(port, () => {
     console.log("server has started!!!!");
});
你可以使用 PDFKit(pdfkit 包)把识别结果生成为PDF:

const express = require('express');
const app = express();
const fs = require('fs');
const multer = require('multer');
const { createWorker } = require('tesseract.js');
const worker = createWorker({
  logger: m => console.log(m)
});
const cors = require('cors');
const PDFDocument = require('pdfkit');
// Create the PDF document. This runs once, when the module is loaded.
// NOTE(review): a document created here can be ended only once, so it can
// serve at most one upload per process — confirm whether a per-request
// document is wanted instead.
const doc = new PDFDocument();
// Stream the document's bytes into tesseract.js-ocr-result.pdf. The path is
// relative, so the file is created in the process's working directory.
doc.pipe(fs.createWriteStream('tesseract.js-ocr-result.pdf'));
// Enable CORS using the package defaults.
app.use(cors());

// Body parsers for JSON and URL-encoded payloads, each capped at 50mb.
const bodyParser = require('body-parser');
app.use(bodyParser.json({ limit: '50mb' }));
app.use(bodyParser.urlencoded({ extended: true, limit: '50mb', parameterLimit: 1000000 }));

// Disk storage for multer: files are placed in the images directory next to
// this script and keep the file name supplied by the client.
const Storage = multer.diskStorage({
  destination(req, file, callback) {
    callback(null, `${__dirname}/images`);
  },
  filename(req, file, callback) {
    callback(null, file.originalname);
  },
});

// Upload middleware for the single-file form field named "avatar".
const upload = multer({ storage: Storage }).single('avatar');
// POST /upload — OCR the uploaded image, write a two-page PDF (the image,
// then the recognized text) and redirect the client to /download.
// Replies 400 when the upload fails or carries no file, and 500 when the
// stored file cannot be read or OCR/PDF generation throws.
app.post('/upload', (req, res) => {
  upload(req, res, uploadErr => {
    console.log('Request ---', req.body);
    console.log('Request file ---', req.file);

    // multer reports upload failures through this callback; check it before
    // touching req.file, which is absent on a failed upload.
    if (uploadErr) {
      console.log(uploadErr);
      return res.status(400).send('Upload failed');
    }
    if (!req.file) {
      return res.status(400).send('No file uploaded');
    }

    // req.file.path is the location multer's disk storage wrote to, so the
    // read no longer depends on the process's working directory.
    fs.readFile(req.file.path, (err, image) => {
      if (err) {
        console.log(err);
        // Stop here: continuing would hand `undefined` to the OCR worker.
        return res.status(500).send('Could not read the uploaded file');
      }

      (async () => {
        // Per-request worker and document: a terminated worker and an ended
        // PDF document cannot be reused by a later request.
        const worker = createWorker({
          logger: m => console.log(m)
        });
        try {
          await worker.load();
          await worker.loadLanguage('eng');
          await worker.initialize('eng');
          const {
            data: { text }
          } = await worker.recognize(image);

          const doc = new PDFDocument();
          // Written next to this script so the /download route, which
          // resolves the path from __dirname, finds the file.
          const output = fs.createWriteStream(`${__dirname}/tesseract.js-ocr-result.pdf`);
          const written = new Promise((resolve, reject) => {
            output.on('finish', resolve);
            output.on('error', reject);
          });
          doc.pipe(output);

          doc.image(image, {
            fit: [250, 300],
            align: 'center',
            valign: 'center'
          });
          doc
            .addPage()
            .fontSize(25)
            .text(text);
          doc.end();

          // Respond only once the PDF is fully flushed to disk.
          await written;
          res.redirect('/download');
        } catch (ocrErr) {
          console.log(ocrErr);
          if (!res.headersSent) {
            res.status(500).send('OCR failed');
          }
        } finally {
          // Always release the worker, even when recognition failed.
          await worker.terminate();
        }
      })().catch(cleanupErr => console.log(cleanupErr));
    });
  });
});
// GET /download — send the generated PDF, located next to this script,
// to the client as a download.
app.get('/download', function (req, res) {
  res.download(`${__dirname}/tesseract.js-ocr-result.pdf`);
});
// Start the HTTP server on port 5000 and log once it is listening.
app.listen(5000, function () {
  console.log('server Started');
});
您必须使用getPDF()函数来生成PDF文件(适用于tesseract.js 2.1.4)。在识别文本之后、终止worker之前添加以下代码,并使用fs的writeFileSync将文件写入磁盘:

// Ask the worker for a PDF of the page it just recognized and save it.
// The file is written next to this script (__dirname) rather than in the
// working directory, so a download route that resolves the path from
// __dirname finds it no matter where the server was started from.
const { data } = await worker.getPDF("Tesseract OCR Result");
fs.writeFileSync(`${__dirname}/tesseract-ocr-result.pdf`, Buffer.from(data));
如果要下载生成的PDF文件,请使用以下代码将用户重定向到另一个路径:

    // Send the client on to the route that serves the PDF.
    res.redirect("/download");
…并将以下代码添加到您的路由中。文件保存在项目根目录中,因此可以使用__dirname作为路径:

// GET /download — send the generated PDF to the client as a download.
// The file name must match the one passed to fs.writeFileSync
// ("tesseract-ocr-result.pdf"); the earlier "tesseract.js-ocr-result.pdf"
// pointed at a file that snippet never creates.
app.get("/download", (req, res) => {
    const file = `${__dirname}/tesseract-ocr-result.pdf`;
    res.download(file);
});
您的最终代码将如下所示。为了避免与getPDF()返回的data冲突,我把fs.readFile回调的参数data重命名为了img(见以下代码中的fs.readFile一行):

// POST /upload — OCR the uploaded image, save the result as a PDF and
// redirect the client to /download.
// Replies 400 when the upload fails or carries no file, and 500 when the
// stored file cannot be read or OCR/PDF generation throws.
app.post('/upload', (req, res) => {
    upload(req, res, uploadErr => {
        // multer reports upload failures through this callback; check it
        // before touching req.file, which is absent on a failed upload.
        if (uploadErr) {
            console.log('this is your error', uploadErr);
            return res.status(400).send('Upload failed');
        }
        if (!req.file) {
            return res.status(400).send('No file uploaded');
        }

        // The buffer is called `img` so that it does not clash with the
        // `data` destructured from getPDF() below.
        fs.readFile(req.file.path, (err, img) => {
            if (err) {
                console.log('this is your error', err);
                // Answer the client instead of leaving the request hanging.
                return res.status(500).send('Could not read the uploaded file');
            }

            (async () => {
                // A worker cannot be reused once terminate() has run, so each
                // request gets its own worker rather than a shared one.
                const worker = createWorker();
                try {
                    await worker.load();
                    await worker.loadLanguage('eng');
                    await worker.initialize('eng');
                    await worker.recognize(img);

                    const { data } = await worker.getPDF("Tesseract OCR Result");
                    // Written next to this script so the /download route,
                    // which resolves the path from __dirname, finds the file.
                    fs.writeFileSync(`${__dirname}/tesseract-ocr-result.pdf`, Buffer.from(data));
                    res.redirect("/download");
                } catch (ocrErr) {
                    console.log('this is your error', ocrErr);
                    if (!res.headersSent) {
                        res.status(500).send('OCR failed');
                    }
                } finally {
                    // Always release the worker, even when recognition failed.
                    await worker.terminate();
                }
            })().catch(cleanupErr => console.log('this is your error', cleanupErr));
        });
    });
});

// GET /download — send the PDF stored next to this script to the client
// as a download.
app.get("/download", function (req, res) {
    res.download(`${__dirname}/tesseract-ocr-result.pdf`);
});