与需要文件作为输入/输出的外部程序之间进行流式传输(JavaScript / Node.js)
问题: 我需要上传数百个PDF文档,将它们转换为HTML,然后将HTML存储在MongoDB中。我目前把传入的PDF文档和转换后的HTML都保存在文件系统中。有没有一种方法可以使用流来避免所有的文件I/O? 当前方法(有效但速度较慢): 我正在使用:与需要文件作为输入/输出的外部程序之间进行流式传输(JavaScript / Node.js),javascript,node.js,mongodb,stream,child-process,Javascript,Node.js,Mongodb,Stream,Child Process,问题: 我需要上传数百个PDF文档,将它们转换为HTML,然后将HTML存储在MongoDB中。我目前把传入的PDF文档和转换后的HTML都保存在文件系统中。有没有一种方法可以使用流来避免所有的文件I/O? 当前方法(有效但速度较慢): 我正在使用: 用 Busboy 读取上传的PDF文件,并将其保存到文件系统;然后在 node.js 中创建一个 exec 子进程,该子进程调用 'pdftohtml -c -s -noframes -nodrm ' + inputFileNamePDF + ' ' + outputFileNameHTML。
var path = require("path"),
Busboy = require('busboy')
http = require('http'),
util = require('util'),
fs = require('fs-extra'),
pdftohtml = require('pdftohtmljs'),
exec =require('child_process').exec,
pdf_extract = require('pdf-extract'),
/**
 * Express-style handler for a multipart file upload.
 *
 * Sets up a Busboy parser over the request headers and the bookkeeping
 * counters used to decide when every uploaded part has been handled.
 *
 * @param {object} req - incoming request; its headers are handed to Busboy.
 * @param {object} res - response; `res.locals.user._doc.email` is read to
 *   pick the per-user upload directory.
 */
exports.postUpload = function (req, res) {
// Parse a file upload.
// NOTE(review): this outer `fileName` is shadowed by the locals declared in
// the nested helpers and is not read anywhere in the visible code.
var fileName = "";
// Per-user directory under /tmp: the e-mail address with every '@' and '.'
// replaced by '_'. NOTE(review): nothing visible here creates the directory
// or strips other special characters (e.g. '/') from the address - confirm.
var uploadDir = '/tmp/' + res.locals.user._doc.email.replace(/[@\.]/g,"_");
// infiles  - number of 'file' events seen so far
// outfiles - number of files whose save callback has fired
// done     - set once Busboy reports the form is fully parsed
var infiles = 0, outfiles = 0, done = false,
busboy = new Busboy({ headers: req.headers });
console.log('Start parsing form ...');
// Sends the response and starts the conversion once the form has been fully
// parsed AND every file seen so far has been handed back by onFile().
//
// It is called from both the per-file callback and the 'finish' handler:
// the two events can arrive in either order, and a form without any file
// part never triggers the per-file callback at all. Checking in only one
// place would leave such requests without a response.
function respondIfComplete() {
  if (!done || infiles !== outfiles) {
    return;
  }
  // ACTUAL EXIT CONDITION
  console.log('All parts written to disk');
  res.writeHead(200, { 'Connection': 'close' });
  res.end("That's all folks!");
  convertToHTMLTxt();
}

busboy.on('file', function (fieldname, file, filename) {
  ++infiles;
  // Remember this file's own ordinal; `infiles` keeps growing while earlier
  // files are still being written, so it cannot be used in the callback.
  var fileIndex = infiles;
  console.log("file event #" + fileIndex);
  onFile(fieldname, file, filename, function () {
    ++outfiles;
    console.log("file #" + fileIndex + " written.");
    if (done) console.log(outfiles + '/' + infiles + ' parts written to disk');
    respondIfComplete();
  });
});

busboy.on('finish', function () {
  console.log('Done parsing form!');
  done = true;
  respondIfComplete();
});

// Start feeding the request body into the multipart parser.
req.pipe(busboy);
/**
 * Saves one uploaded file stream into `uploadDir` and reports back exactly
 * once when the stream has been dealt with.
 *
 * The client-supplied name is reduced to letters, digits, '_' and '-' (every
 * other character becomes '_'), after which a trailing "_pdf", "_docx" or
 * "_doc" is turned back into a real extension.
 *
 * @param {string} fieldname - form field name, used only for logging.
 * @param {object} file - readable stream with the file contents.
 * @param {string} filename - file name as sent by the client (may be empty).
 * @param {function} next - called once when the file has been written,
 *   skipped (no usable name) or failed; on failure the error is passed as
 *   the first argument.
 */
function onFile(fieldname, file, filename, next) {
  // or save at some other location
  var fileName = String(filename || '')
    .replace(/[^a-z0-9_\-]/gi, "_")
    .replace(/_(pdf|docx|doc)$/i, ".$1");
  var finished = false;
  var failed = false;

  // 'error' and 'close' can both fire for the same stream; report only once.
  function finish(err) {
    if (finished) {
      return;
    }
    finished = true;
    next(err);
  }

  if (!fileName) {
    // An empty name would resolve to the upload directory itself. Drain the
    // part so the multipart parser can continue, and report it as handled.
    console.log(fieldname + ' has no file name, skipping');
    file.on('end', function () {
      finish();
    });
    file.resume();
    return;
  }

  var fstream = fs.createWriteStream(path.join(uploadDir, fileName));
  file.on('end', function () {
    console.log(fieldname + '(' + fileName + ') EOF');
  });
  fstream.on('error', function (err) {
    // Without a listener a write failure (missing directory, disk full, ...)
    // is thrown as an uncaught exception. Keep draining the upload so the
    // rest of the form is still parsed.
    failed = true;
    console.log(fieldname + '(' + fileName + ') write error: ', err);
    file.unpipe(fstream);
    file.resume();
    finish(err);
  });
  fstream.on('close', function () {
    if (!failed) {
      console.log(fieldname + '(' + fileName + ') written to disk');
    }
    finish();
  });
  console.log(fieldname + '(' + fileName + ') start saving');
  file.pipe(fstream);
}
// Starts the conversion step for the current upload directory; the visible
// body logs the directory and then lists it to convert the PDFs it contains.
function convertToHTMLTxt () {
// NOTE(review): only execPDF is assigned in the visible code; execTxt and
// execHTML are not referenced there.
var execTxt, execHTML, execPDF;
// NOTE(review): textDir is not referenced in the visible code.
var textDir = 'text';
// Sub-directory of uploadDir that receives the generated HTML files.
var htmlDir = 'html';
console.log('Directory: ', uploadDir);
// List the upload directory and convert every entry whose name ends in "pdf"
// to HTML with the external `pdftohtml` tool. Conversions are started for all
// matching files without waiting for one another.
fs.readdir(uploadDir, function (err, files) {
  if (err) {
    console.log('error reading directory: ', uploadDir, err);
    return;
  }
  // execFile passes each argument to the program verbatim, without a shell.
  // The paths are derived from the user's e-mail address and uploaded file
  // names, so they must never be spliced into a shell command line.
  var execFile = require('child_process').execFile;

  files.forEach(function (fileName) {
    if (!fileName.match(/pdf$/i)) {
      return;
    }
    var fileNamePDF = path.join(uploadDir, fileName);
    var fileNameHTML = path.join(uploadDir, htmlDir,
      fileName.replace(/(pdf|docx|doc)$/i, "html"));
    var child = execFile('pdftohtml',
      ['-c', '-s', '-noframes', '-nodrm', fileNamePDF, fileNameHTML],
      function (error, stdout, stderr) {
        console.log('stdout: ', stdout);
        console.log('stderr: ', stderr);
        if (error !== null) {
          console.log('exec error: ', error);
        }
      });
    child.on('close', function (code) {
      console.log('******** PDF to HTML Conversion complete - exit code '
        + code);
    });
  });
});
转换完成后,我遍历所有HTML文件,并执行MongoDB批量 upsert(存在则更新,不存在则插入):
fs.readFile(fileNameHTML, 'utf8', function (err, HTMLData) {
if (err) {
console.log('error reading file: ', fileNameHTML + '/nerror: ' + err);
callback(err);
return;
}
bulk.find({ userName: userName,
docName : fileName
}).upsert()
.updateOne({userName: userName,
docName : fileName,
HTMLData : HTMLData});