Javascript 读取一个大文本文件的n行_Javascript_Html_File_Io_Bigdata

Javascript 读取一个大文本文件的n行

javascript html file io

Javascript 读取一个大文本文件的n行,javascript,html,file,io,bigdata,Javascript,Html,File,Io,Bigdata,我拥有的最小文件有>850k行，并且每行的长度未知。目标是在浏览器中从此文件中读取n行。完全阅读它是不可能的这是我的HTML和JS： var n = 10; var reader = new FileReader(); reader.onload = function(progressEvent) { // Entire file console.log(this.result); // By lines var lines = this.result.split('\n')

我拥有的最小文件有>850k行，并且每行的长度未知。目标是在浏览器中从此文件中读取 n 行。把整个文件完全读入是不可行的。

这是我的HTML

和JS：

var n = 10;
var reader = new FileReader();
reader.onload = function(progressEvent) {
  // At this point the whole file has already been loaded into memory.
  var text = this.result;
  console.log(text);

  // Break the content into lines and print the first n entries.
  var lines = text.split('\n');
  var index = 0;
  while (index < n) {
    console.log(lines[index]);
    index += 1;
  }
};

编辑：

方法是这样的，但我不知道如何修改它来读取文件的 n 行。

同样通过使用 Uint8Array，您可以从那里执行以下操作：

// Wrap the bytes that were just read, decode them as UTF-8 and print the text.
var view = new Uint8Array(fr.result);
var decoder = new TextDecoder("utf-8");
var string = decoder.decode(view);
console.log("Chunk " + string);

但是，这可能不会把最后一行作为一个整体来阅读，那么您以后将如何确定这些行呢？例如，下面是它打印的内容：

((7202), (u'11330875493', u'2554375661'))
((1667), (u'9079074735', u'6883914476',

逻辑与我在回答中所写的非常相似，只是您需要跟踪到目前为止处理的行数（以及到目前为止读取的最后一行，因为它可能还没有结束）。下一个示例适用于与UTF-8兼容的任何编码；如果需要另一种编码，请查看 TextDecoder 构造函数的选项。

如果您确定输入为ASCII（或任何其他单字节编码），则也可以跳过 TextDecoder 的使用，直接使用 FileReader 的 readAsText 方法将输入读取为文本。

// This is just an example of how to use the function below.
document.getElementById('start').onclick = function() {
    const file = document.getElementById('infile').files[0];
    if (!file) {
        console.log('No file selected.');
        return;
    }
    // An empty or non-numeric input parses to NaN, which readSomeLines never
    // reaches as a line count, so the whole file is read in that case.
    const maxlines = parseInt(document.getElementById('maxlines').value, 10);
    let lineno = 1;
    // readSomeLines is defined below.
    readSomeLines(file, maxlines, function(line) {
        console.log("Line: " + (lineno++) + ": " + line);
    }, function onComplete() {
        console.log('Read all lines');
    });
};
/**
 * Read up to and including |maxlines| lines from |file|.
 *
 * The file is read in fixed-size chunks, so memory use stays bounded no
 * matter how large the file is. Each reported line keeps its trailing '\n';
 * a final line that is not newline-terminated is reported without one.
 *
 * @param {Blob} file - The file to be read.
 * @param {integer} maxlines - The maximum number of lines to read.
 * @param {function(string)} forEachLine - Called for each line.
 * @param {function(error)} onComplete - Called when the end of the file
 *     is reached or when |maxlines| lines have been read. Receives the
 *     FileReader error if a read failed, otherwise no argument.
 */
function readSomeLines(file, maxlines, forEachLine, onComplete) {
    const CHUNK_SIZE = 50000; // 50kb, arbitrarily chosen.
    const decoder = new TextDecoder();
    const fr = new FileReader();
    let offset = 0;
    let linecount = 0;
    // Text decoded so far that does not yet end in a newline.
    let results = '';

    fr.onload = function() {
        // Use stream:true in case we cut the file
        // in the middle of a multi-byte character.
        results += decoder.decode(fr.result, {stream: true});
        const lines = results.split('\n');
        results = lines.pop(); // In case the line did not end yet.
        linecount += lines.length;

        if (linecount > maxlines) {
            // Read too many lines? Truncate the results.
            lines.length -= linecount - maxlines;
            linecount = maxlines;
        }

        for (let i = 0; i < lines.length; ++i) {
            forEachLine(lines[i] + '\n');
        }
        offset += CHUNK_SIZE;
        seek();
    };
    fr.onerror = function() {
        onComplete(fr.error);
    };
    seek();

    // Reads the next chunk, or finishes when enough lines were delivered or
    // the file is exhausted.
    function seek() {
        if (linecount === maxlines) {
            // We found enough lines.
            onComplete(); // Done.
            return;
        }
        // offset !== 0 guarantees at least one read, even for an empty file.
        if (offset !== 0 && offset >= file.size) {
            // We did not find all lines, but there are no more lines.
            // Flush any bytes the streaming decoder is still holding back.
            results += decoder.decode();
            if (results !== '') {
                forEachLine(results); // This is from lines.pop(), before.
            }
            onComplete(); // Done.
            return;
        }
        const slice = file.slice(offset, offset + CHUNK_SIZE);
        fr.readAsArrayBuffer(slice);
    }
}

从中读取行
.

流（Streams）才是解决这个问题的功能。
whatwg团队正在研究关于可写+可读流的最后一个通量，很快就准备好了。但在那之前，有一个你可以使用的方法。他们正在研究一种从blob中获取可读流的方法。但我也创造了一种方法，通过以下方式以流媒体方式获取blob：

昨天，我还创建了一个简单的按行拆分工具来处理web流

所以这可能非常简单，如下所示：

// Simulate a file
var csv =
`apple,1,$1.00
banana,4,$0.20
orange,3,$0.79`

var file = new Blob([csv])

var n = 0
var controller
var decoder = new TextDecoder
var stdout = new WritableStream({
  start (c) {
    // Keep the controller so write() can abort the stream later.
    controller = c
  },
  write (chunk, a) {
    // Calling controller.error will also put the byLine stream in an errored state,
    // causing the file stream to stop reading more data as well.
    if (n == 1) controller.error("don't need more lines")
    chunk = decoder.decode(chunk)
    console.log(`chunk[${n++}]: ${chunk}`)
  }
})

// byLine() is the line-splitting transform stream from the helper mentioned above;
// it is not defined in this snippet.
file
  .stream()
  .pipeThrough(byLine())
  // .pipeThrough(new TextDecoder) something like this will eventually work
  .pipeTo(stdout)

我需要在浏览器中读取250MB utf-8编码文件。我的解决方案是编写类似C#的TextReader类，它为我提供了类似于异步流的行为

文本阅读器类：

/**
 * Pull-based line reader for a UTF-8 encoded Blob/File.
 *
 * The blob is read in CHUNK_SIZE slices. Bytes after the last line feed of a
 * chunk are kept in `byteBuffer` until the rest of that line arrives, and the
 * complete lines of a chunk are cached in `lines` and handed out one at a
 * time by `readLine()`.
 *
 * Only one read may be in flight at a time, because a single FileReader is
 * shared: always await `readLine()` before calling it again.
 */
class TextReader {
    // Number of bytes requested from the blob per read.
    CHUNK_SIZE = 8192000; // I FOUND THIS TO BE BEST FOR MY NEEDS, CAN BE ADJUSTED
    // Byte offset of the next unread byte in the blob.
    position = 0;
    // Total size of the blob in bytes (set in the constructor).
    length = 0;

    // Bytes already read that do not yet form a complete line.
    byteBuffer = new Uint8Array(0);

    // Complete lines from the latest chunk, excluding the one already returned.
    lines = [];
    lineCount = 0;
    // Index into `lines` of the next line to hand out.
    lineIndexTracker = 0;

    fileReader = new FileReader();
    textDecoder = new TextDecoder(`utf-8`);

    /** True when every cached line has been returned by readLine(). */
    get allCachedLinesAreDispatched() {
        return !(this.lineIndexTracker < this.lineCount);
    }

    /** True when every byte of the blob has been read. */
    get blobIsReadInFull() {
        return !(this.position < this.length);
    }

    /** True when no partial-line bytes are pending. */
    get bufferIsEmpty() {
        return this.byteBuffer.length === 0;
    }

    /** True when readLine() has nothing left to return. */
    get endOfStream() {
        return this.blobIsReadInFull && this.allCachedLinesAreDispatched && this.bufferIsEmpty;
    }

    /**
     * @param {Blob} blob - The blob (or File) to read lines from.
     */
    constructor(blob) {
        this.blob = blob;
        this.length = blob.size;
    }

    /**
     * Reads a blob into an ArrayBuffer using the shared FileReader.
     *
     * @param {Blob} blob - The blob (usually a slice) to read.
     * @returns {Promise<ArrayBuffer>} The blob's bytes; rejects with the
     *     FileReader's error if the read fails.
     */
    blob2arrayBuffer(blob) {
        return new Promise((resolve, reject) => {
            // Reject with the actual error rather than the bare event object.
            this.fileReader.onerror = () => {
                reject(this.fileReader.error ?? new Error(`Failed to read blob`));
            };
            this.fileReader.onload = () => {
                resolve(this.fileReader.result);
            };

            this.fileReader.readAsArrayBuffer(blob);
        });
    }

    /**
     * Reads up to `count` bytes of the blob starting at `offset`.
     *
     * @param {number} offset - Zero-based byte offset to start at.
     * @param {number} count - Maximum number of bytes to read.
     * @returns {Promise<ArrayBuffer>} The bytes read; an empty buffer when
     *     the arguments are not valid integers or lie outside the blob.
     */
    async read(offset, count) {
        if (!Number.isInteger(offset) || !Number.isInteger(count) || count < 1 || offset < 0 || offset > this.length - 1) {
            return new ArrayBuffer(0);
        }

        // Clamp the end of the slice to the end of the blob.
        const endIndex = Math.min(offset + count, this.length);

        return this.blob2arrayBuffer(this.blob.slice(offset, endIndex));
    }

    /**
     * Returns the next line of the blob.
     *
     * @returns {Promise<string|null>} The next line including its trailing
     *     `\n`; the final line without `\n` if the blob does not end in one;
     *     `null` once everything has been returned. Rejects if reading the
     *     blob fails.
     */
    async readLine() {
        // Serve from the cache first; no I/O needed.
        if (!this.allCachedLinesAreDispatched) {
            return this.lines[this.lineIndexTracker++] + `\n`;
        }

        while (!this.blobIsReadInFull) {
            const arrayBuffer = await this.read(this.position, this.CHUNK_SIZE);

            // read() only comes back empty here when CHUNK_SIZE is invalid;
            // without this guard the loop would never advance.
            if (arrayBuffer.byteLength === 0) {
                throw new RangeError(`CHUNK_SIZE must be a positive integer`);
            }

            this.position += arrayBuffer.byteLength;

            // Append the new chunk to the bytes left over from the previous one.
            const merged = new Uint8Array(this.byteBuffer.length + arrayBuffer.byteLength);
            merged.set(this.byteBuffer);
            merged.set(new Uint8Array(arrayBuffer), this.byteBuffer.length);
            this.byteBuffer = merged;

            const lastIndexOfLineFeedCharacter = this.byteBuffer.lastIndexOf(10); // LINE FEED CHARACTER (\n) IS ONE BYTE LONG IN UTF-8 AND IS 10 IN ITS DECIMAL FORM

            if (lastIndexOfLineFeedCharacter > -1) {
                // Decode only the complete lines (up to, not including, the last
                // line feed). The tail may end in the middle of a multi-byte
                // character, so it stays in byteBuffer as raw bytes.
                const completeBytes = this.byteBuffer.subarray(0, lastIndexOfLineFeedCharacter);
                const lines = this.textDecoder.decode(completeBytes).split(`\n`);
                this.byteBuffer = this.byteBuffer.slice(lastIndexOfLineFeedCharacter + 1);

                // Return the first line now and cache the rest for later calls.
                this.lines = lines.slice(1);
                this.lineCount = this.lines.length;
                this.lineIndexTracker = 0;

                return lines[0] + `\n`;
            }
        }

        // The blob is exhausted: whatever is buffered is the last, unterminated line.
        if (!this.bufferIsEmpty) {
            const line = this.textDecoder.decode(this.byteBuffer);
            this.byteBuffer = new Uint8Array(0);
            return line;
        }

        return null;
    }
}

// Example usage: read the selected file line by line when "read" is clicked.
document.getElementById("read").onclick = async () => {
    const file = document.getElementById("fileInput").files[0];

    // Without a selected file there is nothing to construct a TextReader from.
    if (!file) {
        console.log("No file selected");
        return;
    }

    const textReader = new TextReader(file);

    try {
        // Option 1: loop until readLine() reports the end with null.
        while (true) {
            const line = await textReader.readLine();
            if (line === null) break;
            // PROCESS LINE
        }

        // OR

        // Option 2: loop on endOfStream. Use one option or the other; after
        // option 1 has drained the reader this loop does not execute.
        while (!textReader.endOfStream) {
            const line = await textReader.readLine();
            // PROCESS LINE
        }
    } catch (error) {
        console.error("Failed to read the selected file", error);
    }
};

性能：

/**
 * Pull-based line reader for a UTF-8 encoded Blob/File.
 *
 * The blob is read in CHUNK_SIZE slices. Bytes after the last line feed of a
 * chunk are kept in `byteBuffer` until the rest of that line arrives, and the
 * complete lines of a chunk are cached in `lines` and handed out one at a
 * time by `readLine()`.
 *
 * Only one read may be in flight at a time, because a single FileReader is
 * shared: always await `readLine()` before calling it again.
 */
class TextReader {
    // Number of bytes requested from the blob per read.
    CHUNK_SIZE = 8192000; // I FOUND THIS TO BE BEST FOR MY NEEDS, CAN BE ADJUSTED
    // Byte offset of the next unread byte in the blob.
    position = 0;
    // Total size of the blob in bytes (set in the constructor).
    length = 0;

    // Bytes already read that do not yet form a complete line.
    byteBuffer = new Uint8Array(0);

    // Complete lines from the latest chunk, excluding the one already returned.
    lines = [];
    lineCount = 0;
    // Index into `lines` of the next line to hand out.
    lineIndexTracker = 0;

    fileReader = new FileReader();
    textDecoder = new TextDecoder(`utf-8`);

    /** True when every cached line has been returned by readLine(). */
    get allCachedLinesAreDispatched() {
        return !(this.lineIndexTracker < this.lineCount);
    }

    /** True when every byte of the blob has been read. */
    get blobIsReadInFull() {
        return !(this.position < this.length);
    }

    /** True when no partial-line bytes are pending. */
    get bufferIsEmpty() {
        return this.byteBuffer.length === 0;
    }

    /** True when readLine() has nothing left to return. */
    get endOfStream() {
        return this.blobIsReadInFull && this.allCachedLinesAreDispatched && this.bufferIsEmpty;
    }

    /**
     * @param {Blob} blob - The blob (or File) to read lines from.
     */
    constructor(blob) {
        this.blob = blob;
        this.length = blob.size;
    }

    /**
     * Reads a blob into an ArrayBuffer using the shared FileReader.
     *
     * @param {Blob} blob - The blob (usually a slice) to read.
     * @returns {Promise<ArrayBuffer>} The blob's bytes; rejects with the
     *     FileReader's error if the read fails.
     */
    blob2arrayBuffer(blob) {
        return new Promise((resolve, reject) => {
            // Reject with the actual error rather than the bare event object.
            this.fileReader.onerror = () => {
                reject(this.fileReader.error ?? new Error(`Failed to read blob`));
            };
            this.fileReader.onload = () => {
                resolve(this.fileReader.result);
            };

            this.fileReader.readAsArrayBuffer(blob);
        });
    }

    /**
     * Reads up to `count` bytes of the blob starting at `offset`.
     *
     * @param {number} offset - Zero-based byte offset to start at.
     * @param {number} count - Maximum number of bytes to read.
     * @returns {Promise<ArrayBuffer>} The bytes read; an empty buffer when
     *     the arguments are not valid integers or lie outside the blob.
     */
    async read(offset, count) {
        if (!Number.isInteger(offset) || !Number.isInteger(count) || count < 1 || offset < 0 || offset > this.length - 1) {
            return new ArrayBuffer(0);
        }

        // Clamp the end of the slice to the end of the blob.
        const endIndex = Math.min(offset + count, this.length);

        return this.blob2arrayBuffer(this.blob.slice(offset, endIndex));
    }

    /**
     * Returns the next line of the blob.
     *
     * @returns {Promise<string|null>} The next line including its trailing
     *     `\n`; the final line without `\n` if the blob does not end in one;
     *     `null` once everything has been returned. Rejects if reading the
     *     blob fails.
     */
    async readLine() {
        // Serve from the cache first; no I/O needed.
        if (!this.allCachedLinesAreDispatched) {
            return this.lines[this.lineIndexTracker++] + `\n`;
        }

        while (!this.blobIsReadInFull) {
            const arrayBuffer = await this.read(this.position, this.CHUNK_SIZE);

            // read() only comes back empty here when CHUNK_SIZE is invalid;
            // without this guard the loop would never advance.
            if (arrayBuffer.byteLength === 0) {
                throw new RangeError(`CHUNK_SIZE must be a positive integer`);
            }

            this.position += arrayBuffer.byteLength;

            // Append the new chunk to the bytes left over from the previous one.
            const merged = new Uint8Array(this.byteBuffer.length + arrayBuffer.byteLength);
            merged.set(this.byteBuffer);
            merged.set(new Uint8Array(arrayBuffer), this.byteBuffer.length);
            this.byteBuffer = merged;

            const lastIndexOfLineFeedCharacter = this.byteBuffer.lastIndexOf(10); // LINE FEED CHARACTER (\n) IS ONE BYTE LONG IN UTF-8 AND IS 10 IN ITS DECIMAL FORM

            if (lastIndexOfLineFeedCharacter > -1) {
                // Decode only the complete lines (up to, not including, the last
                // line feed). The tail may end in the middle of a multi-byte
                // character, so it stays in byteBuffer as raw bytes.
                const completeBytes = this.byteBuffer.subarray(0, lastIndexOfLineFeedCharacter);
                const lines = this.textDecoder.decode(completeBytes).split(`\n`);
                this.byteBuffer = this.byteBuffer.slice(lastIndexOfLineFeedCharacter + 1);

                // Return the first line now and cache the rest for later calls.
                this.lines = lines.slice(1);
                this.lineCount = this.lines.length;
                this.lineIndexTracker = 0;

                return lines[0] + `\n`;
            }
        }

        // The blob is exhausted: whatever is buffered is the last, unterminated line.
        if (!this.bufferIsEmpty) {
            const line = this.textDecoder.decode(this.byteBuffer);
            this.byteBuffer = new Uint8Array(0);
            return line;
        }

        return null;
    }
}

// Example usage: read the selected file line by line when "read" is clicked.
document.getElementById("read").onclick = async () => {
    const file = document.getElementById("fileInput").files[0];

    // Without a selected file there is nothing to construct a TextReader from.
    if (!file) {
        console.log("No file selected");
        return;
    }

    const textReader = new TextReader(file);

    try {
        // Option 1: loop until readLine() reports the end with null.
        while (true) {
            const line = await textReader.readLine();
            if (line === null) break;
            // PROCESS LINE
        }

        // OR

        // Option 2: loop on endOfStream. Use one option or the other; after
        // option 1 has drained the reader this loop does not execute.
        while (!textReader.endOfStream) {
            const line = await textReader.readLine();
            // PROCESS LINE
        }
    } catch (error) {
        console.error("Failed to read the selected file", error);
    }
};

我能够在1.5秒内读取单个250MB utf-8编码的文本文件，其中包含1398258行，JS堆大小不超过20MB。相比之下，如果我一次读取同一个文件，然后将结果字符串拆分为\n，仍然需要~1.5s的时间。然而，JS Heap将达到230MB。

“…但这不重要。”你凭什么认为这不重要？！如果没有行开始位置的索引，也没有以给定索引增量读取文件的能力，这绝对重要。@T.J.Crowder我用澄清更新了我的问题，也许我应该删除该语句，你是对的！这里需要更多的上下文。您正在使用HTML和JavaScript。这是在web浏览器中运行的JavaScript吗？或者这个JavaScript是作为HTML帖子的响应执行的吗？啊，忘了