Javascript 读取代码点时出现偏移问题_Javascript_Node.js_Unicode

Javascript 读取代码点时出现偏移问题

javascript node.js unicode

Javascript 读取代码点时出现偏移问题,javascript,node.js,unicode,Javascript,Node.js,Unicode,恢复：我目前正在编写一个ActionScript 3 lexer，将源代码转换为令牌。我选择按代码点解释输入，这是一个包含可选代理项对的字符串，包装在类UString中。在引擎盖下，我使用UStringPos类缓存上次读取的位置我已经测试了它如何用 'use strict'; import {Lexer} from 'core/Lexer'; import {UString} from 'utils/UString'; import ErrorHandler from 'co

恢复：我目前正在编写一个ActionScript 3 lexer，将源代码转换为令牌。我选择按代码点解释输入，这是一个包含可选代理项对的字符串，包装在类

UString

中。在引擎盖下，我使用

UStringPos

类缓存上次读取的位置

我已经测试了它如何用

'use strict';

import {Lexer}      from 'core/Lexer';
import {UString}    from 'utils/UString';
import ErrorHandler from 'core/ErrorHandler';

const errorHandler = new ErrorHandler(true);

// Tell the length to the `Lexer` manually.
const lexer = new Lexer(
  new UString('huehuehue'), 9, errorHandler);

// Scan first token
lexer.next();

const id = lexer.lookahead.value;

console.log(
    id,
    id.length
);

它本应该记录“huehuehue”，9，但这是另一个故事

为什么它缺少最后一个

'e'

？与扫描此相关的最内层方法是

Lexer#getCommonIdentifier

。我已经测试了我的

UString

部分，顺便说一下，它工作正常

词法相关定义

/*
 * Class that turns AS3 code into tokens.
 */
export class Lexer
{
  /*
   * @param {UString} source
   * @param {Number} length
   * @param {ErrorHandler} errorHandler
   */
  constructor(source, length, errorHandler)
  {
    this.source = source;
    this.length = length;
    this.index = 0;
    this.lineStart = 0;
    this.lineNumber = 1;
    this.comments = [];

    this.errorHandler = errorHandler;

    this.previousToken = null;
    this.token         = null;
    this.lookahead     = null;

    this._special = [];
  }

  /*
   * Verifies the end of file.
   */
  eof()
  {
    return this.index >= this.length;
  }

  /*
   * Advance the previous, current and lookahead tokens.
   * The lexer however does not depend on these tokens.
   */
  next()
  {
    this.previousToken = this.token;
    this.token         = this.lookahead;
    this.lookahead     = this.lex();
  }

  /*
   * Consumes the next token and return it.
   */
  lex()
  {
    this.consumeWhiteSpaces();

    while (this.consumeComment())
      this.consumeWhiteSpaces();

    let cp = this.source.codePointAt(this.index);

    let pureIdentifier =
      Character.isIdentifierStart(cp);

    if (pureIdentifier || (cp === 0x5C))
      return this.scanIdentifierOrKeyword(!pureIdentifier);

    if (this.eof())
    {
      let loc = [ this.index, this.lineNumber ];
      return new Token(TokenType.EOF, loc, loc, '<end>');
    }
  }

  /*
   * Scan an identifier, keyword or boolean literal.
   */
  scanIdentifierOrKeyword(usingEscape)
  {
    const start = this.index;
    let id;

    /* Like Esprima does: only identifiers containing
     * escapes need some overheads. */
    if (usingEscape)
    {
      id = this.getEscapedIdentifier(
        String.fromCodePoint(this.scanUnicodeEscapeSequence()));
    }
    else
      id = this.getCommonIdentifier();

    return new Token(
      TokenType.IDENTIFIER,
      [ start     , this.lineNumber ],
      [ this.index, this.lineNumber ],
      id
    );
  }

  /*
   * Interprets an identifier. If any escape appears, switches to
   * getEscapedIdentifier().
   */
  getCommonIdentifier()
  {
    const start = this.source.position.offset;
    let cp = 0;

    // Jump the starting symbol.
    ++this.index;

    while (!this.eof())
    {
      cp = this.source.codePointAt(this.index);

      if (Character.isIdentifierPart(cp))
        ++this.index;

      // Switches to escape-minded task...
      else if (cp === 0x5C)
        return this.getUnicodeEscapedIdentifier(
          this.source.string.slice(
            start, this.source.position.offset
          )
        );

      else break;
    }
    return this.source.string.slice(
      start, this.source.position.offset
    );
  }

  /* ... */
}

'use strict';

/*
 * String wrapper with methods _based_ on code points.
 */
export class UString
{
  /*
   * Constructs the {UString}.
   *
   * @param {String} s String to be wrapped.
   */
  constructor(s)
  {
    /*
     * @type {String}
     */
    this.string = s;

    /*
     * Tracks the last accessed position.
     *
     * @type {UStringPos}
     */
    this.position = new UStringPos(0, 0);
  }

  /*
   * Reads a code point at specific index.
   *
   * @param {Number} index
   * @return {Number}
   */
  codePointAt(index)
  {
    this.position.walk(this.string, index);
    return this.string.codePointAt(this.position.offset);
  }

  /*
   * Slices the internal string by code point indices.
   *
   * @param {Number} i
   * @param {Number} j
   * @return {String}
   */
  slice(i, j)
  {
    this.position.walk(this.string, i);
    i = this.position.offset;

    this.position.walk(this.string, j);
    j = this.position.offset;

    return this.string.slice(i, j);
  }
};

/*
 * Class that tracks the position of a code point on a string.
 */
export class UStringPos
{
  /*
   * Constructs the {UStringPos}.
   *
   * @param {Number} index The initial index.
   * @param {Number} offset The initial offset.
   */
  constructor(index, offset)
  {
    /*
     * @type {Number}
     */
    this.index = index;

    /*
     * @type {Number}
     */
    this.offset = offset;
  }

  /*
   * Walks to the given index.
   *
   * @param {String} s
   * @param {Number} index
   * @note No backward. Track the previous position instead.
   * @return {void}
   */
  walk(s, index)
  {
    for (; this.index < index; ++this.index)
      this.offset += (
        this._usingSurrogates(
          s.charCodeAt(this.offset)
        ) ? 2 : 1
      );
  }

  /*
   * @private
   */
  _usingSurrogates(ch)
  {
    return (ch >= 0xD800) && (ch <= 0xDBFF);
  }
};

/*
*类，该类将AS3代码转换为令牌。
*/
导出类Lexer
{
/*
*@param{UString}源
*@param{Number}长度
*@param{ErrorHandler}ErrorHandler
*/
构造函数（源、长度、errorHandler）
{
this.source=源；
这个长度=长度；
该指数=0；
this.lineStart=0；
此参数为.lineNumber=1；
this.comments=[]；
this.errorHandler=errorHandler；
this.previousToken=null；
this.token=null；
this.lookahead=null；
这个。_special=[]；
}
/*
*验证文件的结尾。
*/
eof（）
{
返回this.index>=this.length；
}
/*
*推进上一个、当前和前瞻令牌。
*然而，lexer并不依赖于这些令牌。
*/
下一个（）
{
this.previousToken=this.token；
this.token=this.lookahead；
this.lookahead=this.lex（）；
}
/*
*使用下一个令牌并返回它。
*/
法（）
{
这个函数使用空格（）；
而（this.consumercomment（））
这个函数使用空格（）；
设cp=this.source.codePointAt（this.index）；
让pureIdentifier=
字符。isIdentifierStart（cp）；
if（pureIdentifier | |（cp==0x5C））
返回此.ScanIdentifierRokeyWord（！pureIdentifier）；
if（this.eof（））
{
设loc=[this.index，this.lineNumber]；
返回新令牌（TokenType.EOF，loc，loc“”）；
}
}
/*
*扫描标识符、关键字或布尔文字。
*/
扫描仪识别器RKEYWORD（usingEscape）
{
const start=this.index；
让我看看你的身份证；
/*与Esprima一样：只有包含
*逃跑需要一些开销*/
if（usingEscape）
{
id=this.getEscapeIdentifier(
String.fromCodePoint（this.scannecDeeScapeSequence（））；
}
其他的
id=this.getCommonIdentifier（）；
返回新令牌(
TokenType.IDENTIFIER，
[开始，this.lineNumber]，
[this.index，this.lineNumber]，
身份证件
);
}
/*
*解释标识符。如果出现任何转义，则切换到
*GetEscapeIdentifier（）。
*/
getCommonIdentifier（）
{
const start=this.source.position.offset；
设cp=0；
//跳转起始符号。
++这个指数；
而（！this.eof（））
{
cp=this.source.codePointAt（this.index）；
if（字符isIdentifierPart（cp））
++这个指数；
//切换到逃避任务。。。
else if（cp==0x5C）
返回此.GetUnicodeScapeDidentifier(
this.source.string.slice(
开始，this.source.position.offset
)
);
否则就断了；
}
返回this.source.string.slice(
开始，this.source.position.offset
);
}
/* ... */
}

utils/UString.js

/*
 * Class that turns AS3 code into tokens.
 */
export class Lexer
{
  /*
   * @param {UString} source
   * @param {Number} length
   * @param {ErrorHandler} errorHandler
   */
  constructor(source, length, errorHandler)
  {
    this.source = source;
    this.length = length;
    this.index = 0;
    this.lineStart = 0;
    this.lineNumber = 1;
    this.comments = [];

    this.errorHandler = errorHandler;

    this.previousToken = null;
    this.token         = null;
    this.lookahead     = null;

    this._special = [];
  }

  /*
   * Verifies the end of file.
   */
  eof()
  {
    return this.index >= this.length;
  }

  /*
   * Advance the previous, current and lookahead tokens.
   * The lexer however does not depend on these tokens.
   */
  next()
  {
    this.previousToken = this.token;
    this.token         = this.lookahead;
    this.lookahead     = this.lex();
  }

  /*
   * Consumes the next token and return it.
   */
  lex()
  {
    this.consumeWhiteSpaces();

    while (this.consumeComment())
      this.consumeWhiteSpaces();

    let cp = this.source.codePointAt(this.index);

    let pureIdentifier =
      Character.isIdentifierStart(cp);

    if (pureIdentifier || (cp === 0x5C))
      return this.scanIdentifierOrKeyword(!pureIdentifier);

    if (this.eof())
    {
      let loc = [ this.index, this.lineNumber ];
      return new Token(TokenType.EOF, loc, loc, '<end>');
    }
  }

  /*
   * Scan an identifier, keyword or boolean literal.
   */
  scanIdentifierOrKeyword(usingEscape)
  {
    const start = this.index;
    let id;

    /* Like Esprima does: only identifiers containing
     * escapes need some overheads. */
    if (usingEscape)
    {
      id = this.getEscapedIdentifier(
        String.fromCodePoint(this.scanUnicodeEscapeSequence()));
    }
    else
      id = this.getCommonIdentifier();

    return new Token(
      TokenType.IDENTIFIER,
      [ start     , this.lineNumber ],
      [ this.index, this.lineNumber ],
      id
    );
  }

  /*
   * Interprets an identifier. If any escape appears, switches to
   * getEscapedIdentifier().
   */
  getCommonIdentifier()
  {
    const start = this.source.position.offset;
    let cp = 0;

    // Jump the starting symbol.
    ++this.index;

    while (!this.eof())
    {
      cp = this.source.codePointAt(this.index);

      if (Character.isIdentifierPart(cp))
        ++this.index;

      // Switches to escape-minded task...
      else if (cp === 0x5C)
        return this.getUnicodeEscapedIdentifier(
          this.source.string.slice(
            start, this.source.position.offset
          )
        );

      else break;
    }
    return this.source.string.slice(
      start, this.source.position.offset
    );
  }

  /* ... */
}

'use strict';

/*
 * String wrapper with methods _based_ on code points.
 */
export class UString
{
  /*
   * Constructs the {UString}.
   *
   * @param {String} s String to be wrapped.
   */
  constructor(s)
  {
    /*
     * @type {String}
     */
    this.string = s;

    /*
     * Tracks the last accessed position.
     *
     * @type {UStringPos}
     */
    this.position = new UStringPos(0, 0);
  }

  /*
   * Reads a code point at specific index.
   *
   * @param {Number} index
   * @return {Number}
   */
  codePointAt(index)
  {
    this.position.walk(this.string, index);
    return this.string.codePointAt(this.position.offset);
  }

  /*
   * Slices the internal string by code point indices.
   *
   * @param {Number} i
   * @param {Number} j
   * @return {String}
   */
  slice(i, j)
  {
    this.position.walk(this.string, i);
    i = this.position.offset;

    this.position.walk(this.string, j);
    j = this.position.offset;

    return this.string.slice(i, j);
  }
};

/*
 * Class that tracks the position of a code point on a string.
 */
export class UStringPos
{
  /*
   * Constructs the {UStringPos}.
   *
   * @param {Number} index The initial index.
   * @param {Number} offset The initial offset.
   */
  constructor(index, offset)
  {
    /*
     * @type {Number}
     */
    this.index = index;

    /*
     * @type {Number}
     */
    this.offset = offset;
  }

  /*
   * Walks to the given index.
   *
   * @param {String} s
   * @param {Number} index
   * @note No backward. Track the previous position instead.
   * @return {void}
   */
  walk(s, index)
  {
    for (; this.index < index; ++this.index)
      this.offset += (
        this._usingSurrogates(
          s.charCodeAt(this.offset)
        ) ? 2 : 1
      );
  }

  /*
   * @private
   */
  _usingSurrogates(ch)
  {
    return (ch >= 0xD800) && (ch <= 0xDBFF);
  }
};

“严格使用”；
/*
*基于代码点的方法的字符串包装器。
*/
出口类贸易
{
/*
*构造{UString}。
*
*@param{String}要包装的字符串。
*/
建造商
{
/*
*@type{String}
*/
this.string=s；
/*
*跟踪最后访问的位置。
*
*@type{UStringPos}
*/
this.position=新的UStringPos（0,0）；
}
/*
*读取特定索引处的代码点。
*
*@param{Number}索引
*@return{Number}
*/
代码点（索引）
{
this.position.walk（this.string，index）；
返回this.string.codepoint（this.position.offset）；
}
/*
*按代码点索引对内部字符串进行切片。
*
*@param{Number}i
*@param{Number}j
*@return{String}
*/
切片（i，j）
{
this.position.walk（this.string，i）；
i=此.position.offset；
this.position.walk（this.string，j）；
j=此.position.offset；
返回这个.string.slice（i，j）；
}
};
/*
*类，该类跟踪字符串上代码点的位置。
*/
出口类UStringPos
{
/*
*构造{UStringPos}。
*
*@param{Number}索引初始索引。
*@param{Number}偏移初始偏移量。
*/
构造函数（索引、偏移）
{
/*
*@type{Number}
*/
这个指数=指数；
/*
*@type{Number}
*/
这个偏移量=偏移量；
}
/*
*走到给定的索引。
*
*@param{String}s
*@param{Number}索引
*@注意不要后退。改为跟踪上一个位置。
*@return{void}
*/
步行（s，索引）
{
for（；this.indexreturn（ch>=0xD800）和&（ch好的。所以这是一个这个.source.position.offset
的问题：当我做++this.index
时，我的UStringPos
的偏移量没有更新。问题是切片的问题
    this.source.string.slice(
      start, this.source.position.offset
    );

这个片段基于偏移量，因为我必须跟踪标识符开始的前一个偏移量
解决方案
我可以使用自己的UString
类的切片，并将第一个参数用作偏移量，最后一个参数用作普通索引
'use strict';

export class UString
{
  // ...

  /*
   * Slices the internal string by using a pair of
   * offset and code point indices.
   *
   * @param {Number} i Offset
   * @param {Number} j
   * @return {String}
   */
  slice(i, j)
  {
    this.position.walk(this.string, j);
    j = this.position.offset;

    return this.string.slice(i, j);
  }

};