Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/node.js/41.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Javascript 读取代码点时出现偏移问题_Javascript_Node.js_Unicode - Fatal编程技术网

Javascript 读取代码点时出现偏移问题

Javascript 读取代码点时出现偏移问题,javascript,node.js,unicode,Javascript,Node.js,Unicode,恢复:我目前正在编写一个ActionScript 3 lexer,将源代码转换为令牌。我选择按代码点解释输入,这是一个包含可选代理项对的字符串,包装在类UString中。在引擎盖下,我使用UStringPos类缓存上次读取的位置 我已经测试了它如何用 'use strict'; import {Lexer} from 'core/Lexer'; import {UString} from 'utils/UString'; import ErrorHandler from 'co

恢复:我目前正在编写一个ActionScript 3 lexer,将源代码转换为令牌。我选择按代码点解释输入,这是一个包含可选代理项对的字符串,包装在类
UString
中。在引擎盖下,我使用
UStringPos
类缓存上次读取的位置

我已经测试了它如何用

'use strict';

import {Lexer}      from 'core/Lexer';
import {UString}    from 'utils/UString';
import ErrorHandler from 'core/ErrorHandler';

const errorHandler = new ErrorHandler(true);

// Tell the length to the `Lexer` manually.
const lexer = new Lexer(
  new UString('huehuehue'), 9, errorHandler);

// Scan first token
lexer.next();

const id = lexer.lookahead.value;

console.log(
    id,
    id.length
);
它本应该记录“huehuehue”,9,但这是另一个故事

为什么它缺少最后一个
'e'
?与扫描此相关的最内层方法是
Lexer#getCommonIdentifier
。我已经测试了我的
UString
部分,顺便说一下,它工作正常

词法相关定义

/*
 * Class that turns AS3 code into tokens.
 */
export class Lexer
{
  /*
   * @param {UString} source
   * @param {Number} length
   * @param {ErrorHandler} errorHandler
   */
  constructor(source, length, errorHandler)
  {
    this.source = source;
    this.length = length;
    this.index = 0;
    this.lineStart = 0;
    this.lineNumber = 1;
    this.comments = [];

    this.errorHandler = errorHandler;

    this.previousToken = null;
    this.token         = null;
    this.lookahead     = null;

    this._special = [];
  }

  /*
   * Verifies the end of file.
   */
  eof()
  {
    return this.index >= this.length;
  }

  /*
   * Advance the previous, current and lookahead tokens.
   * The lexer however does not depend on these tokens.
   */
  next()
  {
    this.previousToken = this.token;
    this.token         = this.lookahead;
    this.lookahead     = this.lex();
  }

  /*
   * Consumes the next token and return it.
   */
  lex()
  {
    this.consumeWhiteSpaces();

    while (this.consumeComment())
      this.consumeWhiteSpaces();

    let cp = this.source.codePointAt(this.index);

    let pureIdentifier =
      Character.isIdentifierStart(cp);

    if (pureIdentifier || (cp === 0x5C))
      return this.scanIdentifierOrKeyword(!pureIdentifier);

    if (this.eof())
    {
      let loc = [ this.index, this.lineNumber ];
      return new Token(TokenType.EOF, loc, loc, '<end>');
    }
  }

  /*
   * Scan an identifier, keyword or boolean literal.
   */
  scanIdentifierOrKeyword(usingEscape)
  {
    const start = this.index;
    let id;

    /* Like Esprima does: only identifiers containing
     * escapes need some overheads. */
    if (usingEscape)
    {
      id = this.getEscapedIdentifier(
        String.fromCodePoint(this.scanUnicodeEscapeSequence()));
    }
    else
      id = this.getCommonIdentifier();

    return new Token(
      TokenType.IDENTIFIER,
      [ start     , this.lineNumber ],
      [ this.index, this.lineNumber ],
      id
    );
  }

  /*
   * Interprets an identifier. If any escape appears, switches to
   * getEscapedIdentifier().
   */
  getCommonIdentifier()
  {
    const start = this.source.position.offset;
    let cp = 0;

    // Jump the starting symbol.
    ++this.index;

    while (!this.eof())
    {
      cp = this.source.codePointAt(this.index);

      if (Character.isIdentifierPart(cp))
        ++this.index;

      // Switches to escape-minded task...
      else if (cp === 0x5C)
        return this.getUnicodeEscapedIdentifier(
          this.source.string.slice(
            start, this.source.position.offset
          )
        );

      else break;
    }
    return this.source.string.slice(
      start, this.source.position.offset
    );
  }

  /* ... */
}
'use strict';

/*
 * String wrapper with methods _based_ on code points.
 */
export class UString
{
  /*
   * Constructs the {UString}.
   *
   * @param {String} s String to be wrapped.
   */
  constructor(s)
  {
    /*
     * @type {String}
     */
    this.string = s;

    /*
     * Tracks the last accessed position.
     *
     * @type {UStringPos}
     */
    this.position = new UStringPos(0, 0);
  }

  /*
   * Reads a code point at specific index.
   *
   * @param {Number} index
   * @return {Number}
   */
  codePointAt(index)
  {
    this.position.walk(this.string, index);
    return this.string.codePointAt(this.position.offset);
  }

  /*
   * Slices the internal string by code point indices.
   *
   * @param {Number} i
   * @param {Number} j
   * @return {String}
   */
  slice(i, j)
  {
    this.position.walk(this.string, i);
    i = this.position.offset;

    this.position.walk(this.string, j);
    j = this.position.offset;

    return this.string.slice(i, j);
  }
};

/*
 * Class that tracks the position of a code point on a string.
 */
export class UStringPos
{
  /*
   * Constructs the {UStringPos}.
   *
   * @param {Number} index The initial index.
   * @param {Number} offset The initial offset.
   */
  constructor(index, offset)
  {
    /*
     * @type {Number}
     */
    this.index = index;

    /*
     * @type {Number}
     */
    this.offset = offset;
  }

  /*
   * Walks to the given index.
   *
   * @param {String} s
   * @param {Number} index
   * @note No backward. Track the previous position instead.
   * @return {void}
   */
  walk(s, index)
  {
    for (; this.index < index; ++this.index)
      this.offset += (
        this._usingSurrogates(
          s.charCodeAt(this.offset)
        ) ? 2 : 1
      );
  }

  /*
   * @private
   */
  _usingSurrogates(ch)
  {
    return (ch >= 0xD800) && (ch <= 0xDBFF);
  }
};
/*
*类,该类将AS3代码转换为令牌。
*/
导出类Lexer
{
/*
*@param{UString}源
*@param{Number}长度
*@param{ErrorHandler}ErrorHandler
*/
构造函数(源、长度、errorHandler)
{
this.source=源;
这个长度=长度;
该指数=0;
this.lineStart=0;
此参数为.lineNumber=1;
this.comments=[];
this.errorHandler=errorHandler;
this.previousToken=null;
this.token=null;
this.lookahead=null;
这个。_special=[];
}
/*
*验证文件的结尾。
*/
eof()
{
返回this.index>=this.length;
}
/*
*推进上一个、当前和前瞻令牌。
*然而,lexer并不依赖于这些令牌。
*/
下一个()
{
this.previousToken=this.token;
this.token=this.lookahead;
this.lookahead=this.lex();
}
/*
*使用下一个令牌并返回它。
*/
法()
{
这个函数使用空格();
而(this.consumercomment())
这个函数使用空格();
设cp=this.source.codePointAt(this.index);
让pureIdentifier=
字符。isIdentifierStart(cp);
if(pureIdentifier | |(cp==0x5C))
返回此.ScanIdentifierRokeyWord(!pureIdentifier);
if(this.eof())
{
设loc=[this.index,this.lineNumber];
返回新令牌(TokenType.EOF,loc,loc“”);
}
}
/*
*扫描标识符、关键字或布尔文字。
*/
扫描仪识别器RKEYWORD(usingEscape)
{
const start=this.index;
让我看看你的身份证;
/*与Esprima一样:只有包含
*逃跑需要一些开销*/
if(usingEscape)
{
id=this.getEscapeIdentifier(
String.fromCodePoint(this.scannecDeeScapeSequence());
}
其他的
id=this.getCommonIdentifier();
返回新令牌(
TokenType.IDENTIFIER,
[开始,this.lineNumber],
[this.index,this.lineNumber],
身份证件
);
}
/*
*解释标识符。如果出现任何转义,则切换到
*GetEscapeIdentifier()。
*/
getCommonIdentifier()
{
const start=this.source.position.offset;
设cp=0;
//跳转起始符号。
++这个指数;
而(!this.eof())
{
cp=this.source.codePointAt(this.index);
if(字符isIdentifierPart(cp))
++这个指数;
//切换到逃避任务。。。
else if(cp==0x5C)
返回此.GetUnicodeScapeDidentifier(
this.source.string.slice(
开始,this.source.position.offset
)
);
否则就断了;
}
返回this.source.string.slice(
开始,this.source.position.offset
);
}
/* ... */
}
utils/UString.js

/*
 * Class that turns AS3 code into tokens.
 */
export class Lexer
{
  /*
   * @param {UString} source
   * @param {Number} length
   * @param {ErrorHandler} errorHandler
   */
  constructor(source, length, errorHandler)
  {
    this.source = source;
    this.length = length;
    this.index = 0;
    this.lineStart = 0;
    this.lineNumber = 1;
    this.comments = [];

    this.errorHandler = errorHandler;

    this.previousToken = null;
    this.token         = null;
    this.lookahead     = null;

    this._special = [];
  }

  /*
   * Verifies the end of file.
   */
  eof()
  {
    return this.index >= this.length;
  }

  /*
   * Advance the previous, current and lookahead tokens.
   * The lexer however does not depend on these tokens.
   */
  next()
  {
    this.previousToken = this.token;
    this.token         = this.lookahead;
    this.lookahead     = this.lex();
  }

  /*
   * Consumes the next token and return it.
   */
  lex()
  {
    this.consumeWhiteSpaces();

    while (this.consumeComment())
      this.consumeWhiteSpaces();

    let cp = this.source.codePointAt(this.index);

    let pureIdentifier =
      Character.isIdentifierStart(cp);

    if (pureIdentifier || (cp === 0x5C))
      return this.scanIdentifierOrKeyword(!pureIdentifier);

    if (this.eof())
    {
      let loc = [ this.index, this.lineNumber ];
      return new Token(TokenType.EOF, loc, loc, '<end>');
    }
  }

  /*
   * Scan an identifier, keyword or boolean literal.
   */
  scanIdentifierOrKeyword(usingEscape)
  {
    const start = this.index;
    let id;

    /* Like Esprima does: only identifiers containing
     * escapes need some overheads. */
    if (usingEscape)
    {
      id = this.getEscapedIdentifier(
        String.fromCodePoint(this.scanUnicodeEscapeSequence()));
    }
    else
      id = this.getCommonIdentifier();

    return new Token(
      TokenType.IDENTIFIER,
      [ start     , this.lineNumber ],
      [ this.index, this.lineNumber ],
      id
    );
  }

  /*
   * Interprets an identifier. If any escape appears, switches to
   * getEscapedIdentifier().
   */
  getCommonIdentifier()
  {
    const start = this.source.position.offset;
    let cp = 0;

    // Jump the starting symbol.
    ++this.index;

    while (!this.eof())
    {
      cp = this.source.codePointAt(this.index);

      if (Character.isIdentifierPart(cp))
        ++this.index;

      // Switches to escape-minded task...
      else if (cp === 0x5C)
        return this.getUnicodeEscapedIdentifier(
          this.source.string.slice(
            start, this.source.position.offset
          )
        );

      else break;
    }
    return this.source.string.slice(
      start, this.source.position.offset
    );
  }

  /* ... */
}
'use strict';

/*
 * String wrapper with methods _based_ on code points.
 */
export class UString
{
  /*
   * Constructs the {UString}.
   *
   * @param {String} s String to be wrapped.
   */
  constructor(s)
  {
    /*
     * @type {String}
     */
    this.string = s;

    /*
     * Tracks the last accessed position.
     *
     * @type {UStringPos}
     */
    this.position = new UStringPos(0, 0);
  }

  /*
   * Reads a code point at specific index.
   *
   * @param {Number} index
   * @return {Number}
   */
  codePointAt(index)
  {
    this.position.walk(this.string, index);
    return this.string.codePointAt(this.position.offset);
  }

  /*
   * Slices the internal string by code point indices.
   *
   * @param {Number} i
   * @param {Number} j
   * @return {String}
   */
  slice(i, j)
  {
    this.position.walk(this.string, i);
    i = this.position.offset;

    this.position.walk(this.string, j);
    j = this.position.offset;

    return this.string.slice(i, j);
  }
};

/*
 * Class that tracks the position of a code point on a string.
 */
export class UStringPos
{
  /*
   * Constructs the {UStringPos}.
   *
   * @param {Number} index The initial index.
   * @param {Number} offset The initial offset.
   */
  constructor(index, offset)
  {
    /*
     * @type {Number}
     */
    this.index = index;

    /*
     * @type {Number}
     */
    this.offset = offset;
  }

  /*
   * Walks to the given index.
   *
   * @param {String} s
   * @param {Number} index
   * @note No backward. Track the previous position instead.
   * @return {void}
   */
  walk(s, index)
  {
    for (; this.index < index; ++this.index)
      this.offset += (
        this._usingSurrogates(
          s.charCodeAt(this.offset)
        ) ? 2 : 1
      );
  }

  /*
   * @private
   */
  _usingSurrogates(ch)
  {
    return (ch >= 0xD800) && (ch <= 0xDBFF);
  }
};
“严格使用”;
/*
*基于代码点的方法的字符串包装器。
*/
出口类贸易
{
/*
*构造{UString}。
*
*@param{String}要包装的字符串。
*/
建造商
{
/*
*@type{String}
*/
this.string=s;
/*
*跟踪最后访问的位置。
*
*@type{UStringPos}
*/
this.position=新的UStringPos(0,0);
}
/*
*读取特定索引处的代码点。
*
*@param{Number}索引
*@return{Number}
*/
代码点(索引)
{
this.position.walk(this.string,index);
返回this.string.codepoint(this.position.offset);
}
/*
*按代码点索引对内部字符串进行切片。
*
*@param{Number}i
*@param{Number}j
*@return{String}
*/
切片(i,j)
{
this.position.walk(this.string,i);
i=此.position.offset;
this.position.walk(this.string,j);
j=此.position.offset;
返回这个.string.slice(i,j);
}
};
/*
*类,该类跟踪字符串上代码点的位置。
*/
出口类UStringPos
{
/*
*构造{UStringPos}。
*
*@param{Number}索引初始索引。
*@param{Number}偏移初始偏移量。
*/
构造函数(索引、偏移)
{
/*
*@type{Number}
*/
这个指数=指数;
/*
*@type{Number}
*/
这个偏移量=偏移量;
}
/*
*走到给定的索引。
*
*@param{String}s
*@param{Number}索引
*@注意不要后退。改为跟踪上一个位置。
*@return{void}
*/
步行(s,索引)
{
for(;this.indexreturn(ch>=0xD800)和&(ch好的。所以这是一个
这个.source.position.offset
的问题:当我做
++this.index
时,我的
UStringPos
的偏移量没有更新。问题是切片的问题

    this.source.string.slice(
      start, this.source.position.offset
    );
这个片段基于偏移量,因为我必须跟踪标识符开始的前一个偏移量

解决方案

我可以使用自己的
UString
类的切片,并将第一个参数用作偏移量,最后一个参数用作普通索引

'use strict';

export class UString
{
  // ...

  /*
   * Slices the internal string by using a pair of
   * offset and code point indices.
   *
   * @param {Number} i Offset
   * @param {Number} j
   * @return {String}
   */
  slice(i, j)
  {
    this.position.walk(this.string, j);
    j = this.position.offset;

    return this.string.slice(i, j);
  }

};