Parsing OCaml：如何在没有堆栈的LL解析期间构造AST_Parsing_Ocaml_Abstract Syntax Tree

Parsing OCaml：如何在没有堆栈的LL解析期间构造AST

parsing ocaml

Parsing OCaml：如何在没有堆栈的LL解析期间构造AST,parsing,ocaml,abstract-syntax-tree,Parsing,Ocaml,Abstract Syntax Tree,我为LL1语法编写了一个预测解析器。每个非终端A都有一个对应的parseA方法，该方法接收一个令牌列表，并返回令牌列表的其余部分和一个解析树我不知道在解析器中调用哪个AST方法。有没有一个通用的方法来解决这个问题？这是我的尝试：例如，我语法中的一个小节： expr -> t eprime eprime -> PLUS t eprime | MINUS t eprime | ε t -> t tprime tprime -> TIMES f tprime | DIV

我为LL(1)语法编写了一个预测解析器。每个非终结符 A

都有一个对应的

parseA

方法，该方法接收一个令牌列表，并返回令牌列表的其余部分和一个解析树

我不知道在解析器中调用哪个AST方法。有没有一个通用的方法来解决这个问题？

这是我的尝试：

例如，我语法中的一个小节：

expr -> t eprime 
eprime -> PLUS t eprime | MINUS t eprime | ε
t -> f tprime
tprime -> TIMES f tprime | DIVIDE f tprime | ε
f -> LPAREN expr RPAREN | LITERAL | TRUE | FALSE | ID

我有五个解析方法，每个非终结符对应一个：

(* Predictive (recursive-descent) parser for the LL(1) expression grammar

     expr   -> t eprime
     eprime -> PLUS t eprime | MINUS t eprime | ε
     t      -> f tprime
     tprime -> TIMES f tprime | DIVIDE f tprime | ε
     f      -> LPAREN expr RPAREN | LITERAL | TRUE | FALSE | ID

   Each parseX takes the current token stream and returns the pair
   (remaining_tokens, tree_for_X).  The five parsers call each other, so
   they form a single let rec ... and ... group.

   NOTE(review): the token-stream type, its lookathead field, next and the
   error exception are defined outside this snippet.  This code assumes
   lookathead is the kind of the current token (a string) and that next
   returns the stream without that token -- confirm against the lexer. *)
let rec parseExpr tokenlist =
  (* expr -> t eprime.  No lookahead dispatch is needed here: parseF (reached
     through parseT) raises error for any token that cannot start a term. *)
  let (tokenlist_t, t_expr) = parseT tokenlist in
  let (tokenlist_e, e_expr) = parseEPrime tokenlist_t in
  (tokenlist_e, Ast.Expression (t_expr, e_expr))

and parseEPrime tokenlist =
  match tokenlist.lookathead with
  | "PLUS" ->
    (* Consume the operator, then parse its right operand and the tail. *)
    let (tokenlist_t, expr_t) = parseT (next tokenlist) in
    let (tokenlist_e, expr_eprime) = parseEPrime tokenlist_t in
    (tokenlist_e, Ast.Add (expr_t, expr_eprime))
  | "MINUS" ->
    let (tokenlist_t, expr_t) = parseT (next tokenlist) in
    let (tokenlist_e, expr_eprime) = parseEPrime tokenlist_t in
    (tokenlist_e, Ast.Minus (expr_t, expr_eprime))
  (* eprime -> ε: the lookahead is in FOLLOW(eprime); nothing is consumed. *)
  | "SEMI" | "RPAREN" -> (tokenlist, Ast.Eempty)
  | _ -> raise error

and parseT tokenlist =
  (* t -> f tprime.  parseF rejects tokens outside FIRST(f) itself. *)
  let (tokenlist_f, expr_f) = parseF tokenlist in
  let (tokenlist_tprime, expr_tprime) = parseTprime tokenlist_f in
  (tokenlist_tprime, Ast.F (expr_f, expr_tprime))

and parseTprime tokenlist =
  match tokenlist.lookathead with
  | "TIMES" ->
    let (tokenlist_f, expr_f) = parseF (next tokenlist) in
    let (tokenlist_tprime, expr_tprime) = parseTprime tokenlist_f in
    (tokenlist_tprime, Ast.Times (expr_f, expr_tprime))
  | "DIVIDE" ->
    let (tokenlist_f, expr_f) = parseF (next tokenlist) in
    let (tokenlist_tprime, expr_tprime) = parseTprime tokenlist_f in
    (tokenlist_tprime, Ast.Divide (expr_f, expr_tprime))
  (* tprime -> ε: the lookahead is in FOLLOW(tprime); nothing is consumed. *)
  | "PLUS" | "MINUS" | "SEMI" | "RPAREN" -> (tokenlist, Ast.TEmpty)
  | _ -> raise error

and parseF tokenlist =
  match tokenlist.lookathead with
  | "LPAREN" ->
    let (tokenlist_expr, expr) = parseExpr (next tokenlist) in
    (* The inner match is parenthesised so that the arms below stay attached
       to the outer match. *)
    (match tokenlist_expr.lookathead with
     | "RPAREN" -> (next tokenlist_expr, Ast.ExpressionParen expr)
     | _ -> raise error)
  (* NOTE(review): Literal and Id carry a payload (int / string) in the AST
     types, but the token kinds matched here expose no lexeme.  The two
     constructors below are kept as written in the original and still need
     the token value once the lexer interface is known. *)
  | "LITERAL" -> (next tokenlist, Ast.FLiteral)
  | "TRUE" -> (next tokenlist, Ast.BoolLit true)
  | "FALSE" -> (next tokenlist, Ast.BoolLit false)
  | "ID" -> (next tokenlist, Ast.Id)
  | _ -> raise error

(* Parse-tree types, one per non-terminal.  The types refer to each other
   (expr -> t -> f -> expr), so they are declared as a single recursive
   type ... and ... group; constructor arguments are tuples written with *. *)

(* expr -> T E* *)
type expr =
  | Expression of t * eprime

(* T -> F T* *)
and t =
  | F of f * tprime

(* E* -> + T E*
   E* -> - T E*
   E* -> ε *)
and eprime =
  | Add of t * eprime
  | Minus of t * eprime
  | Eempty

(* T* -> TIMES F T*
   T* -> / F T*
   T* -> ε *)
and tprime =
  | Divide of f * tprime
  | Times of f * tprime
  | TEmpty

(* F -> LPAREN E RPAREN
   F -> Literal
   F -> TRUE
   F -> FALSE
   F -> ID *)
and f =
  | ExpressionParen of expr
  | Literal of int
  | BoolLit of bool
  | Id of string

正如您可能从我的代码中看到的，我为每个非终结符编写了一个类型，然后为该非终结符的每个产生式都编写了一个方法。

(* Predictive (recursive-descent) parser for the LL(1) expression grammar

     expr   -> t eprime
     eprime -> PLUS t eprime | MINUS t eprime | ε
     t      -> f tprime
     tprime -> TIMES f tprime | DIVIDE f tprime | ε
     f      -> LPAREN expr RPAREN | LITERAL | TRUE | FALSE | ID

   Each parseX takes the current token stream and returns the pair
   (remaining_tokens, tree_for_X).  The five parsers call each other, so
   they form a single let rec ... and ... group.

   NOTE(review): the token-stream type, its lookathead field, next and the
   error exception are defined outside this snippet.  This code assumes
   lookathead is the kind of the current token (a string) and that next
   returns the stream without that token -- confirm against the lexer. *)
let rec parseExpr tokenlist =
  (* expr -> t eprime.  No lookahead dispatch is needed here: parseF (reached
     through parseT) raises error for any token that cannot start a term. *)
  let (tokenlist_t, t_expr) = parseT tokenlist in
  let (tokenlist_e, e_expr) = parseEPrime tokenlist_t in
  (tokenlist_e, Ast.Expression (t_expr, e_expr))

and parseEPrime tokenlist =
  match tokenlist.lookathead with
  | "PLUS" ->
    (* Consume the operator, then parse its right operand and the tail. *)
    let (tokenlist_t, expr_t) = parseT (next tokenlist) in
    let (tokenlist_e, expr_eprime) = parseEPrime tokenlist_t in
    (tokenlist_e, Ast.Add (expr_t, expr_eprime))
  | "MINUS" ->
    let (tokenlist_t, expr_t) = parseT (next tokenlist) in
    let (tokenlist_e, expr_eprime) = parseEPrime tokenlist_t in
    (tokenlist_e, Ast.Minus (expr_t, expr_eprime))
  (* eprime -> ε: the lookahead is in FOLLOW(eprime); nothing is consumed. *)
  | "SEMI" | "RPAREN" -> (tokenlist, Ast.Eempty)
  | _ -> raise error

and parseT tokenlist =
  (* t -> f tprime.  parseF rejects tokens outside FIRST(f) itself. *)
  let (tokenlist_f, expr_f) = parseF tokenlist in
  let (tokenlist_tprime, expr_tprime) = parseTprime tokenlist_f in
  (tokenlist_tprime, Ast.F (expr_f, expr_tprime))

and parseTprime tokenlist =
  match tokenlist.lookathead with
  | "TIMES" ->
    let (tokenlist_f, expr_f) = parseF (next tokenlist) in
    let (tokenlist_tprime, expr_tprime) = parseTprime tokenlist_f in
    (tokenlist_tprime, Ast.Times (expr_f, expr_tprime))
  | "DIVIDE" ->
    let (tokenlist_f, expr_f) = parseF (next tokenlist) in
    let (tokenlist_tprime, expr_tprime) = parseTprime tokenlist_f in
    (tokenlist_tprime, Ast.Divide (expr_f, expr_tprime))
  (* tprime -> ε: the lookahead is in FOLLOW(tprime); nothing is consumed. *)
  | "PLUS" | "MINUS" | "SEMI" | "RPAREN" -> (tokenlist, Ast.TEmpty)
  | _ -> raise error

and parseF tokenlist =
  match tokenlist.lookathead with
  | "LPAREN" ->
    let (tokenlist_expr, expr) = parseExpr (next tokenlist) in
    (* The inner match is parenthesised so that the arms below stay attached
       to the outer match. *)
    (match tokenlist_expr.lookathead with
     | "RPAREN" -> (next tokenlist_expr, Ast.ExpressionParen expr)
     | _ -> raise error)
  (* NOTE(review): Literal and Id carry a payload (int / string) in the AST
     types, but the token kinds matched here expose no lexeme.  The two
     constructors below are kept as written in the original and still need
     the token value once the lexer interface is known. *)
  | "LITERAL" -> (next tokenlist, Ast.FLiteral)
  | "TRUE" -> (next tokenlist, Ast.BoolLit true)
  | "FALSE" -> (next tokenlist, Ast.BoolLit false)
  | "ID" -> (next tokenlist, Ast.Id)
  | _ -> raise error

(* Parse-tree types, one per non-terminal.  The types refer to each other
   (expr -> t -> f -> expr), so they are declared as a single recursive
   type ... and ... group; constructor arguments are tuples written with *. *)

(* expr -> T E* *)
type expr =
  | Expression of t * eprime

(* T -> F T* *)
and t =
  | F of f * tprime

(* E* -> + T E*
   E* -> - T E*
   E* -> ε *)
and eprime =
  | Add of t * eprime
  | Minus of t * eprime
  | Eempty

(* T* -> TIMES F T*
   T* -> / F T*
   T* -> ε *)
and tprime =
  | Divide of f * tprime
  | Times of f * tprime
  | TEmpty

(* F -> LPAREN E RPAREN
   F -> Literal
   F -> TRUE
   F -> FALSE
   F -> ID *)
and f =
  | ExpressionParen of expr
  | Literal of int
  | BoolLit of bool
  | Id of string

但我担心我的方法保留了太多不必要的信息，超出了AST通常应当保留的范围（我想象AST是一棵抖掉了不必要叶子的解析树）。到目前为止，我只去掉了括号和分号。恐怕我在AST中保留的

类型t、类型f、类型tprime、类型eprime

太多了。但是如果我删除它们，我就不知道如何在AST中编写

类型expr

。

如果每个非终结符都有一个类型，那么最终得到的树将更具体（类似于解析树），而不是抽象树

我不知道这有多糟糕，它仍然是代码的良好表示

从一个角度来看，你的语法是如此简单和精简，以至于没有太多需要省略的偶然标点符号来使树更加抽象

您可能可以统一表达式和项的类型。换句话说，表达式树只需使用一种内部节点类型。一旦在解析过程中处理好了运算符优先级，表达式和项都只是一系列子表达式，它们之间有运算符。

给定一个定义如下的AST：

(* Unified AST: a single expression type shared by every precedence level.
   Each binary operator stores its two operand sub-expressions directly;
   the leaves are integer literals, boolean literals and identifiers. *)
type expr =
  (* Binary operators: (left operand, right operand). *)
  | Add of expr * expr
  | Minus of expr * expr
  | Times of expr * expr
  | Divide of expr * expr
  (* Leaves. *)
  | IntLit of int 
  | BoolLit of bool 
  | Id of string

通过使

Prime

函数将左操作数作为参数，可以调整解析函数以返回这样的AST，如下所示：

(* expr -> t eprime, building the unified AST directly.

   Both functions return (tree, remaining_tokens).  parseExprPrime receives
   the tree parsed so far as lhs and folds each following operand into it,
   so a chain of + and - comes out left-associative.

   NOTE(review): parseT and the token constructors PLUS / MINUS are defined
   outside this snippet; tokens is assumed to be a plain list of tokens, as
   the :: patterns imply.  In a complete parser parseT would have to join
   this recursive group if it reaches parseExpr through parenthesised
   expressions -- confirm. *)
let rec parseExpr tokens =
  let (lhs, remainingTokens) = parseT tokens in
  parseExprPrime lhs remainingTokens

and parseExprPrime lhs tokens =
  match tokens with
  | PLUS :: rest ->
    (* The pattern already consumed the operator; rest starts at the operand. *)
    let (rhs, remainingTokens) = parseT rest in
    parseExprPrime (Add (lhs, rhs)) remainingTokens
  | MINUS :: rest ->
    let (rhs, remainingTokens) = parseT rest in
    parseExprPrime (Minus (lhs, rhs)) remainingTokens
  (* Anything else ends the expression; the caller decides whether the next
     token is legal at that point. *)
  | _ -> (lhs, tokens)

(* Integer evaluator over the parse-tree types that mirror the grammar
   (expr / eprime / t / tprime / f).  The ...Prime helpers take the value
   accumulated so far and apply each following operand to it, left to right.
   Constructor names follow the original type definitions (F for a term,
   Literal for an integer literal).

   Raises Invalid_argument for BoolLit and Id, which have no integer value;
   integer division by zero raises Division_by_zero. *)
let rec eval = function
  | Expression (lhs, eprime) -> evalEPrime (evalT lhs) eprime

and evalEPrime lhsValue = function
  | Add (rhs, rest) -> evalEPrime (lhsValue + evalT rhs) rest
  | Minus (rhs, rest) -> evalEPrime (lhsValue - evalT rhs) rest
  | Eempty -> lhsValue

and evalT = function
  | F (lhs, tprime) -> evalTPrime (evalF lhs) tprime

and evalTPrime lhsValue = function
  | Times (rhs, rest) -> evalTPrime (lhsValue * evalF rhs) rest
  | Divide (rhs, rest) -> evalTPrime (lhsValue / evalF rhs) rest
  | TEmpty -> lhsValue

and evalF = function
  | ExpressionParen expr -> eval expr
  | Literal i -> i
  | BoolLit _ | Id _ ->
    invalid_arg "eval: only integer expressions are supported"

parseT

和

parseTPrime

看起来是一样的（当然除了乘法和除法），而

parseF

几乎保持原样，除了

Ast.ExpressionParen（expr）

将只是

expr

，因为我还从AST定义中删除了

ExpressionParen

案例

请注意，这里没有必要区分合法标记和非法标记。无论是像 ; 或 ) 这样的合法标记，还是非法标记，都只需返回 lhs, tokens 即可。在后一种情况下，非法标记最终会被调用方的解析函数检测到——无需在多个位置检测错误。表达式规则也是如此：如果 tokens 以非法标记开头，parseF 会检测到它，因此无需在此处进行检查。同样的代码也不需要重复四次：您只需调用 parseT 和 parseExprPrime，而不必查看当前标记，这些函数会自行处理。

至于这样简化AST是否值得——让我们以函数 eval : expr -> int 作为案例研究（为此暂且忽略 BoolLit 和 Id）。使用原始定义，它将如下所示：
(* expr -> t eprime, building the unified AST directly.

   Both functions return (tree, remaining_tokens).  parseExprPrime receives
   the tree parsed so far as lhs and folds each following operand into it,
   so a chain of + and - comes out left-associative.

   NOTE(review): parseT and the token constructors PLUS / MINUS are defined
   outside this snippet; tokens is assumed to be a plain list of tokens, as
   the :: patterns imply.  In a complete parser parseT would have to join
   this recursive group if it reaches parseExpr through parenthesised
   expressions -- confirm. *)
let rec parseExpr tokens =
  let (lhs, remainingTokens) = parseT tokens in
  parseExprPrime lhs remainingTokens

and parseExprPrime lhs tokens =
  match tokens with
  | PLUS :: rest ->
    (* The pattern already consumed the operator; rest starts at the operand. *)
    let (rhs, remainingTokens) = parseT rest in
    parseExprPrime (Add (lhs, rhs)) remainingTokens
  | MINUS :: rest ->
    let (rhs, remainingTokens) = parseT rest in
    parseExprPrime (Minus (lhs, rhs)) remainingTokens
  (* Anything else ends the expression; the caller decides whether the next
     token is legal at that point. *)
  | _ -> (lhs, tokens)

(* Integer evaluator over the parse-tree types that mirror the grammar
   (expr / eprime / t / tprime / f).  The ...Prime helpers take the value
   accumulated so far and apply each following operand to it, left to right.
   Constructor names follow the original type definitions (F for a term,
   Literal for an integer literal).

   Raises Invalid_argument for BoolLit and Id, which have no integer value;
   integer division by zero raises Division_by_zero. *)
let rec eval = function
  | Expression (lhs, eprime) -> evalEPrime (evalT lhs) eprime

and evalEPrime lhsValue = function
  | Add (rhs, rest) -> evalEPrime (lhsValue + evalT rhs) rest
  | Minus (rhs, rest) -> evalEPrime (lhsValue - evalT rhs) rest
  | Eempty -> lhsValue

and evalT = function
  | F (lhs, tprime) -> evalTPrime (evalF lhs) tprime

and evalTPrime lhsValue = function
  | Times (rhs, rest) -> evalTPrime (lhsValue * evalF rhs) rest
  | Divide (rhs, rest) -> evalTPrime (lhsValue / evalF rhs) rest
  | TEmpty -> lhsValue

and evalF = function
  | ExpressionParen expr -> eval expr
  | Literal i -> i
  | BoolLit _ | Id _ ->
    invalid_arg "eval: only integer expressions are supported"

使用简化定义，它将改为：
(* Integer evaluator over the unified expr AST: each operator node evaluates
   both operands and combines them.

   Raises Invalid_argument for BoolLit and Id, which have no integer value;
   integer division by zero raises Division_by_zero. *)
let rec eval = function
  | Add (lhs, rhs) -> eval lhs + eval rhs
  | Minus (lhs, rhs) -> eval lhs - eval rhs
  | Times (lhs, rhs) -> eval lhs * eval rhs
  | Divide (lhs, rhs) -> eval lhs / eval rhs
  | IntLit i -> i
  | BoolLit _ | Id _ ->
    invalid_arg "eval: only integer expressions are supported"

我认为简化版本肯定能让处理AST的工作更轻松，因此我认为这是值得的。
我明白如何为表达式树构造单一的内部节点类型。然而，我不知道如何在子解析例程中返回子表达式列表，以便在 parseExpr 中返回 Ast.Expression(x, operator, y)。
类型对 (e, eprime) 和 (t, tprime) 显然是相似的，您可能只需要对两者都使用 (e, eprime)（使用4个运算符而不是2个）。但您不必使用完全不同的结构，仍然可以使用 Expression of e * eprime。
我明白了——我想我是在问如何仍然得到运算符——在我当前的结构中，如果我在 eprime 中看到 Add，我就调用 Ast.Add。如果我一直等到返回 expr，然后对 expr 的所有产生式都调用 Ast.Expression(e, eprime)，怎样才能不丢失操作数呢？
首先，别误会，我认为您当前的代码很好。就像我说的，它其实并不算太具体，只是语法本身非常精简。但要回答您的问题，只需假设您用5个变体（加、减、除、乘、空）来定义 eprime。然后您就可以用新的 eprime 取代旧的 eprime 和 tprime。您仍然可以像以前一样使用 Ast.Add 和 Ast.Divide，现在它们都是 eprime 的构造函数。