(* This file defines the parser. *) %{ open MIPSOps open LPP open Primitive %} (* ------------------------------------------------------------------------- *) (* This is the start symbol of the grammar. *) %start <LPP.program> program (* ------------------------------------------------------------------------- *) (* These tokens are produced by the lexer and exploited by the parser. *) %token <int32> INTCONST %token <bool> BOOLCONST %token <string Location.t> ID %token PLUS MINUS TIMES SLASH AND OR NOT LT LE GT GE EQ NE %token LPAREN RPAREN LBRACKET RBRACKET COMMA COLONEQ SEMICOLON COLON DOT %token PROGRAM BEGIN END IF THEN ELSE WHILE DO PROCEDURE FUNCTION VAR %token NEW READLN WRITE WRITELN %token INTEGER BOOLEAN ARRAY OF (* ------------------------------------------------------------------------- *) (* Now come the precedence levels. These directives are used to resolve a number of shift/reduce and reduce/reduce conflicts. *) (* The tokens with lower precedence come first in this list. In other words, the fact that [OR] comes before [AND] in the list means that [OR] has lower precedence than [AND]. *) (* The fact that [OR] has lower precedence than [AND] means that the condition [x or y and z] is interpreted as [x or (y and z)]. In other words, in this case, a shift/reduce conflict is resolved in favor of shifting. *) (* The fact that [OR] is declared to be left-associative means that [x or y or z] is interpreted as [(x or y) or z]. In other words, in this case, a shift/reduce conflict is resolved in favor of reduction. *) (* The fact that [LT] is declared to be non-associative means that [x < y < z] is rejected: it causes a syntax error. *) (* The fact that [PLUS] has lower precedence than [LBRACKET] means that the expression [x + y[z]] is interpreted as [x + (y[z])], as desired. *) (* The instruction [if c1 then if c2 then i1 else i2] involves a classic shift/reduce conflict, known as the dangling-else conflict. The conflict occurs when the token [ELSE] is discovered. At this point, reducing the production [instruction -> IF condition THEN instruction] leads to interpreting this instruction as [if c1 then (if c2 then i1) else i2], while shifting the token [ELSE] leads to interpreting it as [if c1 then (if c2 then i1 else i2)]. The desired behavior is the latter, so we must resolve the conflict in favor of shifting. By default, the precedence level associated with reducing the above production is the level of the token [THEN]. (This convention is explained in Menhir's manual. It is inherited from yacc.) So, we give [THEN] a lower precedence level than [ELSE]. This is done by the last two lines in the declarations that follow. *) (* The declaration [%nonassoc unary_minus] makes [unary_minus] a pseudo-token: it is not an actual token, but only a name for a certain precedence level. This name is later used in an annotation of the form [%prec unary_minus], which assigns a precedence level to the reduction of the production [expression -> MINUS expression]. The fact that [unary_minus] has greater precedence than [PLUS], for instance, means that [-x+y] is correctly interpreted as [(-x)+y]. *) (* When applicable, this mechanism for defining precedence levels is extremely concise: in this grammar, for instance, more than a hundred conflicts are resolved by the ten lines that follow. *) %left OR %left AND %nonassoc NOT %nonassoc LT LE GT GE EQ NE %left MINUS PLUS %left TIMES SLASH %nonassoc unary_minus %nonassoc LBRACKET %nonassoc THEN %nonassoc ELSE %% (* The productions of the grammar follow. *) (* ------------------------------------------------------------------------- *) (* Types. *) typ: INTEGER { TypInt } BOOLEAN { TypBool } ARRAY OF t = typ { TypArray t } (* ------------------------------------------------------------------------- *) (* Expressions. *) (* Formal and actual parameter lists are delimited with parentheses and separated with semicolons and commas, respectively. *) expression: e = raw_expression { Location.make $startpos $endpos e } LPAREN e = expression RPAREN { e } raw_expression: i = INTCONST { EConst (ConstInt i) } b = BOOLCONST { EConst (ConstBool b) } id = ID { EGetVar id } MINUS e = expression %prec unary_minus { EUnOp (UOpNeg, e) } e1 = expression op = binop e2 = expression { EBinOp (op, e1, e2) } c = callee LPAREN actuals = separated_list(COMMA, expression) RPAREN { EFunCall (c, actuals) } a = expression LBRACKET i = expression RBRACKET { EArrayGet (a, i) } NEW ARRAY OF t = typ LBRACKET e = expression RBRACKET { EArrayAlloc (t, e) } (* Binary operators are defined as a separate nonterminal symbol for enhanced readability. However, note that this symbol is declared %inline, which means that its definition is in fact textually expanded into the definition of expressions above. This is necessary for operator priorities to be properly taken into account. *) %inline binop: PLUS { OpAdd } MINUS { OpSub } TIMES { OpMul } SLASH { OpDiv } LT { OpLt } LE { OpLe } GT { OpGt } GE { OpGe } EQ { OpEq } NE { OpNe } (* ------------------------------------------------------------------------- *) (* Conditions. We first define ``nontrivial'' conditions, which cannot simply be an expression or a parenthesized expression: they must involve at least one of the Boolean operators NOT, AND, OR. Then, we define a condition to be either a (Boolean-valued) expression or a nontrivial condition. This two-step approach is necessary to avoid ambiguity with parentheses. Indeed, conditions include expressions, and both conditions and expressions can be parenthesized, so a naïve approach would yield an ambiguous grammar. Thanks to Yannick Moy for suggesting this solution. *) nontrivial_condition: NOT c = condition { CNot c } c1 = condition AND c2 = condition { CAnd (c1, c2) } c1 = condition OR c2 = condition { COr (c1, c2) } LPAREN c = nontrivial_condition RPAREN { c } condition: e = expression { CExpression e } c = nontrivial_condition { c } (* ------------------------------------------------------------------------- *) (* Instructions and blocks. *) instruction: b = block { b } id = ID COLONEQ e = expression { ISetVar (id, e) } callee = callee LPAREN actuals = separated_list(COMMA, expression) RPAREN { IProcCall (callee, actuals) } a = expression LBRACKET i = expression RBRACKET COLONEQ e = expression { IArraySet (a, i, e) } IF c = condition THEN b = instruction { IIf (c, b, ISeq []) } IF c = condition THEN b1 = instruction ELSE b2 = instruction { IIf (c, b1, b2) } WHILE c = condition DO b = instruction { IWhile (c, b) } block: BEGIN is = separated_list(SEMICOLON, instruction) END { ISeq is } (* ------------------------------------------------------------------------- *) (* Callees. *) %inline callee: c = raw_callee { Location.make $startpos $endpos c } raw_callee: WRITE { CPrimitiveFunction Write } WRITELN { CPrimitiveFunction Writeln } READLN { CPrimitiveFunction Readln } id = ID { CUserFunction (Location.content id) } (* ------------------------------------------------------------------------- *) (* Procedures and functions. *) (* Procedures and functions are very similar, so we are able to express a single semantic action that covers both productions. In the case of procedures, the variable [result] is bound to the value [None] via a simple trick: we recognize the non-terminal symbol [no_result_type], whose language is empty. *) procedure: FUNCTION f = ID LPAREN formals = separated_bindings RPAREN COLON result = some_result_type SEMICOLON locals = variables body = block SEMICOLON PROCEDURE f = ID LPAREN formals = separated_bindings RPAREN result = no_result_type SEMICOLON locals = variables body = block SEMICOLON { f, { formals = formals; result = result; locals = locals; body = body } } %inline no_result_type: (* nothing *) { None } %inline some_result_type: t = typ { Some t } separated_bindings: bindings = separated_list(SEMICOLON, binding) (* list can be empty *) { List.flatten bindings } terminated_bindings: bindings = terminated(binding, SEMICOLON)+ (* list is nonempty *) { List.flatten bindings } binding: ids = separated_nonempty_list(COMMA, ID) COLON t = typ { List.map (fun id -> (id, t)) ids } variables: vars = loption(preceded(VAR, terminated_bindings)) { vars } (* ------------------------------------------------------------------------- *) (* Programs. *) program: PROGRAM globals = variables defs = procedure* main = block DOT {{ globals = globals; defs = defs; main = main }}