parser grammar PreprocessorParser; /* Author: Stephen F. Siegel, University of Delaware * Last modified: July 15, 2016 * * Grammar for C preprocessor. * This grammar describes a C source file before preprocessing. * It does not execute any preprocessor directives. * It simply represents the file in a structured way. * * See the C11 Standard, Sec. 6.10. * * This grammar uses the PreprocessorLexer, which has already * formed the preprocessor tokens. * * Extensions from other languages (beyond C11) are included. */ // TODO: use things like this: // bar : ID{$ID.setText("HELLO");$ID.setType(0);} WS INT -> ID INT; options { tokenVocab=PreprocessorLexer; output=AST; } /* "imaginary" tokens that will be used in the tree */ tokens { FILE; // root node TEXT_BLOCK; // a list of tokens PARAMLIST; // x1,x2,x3 EXPR; // an expression used in a conditional (#if) SEQUENCE; // true branch of conditional directive BODY; // body of macro definition PIF; // preprocessor if: #if PELSE; // preprocessor else: #else PPRAGMA; // preprocessor pragma: #pragma /* C, CIVL, ACSL, and CUDA keywords */ AUTO; ASM; BREAK; CASE; CHAR; CONST; CONTINUE; DEFAULT; DO; DOUBLE; ENUM; EXTERN; FLOAT; FOR; GOTO; INLINE; INT; LONG; REGISTER; RESTRICT; RETURN; SHORT; SIGNED; SIZEOF; STATIC; STRUCT; SWITCH; TYPEDEF; UNION; UNSIGNED; VOID; VOLATILE; WHILE; ALIGNAS; ALIGNOF; ATOMIC; BOOL; COMPLEX; GENERIC; IMAGINARY; NORETURN; STATICASSERT; THREADLOCAL; /* */ ABSTRACT; ASSIGNS; BIG_O; CALLS; CATCH; CHOOSE; CIVLATOMIC; CIVLFOR; COLLECTIVE; //dummy CONTIN; DEPENDS; DERIV; DIFFERENTIABLE; DOMAIN; ENSURES; EXISTS; FORALL; FATOMIC; GUARD; HERE; INPUT; INVARIANT; LAMBDA; MEM_TYPE; OUTPUT; ORIGINAL; PARFOR; PROCNULL; PURE; RANGE; REAL; REQUIRES; RESULT; RUN; SCOPEOF; SELF; STATE_F; STATE_NULL; READS; SPAWN; SYSTEM; UNIFORM; UPDATE; VALUE_AT; WHEN; WITH; /* */ DEVICE; GLOBAL; SHARED; /* */ TYPEOF; } @header { package dev.civl.abc.front.c.preproc; } @members{ @Override public void emitErrorMessage(String msg) { // don't try to recover! throw new RuntimeException(msg); } } /* An item is either a preprocessor directive * or a text block. For compound directives, * such as #ifdef ... #endif, all of the text * between the opening if and the closing #endif * is considered part of the directive. * A textblock is a maximal sequence of plain * text lines. */ file : whiteBlock? itemList EOF -> ^(FILE whiteBlock? itemList EOF) ; /* items : directiveBlock* (textBlock directiveBlock+)* textBlock? ; */ /* starts with non-ws token # or something not # and ends just before * non-ws token that does not start a directive block or text block. */ itemList : directiveBlock itemList | textBlock ( directiveBlock itemList | ) | ; whiteBlock : white+ -> ^(TEXT_BLOCK white+) ; textBlock : textSegment+ -> ^(TEXT_BLOCK textSegment+) ; textSegment : NEWLINE white* | ~(HASH|WS|COMMENT|NEWLINE) (~NEWLINE)* NEWLINE white* ; directiveBlock : directive whiteBlock? ; directive : HASH! white!* directiveSuffix ; directiveSuffix : macrodef | macroundef | includeline | pragmaline | errorline | lineline | ifdefblock | ifblock | ifndefblock | nondirective ; /* A nondirective is any line starting with # that * doesn't fall into one of the ordinary directive * forms. */ nondirective : t+=not_directive t+=wpptoken* NEWLINE -> ^(HASH $t+) | NEWLINE -> ^(HASH) ; /* A function-like or object-like macro definition. */ macrodef : DEFINE white+ i=identifier ( paramlist macrobody -> ^(DEFINE $i paramlist macrobody) | NEWLINE -> ^(DEFINE $i ^(BODY)) | white macrobody -> ^(DEFINE $i macrobody) ) ; macrobody : white* ( t+=pptoken (t+=wpptoken* t+=pptoken)? white* NEWLINE -> ^(BODY $t+) | NEWLINE -> ^(BODY) ) ; paramlist : LPAREN white* ( RPAREN -> ^(PARAMLIST) | ELLIPSIS white* RPAREN -> ^(PARAMLIST ELLIPSIS) | identifier (white* COMMA white* identifier)* white* ( RPAREN -> ^(PARAMLIST identifier+) | COMMA white* ELLIPSIS white* RPAREN -> ^(PARAMLIST identifier+ ELLIPSIS) ) ) ; macroundef : UNDEF white+ identifier white* NEWLINE -> ^(UNDEF identifier) ; includeline : INCLUDE white* t+=pptoken (t+=wpptoken* t+=pptoken)? white* NEWLINE -> ^(INCLUDE $t+) ; pragmaline : PRAGMA{$PRAGMA.setType(PPRAGMA);} wpptoken* NEWLINE -> ^(PRAGMA wpptoken* NEWLINE) ; errorline : ERROR wpptoken* NEWLINE -> ^(ERROR wpptoken*) ; lineline : LINE wpptoken* NEWLINE -> ^(LINE wpptoken*) ; /* #ifdef X ... #elif ... #elif ... #else ... #endif. * Tree: * (IFDEF identifier ^(SEQUENCE item*)), or * (IFDEF identifier ^(SEQUENCE item*) elseblock) */ ifdefblock : IFDEF white* i=identifier white* NEWLINE t=if_section f=if_suffix -> ^(IFDEF $i ^(SEQUENCE $t?) $f?) ; /* Exactly like above, except with #ifndef instead of #ifdef */ ifndefblock : IFNDEF white* i=identifier white* NEWLINE t=if_section f=if_suffix -> ^(IFNDEF $i ^(SEQUENCE $t?) $f?) ; /* #if expr ... #elif ... #elif ... #else ... #endif. * Very similar to #ifdef, but with an expression in place * of an identifier. */ ifblock : IF{$IF.setType(PIF);} white* e=expr white* NEWLINE t=if_section f=if_suffix -> ^(IF $e ^(SEQUENCE $t?) $f?) ; /* A section of a conditional directive. * Begins just after the line containing * one of #ifdef, #ifndef, #if, #elif, * or #else. * Ends with the HASH white* * immediately preceding the first matching * endif, elif, or else. */ if_section : whiteBlock? section_body ; /* Begins with first non-white token on a line inside a * conditional section, * ends with the HASH white* immediately preceding the * endif, elif, or else closing that section. * Tree is just flat * list of TEXT_BLOCKs and directives. */ section_body : textBlock? subsection ; /* Begins with a # at beginning of a line (after possible * white space) inside a conditional directive body. * Ends with the HASH white* immediately preceding * the closing endif, elif, or else. Tree is just * flat list of TEXT_BLOCKs and directives. */ subsection : HASH! white!* ( directiveSuffix whiteBlock? section_body)? ; /* Begins with endif, elif, or else. Ends with NEWLINE after * closing #endif. * Tree: one of * 1. empty * 2. (ELIF (ELIF expr (SEQUENCE items) elseblock?)) * 3. (ELSE items) * respectively. The reason for #2 is to make the tree * for a #elif... look the same as what would be obtained from * #else #if .... The first ELIF * should be interpreted as ELSE and the second as IF. */ if_suffix : ENDIF white* NEWLINE -> | c=ELIF white* expr white* NEWLINE if_section if_suffix -> ^($c ^($c expr ^(SEQUENCE if_section?) if_suffix?)) | ELSE{$ELSE.setType(PELSE);} white* NEWLINE if_section ENDIF white* NEWLINE -> ^(ELSE if_section?) ; /* A space, tab, or comment */ white : WS | COMMENT ; /* A preprocessor token or white space token (but not NEWLINE). */ wpptoken : pptoken | white ; /* An expression that can be used with #if or #elif. * This grammar will accept just about anything here. */ expr : ppdExpr (white* ppdExpr)* -> ^(EXPR ppdExpr+) ; definedExpr : DEFINED white!* ( identifier | LPAREN! white!* identifier white!* RPAREN! ) ; /* A preprocessor token or defined expressions. These are the * things that can occur in an #if or #elif directive: */ ppdExpr : (DEFINED)=> definedExpr | pptoken ; /* A "preprocessor token" as defined in the C11 Standard. * This rule includes all of the extensions from the other * languages too. We got rid of header names because * those are composed of smaller tokens in our lexer. */ pptoken : identifier | pp_number | CHARACTER_CONSTANT | STRING_LITERAL | punctuator | OTHER ; /* Any token that is not a preprocessor keyword */ not_directive : pp_number | CHARACTER_CONSTANT | STRING_LITERAL | punctuator | OTHER | IDENTIFIER | EXTENDED_IDENTIFIER ; /* An "identifier" for the preprocessor is an IDENTIFIER * or any of the reserved words from any of the languages */ identifier : IDENTIFIER | EXTENDED_IDENTIFIER | pp_keyword ; /* C and preprocessor keywords: */ /* Words that are used in both C and the preprocessor */ c_pp_keyword : IF | ELSE ; /* Words used in preprocessor but not in C */ pp_notc_keyword : DEFINE | DEFINED | ELIF | ENDIF | ERROR | IFDEF | IFNDEF | INCLUDE | LINE | PRAGMA | UNDEF ; /* Words used in preprocessor */ pp_keyword : pp_notc_keyword | c_pp_keyword ; /* a "pp_number" is any PP_NUMBER, INTEGER_CONSTANT, or FLOATING_CONSTANT */ pp_number : INTEGER_CONSTANT | FLOATING_CONSTANT | PP_NUMBER ; /* The punctuators are the symbols which are not words. * These are punctuators from all languages: */ punctuator : c_punctuator | civl_punctuator | cuda_punctuator ; /* C punctuators: */ c_punctuator : AMPERSAND | AND | ARROW | ASSIGN | BITANDEQ | BITOR | BITOREQ | BITXOR | BITXOREQ | COLON | COMMA | DIV | DIVEQ | ELLIPSIS | DOTDOT | DOT | EQUALS | GT | GTE | HASH | HASHHASH | LCURLY | LPAREN | LSQUARE | LT | LTE | MINUSMINUS | MOD | MODEQ | NEQ | NOT | OR | PLUS | PLUSEQ | PLUSPLUS | QMARK | RCURLY | RPAREN | RSQUARE | SEMI | SHIFTLEFT | SHIFTLEFTEQ | SHIFTRIGHT | SHIFTRIGHTEQ | STAR | STAREQ | SUB | SUBEQ | TILDE ; civl_punctuator : ANNOTATION_END | ANNOTATION_START | AT | EQUIV_ACSL | IMPLIES | IMPLIES_ACSL | INLINE_ANNOTATION_START | LSLIST | RSLIST | XOR_ACSL ; cuda_punctuator : LEXCON | REXCON ;