lexer grammar PreprocessorLexer;

/*
 * Author: Stephen F. Siegel, University of Delaware
 * Last changed: June 2012
 *
 * This is a grammar for lexical analysis of a preprocessor
 * file.  It follows the C11 Standard.  This grammar assumes
 * that the stream of characters being scanned has already
 * gone through translation phases 1 and 2.  In particular,
 * backslash followed by newline sequences have been removed.
 *
 * This lexer grammar does not contain the C keywords, nor the
 * keywords of the CIVL-C, ACSL, GNU and CUDA extensions of C
 * (the only exceptions are `if` and `else`, which are also used
 * by the preprocessor).  Those keywords are defined in
 * dev.civl.abc.front.c.parse.PP2CivlcTokenCConverter
 * A private function named `initCKeywordMap` shall identify
 * those tokens in the preprocessed stream and set them to their
 * corresponding token types.
 */

@header {
package dev.civl.abc.front.c.preproc;
}

@members {

    /*
     * Are we currently parsing ACSL annotations?  If yes, the comments
     * that begin with '@' will be parsed as a sequence of ordinary
     * preprocessor tokens.  If no, they will be parsed as ordinary
     * comments, i.e., as a single token consisting of one big string.
     * This option is controlled by the presence of the
     * #pragma CIVL ACSL in the source file.
     */
    public boolean parseAnnotations = false;

    /*
     * Current state of a small DFA looking for "#pragma CIVL ACSL", which
     * informs the lexer to start scanning annotations as preprocessor
     * tokens rather than as one big comment.  The words CIVL and ACSL are
     * matched without regard to letter case.  Start state: 1.
     *
     * 0: waiting for NEWLINE: anything other than NEWLINE self-loops.
     *    on NEWLINE goto 1.
     * 1: waiting for #: white space (including NEWLINE) self-loops,
     *    on # goto 2.  anything else: goto 0.
     * 2: waiting for pragma: non-NEWLINE white space self-loops.
     *    on NEWLINE: goto 1.  on pragma: goto 3.
     *    on anything else: goto 0.
     * 3: waiting for CIVL: non-NEWLINE white space self-loops.
     *    on NEWLINE: goto 1.  on CIVL: goto 4.
     *    on anything else: goto 0.
     * 4: waiting for ACSL: non-NEWLINE white space self-loops.
     *    on NEWLINE: goto 1.
     *    on ACSL: BINGO.  set parseAnnotations to true.  goto 0.
     *    on anything else: goto 0.
     */
    private int annoteState = 1;

    /*
     * Turns every lexical error report into an unchecked exception
     * carrying the message, instead of attempting recovery.
     */
    @Override
    public void emitErrorMessage(String msg) {
        // do not try to recover!
        throw new RuntimeException(msg);
    }

    /*
     * Emits the token, first re-typing it when annotation parsing is on:
     * a COMMENT token whose whole text is just an annotation opener is
     * given type ANNOTATION_START (block form) or INLINE_ANNOTATION_START
     * (line form).  All other tokens are passed through unchanged.
     */
    @Override
    public void emit(Token token) {
        if (parseAnnotations && token.getType() == COMMENT) {
            String text = token.getText();

            if ("/*@".equals(text)) {
                token.setType(ANNOTATION_START);
            } else if ("//@".equals(text)) {
                token.setType(INLINE_ANNOTATION_START);
            }
        }
        super.emit(token);
    }

    /*
     * Returns the next token and, as long as parseAnnotations is false,
     * feeds its type to the state machine described on annoteState.
     * As soon as the sequence #pragma CIVL ACSL is detected,
     * parseAnnotations is set to true.  This causes annotations to be
     * parsed as preprocessor tokens rather than as text (as a normal
     * comment would be).
     */
    @Override
    public Token nextToken() {
        Token token = super.nextToken();

        if (parseAnnotations) {
            // pragma already seen: nothing left to look for
            return token;
        }

        int type = token.getType();

        switch (annoteState) {
        case 0: // skipping the rest of an uninteresting line
            if (type == NEWLINE) {
                annoteState = 1;
            }
            break;
        case 1: // at beginning of line.  this is the start state.
            if (type == HASH) {
                annoteState = 2;
            } else if (type != NEWLINE && type != WS) {
                annoteState = 0;
            }
            break;
        case 2: // seen #, waiting for pragma
            if (type == NEWLINE) {
                annoteState = 1;
            } else if (type == PRAGMA) {
                annoteState = 3;
            } else if (type != WS) {
                annoteState = 0;
            }
            break;
        case 3: // seen # pragma, waiting for CIVL
            if (type == NEWLINE) {
                annoteState = 1;
            } else if (type == IDENTIFIER
                    // equalsIgnoreCase does not depend on the default
                    // locale, unlike toUpperCase()
                    && "CIVL".equalsIgnoreCase(token.getText())) {
                annoteState = 4;
            } else if (type != WS) {
                annoteState = 0;
            }
            break;
        case 4: // seen # pragma CIVL, waiting for ACSL
            if (type == NEWLINE) {
                annoteState = 1;
            } else if (type == IDENTIFIER
                    && "ACSL".equalsIgnoreCase(token.getText())) {
                parseAnnotations = true;
                // as documented on annoteState: leave the machine in
                // state 0 rather than parked in state 4
                annoteState = 0;
            } else if (type != WS) {
                annoteState = 0;
            }
            break;
        default:
            assert false; // unreachable
        }
        return token;
    }
}

/****** White space ******/

NEWLINE         :   '\r'? '\n' ;
WS              :   ' ' | '\t' ;

/* Words that are used in both C and the preprocessor */

IF              :   'if' ;
ELSE            :   'else' ;

/* Words used in preprocessor but not in C */

DEFINE          :   'define' ;
DEFINED         :   'defined' ;
ELIF            :   'elif' ;
ENDIF           :   'endif' ;
ERROR           :   'error' ;
IFDEF           :   'ifdef' ;
IFNDEF          :   'ifndef' ;
INCLUDE         :   'include' ;
LINE            :   'line' ;
PRAGMA          :   'pragma' ;
UNDEF           :   'undef' ;

/****** Punctuators: C11 Sec. 6.4.6 ******/

ELLIPSIS        :   '...' ;
DOTDOT          :   '..' ;
DOT             :   '.' ;
AMPERSAND       :   '&' ;
AND             :   '&&' ;
ARROW           :   '->' ;
ASSIGN          :   '=' ;
BITANDEQ        :   '&=' ;
BITOR           :   '|' ;
BITOREQ         :   '|=' ;
BITXOR          :   '^' ;
BITXOREQ        :   '^=' ;
COLON           :   ':' ;
COMMA           :   ',' ;
DIV             :   '/' ;
DIVEQ           :   '/=' ;
EQUALS          :   '==' ;
GT              :   '>' ;
GTE             :   '>=' ;
HASH            :   '#' | '%:' ;
HASHHASH        :   '##' | '%:%:' ;
LCURLY          :   '{' | '<%' ;
LPAREN          :   '(' ;
LSQUARE         :   '[' | '<:' ;
LT              :   '<' ;
LTE             :   '<=' ;
MINUSMINUS      :   '--' ;
MOD             :   '%' ;
MODEQ           :   '%=' ;
NEQ             :   '!=' ;
NOT             :   '!' ;
OR              :   '||' ;
PLUS            :   '+' ;
PLUSEQ          :   '+=' ;
PLUSPLUS        :   '++' ;
QMARK           :   '?' ;
RCURLY          :   '}' | '%>' ;
RPAREN          :   ')' ;
RSQUARE         :   ']' | ':>' ;
SEMI            :   ';' ;
SHIFTLEFT       :   '<<' ;
SHIFTLEFTEQ     :   '<<=' ;
SHIFTRIGHT      :   '>>' ;
SHIFTRIGHTEQ    :   '>>=' ;
STAR            :   '*' ;
STAREQ          :   '*=' ;
SUB             :   '-' ;
SUBEQ           :   '-=' ;
TILDE           :   '~' ;

/* CIVL-C and ACSL Punctuators */

AT              :   '@' ;
EQUIV_ACSL      :   '<==>' ;
IMPLIES         :   '=>' ;
IMPLIES_ACSL    :   '==>' ;
// LSLIST and RSLIST enclose a scope list
LSLIST          :   '<|' ;
RSLIST          :   '|>' ;
XOR_ACSL        :   '^^' ;

/* CUDA Punctuators */

LEXCON          :   '<<<' ;
REXCON          :   '>>>' ;

/****** Identifiers: C11 Sec. 6.4.2 ******/

IDENTIFIER
    :   IdentifierNonDigit (IdentifierNonDigit | Digit)*
    ;

fragment
IdentifierNonDigit
    :   NonDigit | UniversalCharacterName
    ;

fragment
Zero
    :   '0'
    ;

fragment
Digit
    :   Zero | NonZeroDigit
    ;

fragment
NonZeroDigit
    :   '1' .. '9'
    ;

fragment
NonDigit
    :   'A'..'Z' | 'a'..'z' | '_' | '$'
    ;

fragment
UniversalCharacterName
    :   '\\' 'u' HexQuad
    |   '\\' 'U' HexQuad HexQuad
    ;

fragment
HexQuad
    :   HexadecimalDigit HexadecimalDigit HexadecimalDigit HexadecimalDigit
    ;

fragment
HexadecimalDigit
    :   '0'..'9' | 'a'..'f' | 'A'..'F'
    ;

/****** Sec. 6.4.4.1: Integer constants ******/

/* Exactly one optional suffix is allowed, and it is attached here;
 * the Decimal/Octal/Hexadecimal fragments below describe the digits
 * only, as in the C11 grammar. */
INTEGER_CONSTANT
    :   DecimalConstant IntegerSuffix?
    |   OctalConstant IntegerSuffix?
    |   HexadecimalConstant IntegerSuffix?
    ;

fragment
DecimalConstant
    :   NonZeroDigit Digit*
    ;

fragment
IntegerSuffix
    :   UnsignedSuffix LongSuffix?
    |   UnsignedSuffix LongLongSuffix
    |   LongSuffix UnsignedSuffix?
    |   LongLongSuffix UnsignedSuffix?
    ;

fragment
UnsignedSuffix
    :   'u' | 'U'
    ;

fragment
LongSuffix
    :   'l' | 'L'
    ;

fragment
LongLongSuffix
    :   'll' | 'LL'
    ;

/* No suffix here: INTEGER_CONSTANT appends it.  Carrying a second
 * optional suffix in this fragment would accept spellings with two
 * suffixes in a row. */
fragment
OctalConstant
    :   Zero OctalDigit*
    ;

/* No suffix here either, for the same reason as OctalConstant. */
fragment
HexadecimalConstant
    :   HexPrefix HexadecimalDigit+
    ;

fragment
HexPrefix
    :   Zero ('x' | 'X')
    ;

/****** Sec. 6.4.4.2: Floating Constants ******/

FLOATING_CONSTANT
    :   DecimalFloatingConstant
    |   HexadecimalFloatingConstant
    ;

fragment
DecimalFloatingConstant
    :   FractionalConstant ExponentPart? FloatingSuffix?
    |   Digit+ ExponentPart FloatingSuffix?
    ;

fragment
FractionalConstant
    :   Digit* DOT Digit+
    |   Digit+ DOT
    ;

fragment
ExponentPart
    :   ('e' | 'E') ('+' | '-')? Digit+
    ;

fragment
FloatingSuffix
    :   'f' | 'l' | 'F' | 'L'
    ;

fragment
HexadecimalFloatingConstant
    :   HexPrefix HexFractionalConstant BinaryExponentPart FloatingSuffix?
    |   HexPrefix HexadecimalDigit+ BinaryExponentPart FloatingSuffix?
    ;

fragment
HexFractionalConstant
    :   HexadecimalDigit* DOT HexadecimalDigit+
    |   HexadecimalDigit+ DOT
    ;

fragment
BinaryExponentPart
    :   ('p' | 'P') ('+' | '-')? Digit+
    ;

/****** Preprocessing Numbers: C11 Sec 6.4.8 ******/

/* PP_NUMBER should be anything that doesn't match the previous
 * rules but does match this one.
 */
PP_NUMBER
    :   '.'? Digit
        (   '.'
        |   IdentifierNonDigit
        |   Digit
        |   ('e' | 'E' | 'p' | 'P') ('+' | '-')
        )*
    ;

/****** Sec. 6.4.4.4: Character Constants ******/

CHARACTER_CONSTANT
    :   ('L' | 'U' | 'u')? '\'' CChar+ '\''
    ;

fragment
CChar
    :   ~('\'' | '\\' | '\n') | EscapeSequence
    ;

/* Simple, octal and hexadecimal escapes, plus universal character
 * names, which C11 also lists as an escape sequence. */
fragment
EscapeSequence
    :   '\\' (   '\'' | '"' | '\?' | '\\'
             |   'a' | 'b' | 'f' | 'n' |'r' | 't' | 'v'
             )
    |   OctalEscape
    |   HexEscape
    |   UniversalCharacterName
    ;

fragment
OctalEscape
    :   '\\' OctalDigit (OctalDigit OctalDigit?)?
    ;

fragment
OctalDigit
    :   '0' .. '7'
    ;

fragment
HexEscape
    :   '\\' 'x' HexadecimalDigit+
    ;

/****** 6.4.5: String Literals *****/

STRING_LITERAL
    :   ('u8' | 'u' | 'U' | 'L')? '"' SChar* '"'
    ;

fragment
SChar
    :   ~('"' | '\\' | '\n') | EscapeSequence
    ;

/* ***** Comments: C11 Sec 6.4.9 ******/

// The inline comment rule is not quite perfect because in the case of
// the \n or \r immediately following the // it counts that white space
// as part of the comment, otherwise it doesn't.  Would like to make the
// \n or \r NOT part of the comment always, but how --- need to look
// ahead one character?
fragment
INLINE_COMMENT
    :   '//' INLINE_COMMENT_TAIL
    ;

// What may follow the two slashes.  A tail starting with '@' is the
// rest of the line when annotations are not being parsed, and just the
// '@' itself when they are.
fragment
INLINE_COMMENT_TAIL
    :   NEWLINE
    |   EOF
    |   ~('@' | '\n' | '\r') ( options {greedy=true;} : ~('\n'|'\r') )*
    |   {!parseAnnotations}?=> '@' ( options {greedy=true;} : ~('\n'|'\r') )*
    |   {parseAnnotations}?=> '@'
    ;

// the following rule is never activated but no problem, we capture the token
// in INLINE_COMMENT and then change the token type in emit()...
INLINE_ANNOTATION_START
    :   '//@'
    ;

fragment
BLOCK_COMMENT
    :   '/*' BLOCK_COMMENT_TAIL
    ;

// What may follow the comment opener.  A tail starting with '@' runs to
// the comment terminator when annotations are not being parsed, and is
// just the '@' itself when they are.
fragment
BLOCK_COMMENT_TAIL
    :   '*/'
    |   ~('@') ( options {greedy=false;} : . )* '*/'
    |   {!parseAnnotations}?=> '@' ( options {greedy=false;} : . )* '*/'
    |   {parseAnnotations}?=> '@'
    ;

COMMENT
    :   INLINE_COMMENT
    |   BLOCK_COMMENT
    ;

// For some reason, ANNOTATION_START is never invoked.  No problem,
// we will catch it on emit as a COMMENT and change its type.
ANNOTATION_START
    :   {parseAnnotations}?=> '/*' '@'
    ;

ANNOTATION_END
    :   {parseAnnotations}?=> '*/'
    ;

/* Special keywords starting with backslash reserved for extensions
 * such as ACSL */
EXTENDED_IDENTIFIER
    :   '\\' IdentifierNonDigit (IdentifierNonDigit | Digit)*
    ;

/****** Other characters: C11 Sec. 6.4 ******/

OTHER
    :   .
    ;