| [aad342c] | 1 | lexer grammar PreprocessorLexer;
|
|---|
| 2 |
|
|---|
| 3 | /*
|
|---|
| 4 | * Author: Stephen F. Siegel, University of Delaware
|
|---|
| 5 | * Last changed: June 2012
|
|---|
| 6 | *
|
|---|
| 7 | * This is a grammar for lexical analysis for a preprocessor
|
|---|
| 8 | * file. It follows the C11 Standard. This grammar assumes
|
|---|
| 9 | * that the stream of characters being scanned has already
|
|---|
| 10 | * gone through translation phases 1 and 2. In particular
|
|---|
| 11 | * backslash followed by newline sequences have been removed.
|
|---|
| 12 | *
|
|---|
| 13 | * This lexer grammar does not define the C keywords or those of the
|
|---|
| 14 | * CIVL-C, ACSL, GNU and CUDA extensions of C.
|
|---|
| 15 | * Those keywords are defined in
|
|---|
| 16 | * dev.civl.abc.front.c.parse.PP2CivlcTokenCConverter
|
|---|
| 17 | * A private function named `initCKeywordMap` identifies those
|
|---|
| [3ce9d20] | 18 | * keywords in preprocessed token streams and sets their
|
|---|
| [aad342c] | 19 | * corresponding token types.
|
|---|
| 20 | */
|
|---|
| 21 |
|
|---|
| 22 | @header
|
|---|
| 23 | {
|
|---|
| 24 | package dev.civl.abc.front.c.preproc;
|
|---|
| 25 | }
|
|---|
| 26 |
|
|---|
| 27 | @members
|
|---|
| 28 | {
|
|---|
| [3ce9d20] | 29 |
|
|---|
| 30 | /* Are we currently parsing ACSL annotations? If yes, the comments that
|
|---|
| 31 | begin with '@' will be parsed as a sequence of ordinary preprocessor
|
|---|
| 32 | tokens. If no, they will be parsed as ordinary comments, i.e., as a
|
|---|
| 33 | single token consisting of one big string. This option is controlled
|
|---|
| 34 | by the presence of the #pragma CIVL ACSL in the source file. */
|
|---|
| 35 | public boolean parseAnnotations = false;
|
|---|
| 36 |
|
|---|
| 37 | /* States in a DFA looking for "#pragma CIVL ACSL" which informs the
|
|---|
| 38 | lexer to start scanning annotations as preprocessor tokens rather
|
|---|
| 39 | than as one big comment. Start state: 1.
|
|---|
| 40 |
|
|---|
| 41 | 0: waiting for NEWLINE: anything other than NEWLINE self loops.
|
|---|
| 42 | on NEWLINE goto 1.
|
|---|
| 43 | 1: waiting for #: whitespace self-loops,
|
|---|
| 44 | on # goto 2.
|
|---|
| 45 | anything else: goto 0.
|
|---|
| 46 | 2: waiting for pragma: non-NEWLINE white space self-loops.
|
|---|
| 47 | on NEWLINE: goto 1.
|
|---|
| 48 | on pragma: goto 3.
|
|---|
| 49 | on anything else: goto 0
|
|---|
| 50 | 3: waiting for CIVL: non-NEWLINE white space self-loops.
|
|---|
| 51 | on NEWLINE: goto 1.
|
|---|
| 52 | on CIVL: goto 4
|
|---|
| 53 | on anything else: goto 0
|
|---|
| 54 | 4: waiting for ACSL: non-NEWLINE white space self-loops.
|
|---|
| 55 | on NEWLINE: goto 1
|
|---|
| 56 | on ACSL: BINGO. set parseAnnotations to true; the state is not consulted afterwards.
|
|---|
| 57 | on anything else: goto 0.
|
|---|
| 58 | */
|
|---|
| 59 | private int annoteState = 1;
|
|---|
| 60 |
|
|---|
| [aad342c] | 61 | @Override
|
|---|
| 62 | public void emitErrorMessage(String msg) { // fail fast: every reported error message is rethrown as an unchecked exception; don't try to recover!
|
|---|
| 63 | throw new RuntimeException(msg);
|
|---|
| 64 | }
|
|---|
| [3ce9d20] | 65 | // While parseAnnotations is set, a COMMENT token whose text is exactly "/*@" or "//@" is retyped as ANNOTATION_START or INLINE_ANNOTATION_START; every token is then handed on to super.emit.
|
|---|
| 66 | @Override
|
|---|
| 67 | public void emit(Token token) {
|
|---|
| 68 | if (parseAnnotations && token.getType() == COMMENT) { // only annotation openers arrive here as COMMENT tokens that need retyping
|
|---|
| 69 | String text = token.getText();
|
|---|
| 70 | if ("/*@".equals(text))
|
|---|
| 71 | token.setType(ANNOTATION_START);
|
|---|
| 72 | else if ("//@".equals(text))
|
|---|
| 73 | token.setType(INLINE_ANNOTATION_START);
|
|---|
| 74 | }
|
|---|
| 75 | super.emit(token);
|
|---|
| 76 | //System.out.println("Token: "+token); // DEBUGGING....
|
|---|
| 77 | }
|
|---|
| 78 |
|
|---|
| 79 | /* Watches the token stream for the sequence #pragma CIVL ACSL (the words
|
|---|
| 80 | CIVL and ACSL are compared case-insensitively). As soon as it is detected,
|
|---|
| 81 | parseAnnotations is set to true. This causes annotations to be parsed as
|
|---|
| 82 | preprocessor tokens rather than as text (as a normal comment would be). */
|
|---|
| 83 | @Override
|
|---|
| 84 | public Token nextToken() {
|
|---|
| 85 | Token token = super.nextToken();
|
|---|
| 86 | if (parseAnnotations) // pragma already seen: nothing left to track
|
|---|
| 87 | return token;
|
|---|
| 88 | int type = token.getType();
|
|---|
| 89 | switch (annoteState) {
|
|---|
| 90 | case 0: // somewhere inside a line: wait for the next NEWLINE
|
|---|
| 91 | if (type == NEWLINE) annoteState = 1;
|
|---|
| 92 | break;
|
|---|
| 93 | case 1: // at beginning of line. this is the start state.
|
|---|
| 94 | if (type == HASH) annoteState = 2;
|
|---|
| 95 | else if (type != NEWLINE && type != WS) annoteState = 0;
|
|---|
| 96 | break;
|
|---|
| 97 | case 2: // seen '#': waiting for 'pragma'
|
|---|
| 98 | if (type == NEWLINE) annoteState = 1;
|
|---|
| 99 | else if (type == PRAGMA) annoteState = 3;
|
|---|
| 100 | else if (type != WS) annoteState = 0;
|
|---|
| 101 | break;
|
|---|
| 102 | case 3: // seen '#pragma': waiting for CIVL
|
|---|
| 103 | if (type == NEWLINE) annoteState = 1;
|
|---|
| 104 | else if (type == IDENTIFIER &&
|
|---|
| 105 | "CIVL".equalsIgnoreCase(token.getText())) // locale-independent, unlike toUpperCase()
|
|---|
| 106 | annoteState = 4;
|
|---|
| 107 | else if (type != WS) annoteState = 0;
|
|---|
| 108 | break;
|
|---|
| 109 | case 4: // seen '#pragma CIVL': waiting for ACSL
|
|---|
| 110 | if (type == NEWLINE) annoteState = 1;
|
|---|
| 111 | else if (type == IDENTIFIER &&
|
|---|
| 112 | "ACSL".equalsIgnoreCase(token.getText())) { // locale-independent, unlike toUpperCase()
|
|---|
| 113 | parseAnnotations = true;
|
|---|
| 114 | // annoteState is left unchanged: the early return above fires on every later call
|
|---|
| 115 | }
|
|---|
| 116 | else if (type != WS) annoteState = 0;
|
|---|
| 117 | break;
|
|---|
| 118 | default:
|
|---|
| 119 | assert false; // unreachable
|
|---|
| 120 | }
|
|---|
| 121 | return token;
|
|---|
| 122 | }
|
|---|
| 123 |
|
|---|
| [aad342c] | 124 | }
|
|---|
| 125 |
|
|---|
| 126 | /****** White space ******/
|
|---|
| 127 | NEWLINE : '\r'? '\n' ;
|
|---|
| 128 | WS : ' ' | '\t' ;
|
|---|
| 129 |
|
|---|
| 130 | /* Words that are used in both C and the preprocessor */
|
|---|
| 131 | IF : 'if' ;
|
|---|
| 132 | ELSE : 'else' ;
|
|---|
| 133 |
|
|---|
| 134 | /* Words used in preprocessor but not in C */
|
|---|
| 135 | DEFINE : 'define' ;
|
|---|
| 136 | DEFINED : 'defined' ;
|
|---|
| 137 | ELIF : 'elif' ;
|
|---|
| 138 | ENDIF : 'endif' ;
|
|---|
| 139 | ERROR : 'error' ;
|
|---|
| 140 | IFDEF : 'ifdef' ;
|
|---|
| 141 | IFNDEF : 'ifndef' ;
|
|---|
| 142 | INCLUDE : 'include' ;
|
|---|
| 143 | LINE : 'line' ;
|
|---|
| 144 | PRAGMA : 'pragma' ;
|
|---|
| 145 | UNDEF : 'undef' ;
|
|---|
| 146 |
|
|---|
| 147 | /****** Punctuators: C11 Sec. 6.4.6 ******/
|
|---|
| 148 | ELLIPSIS : '...' ;
|
|---|
| 149 | DOTDOT : '..' ;
|
|---|
| 150 | DOT : '.' ;
|
|---|
| 151 | AMPERSAND : '&' ;
|
|---|
| 152 | AND : '&&' ;
|
|---|
| 153 | ARROW : '->' ;
|
|---|
| 154 | ASSIGN : '=' ;
|
|---|
| 155 | BITANDEQ : '&=' ;
|
|---|
| 156 | BITOR : '|' ;
|
|---|
| 157 | BITOREQ : '|=' ;
|
|---|
| 158 | BITXOR : '^' ;
|
|---|
| 159 | BITXOREQ : '^=' ;
|
|---|
| 160 | COLON : ':' ;
|
|---|
| 161 | COMMA : ',' ;
|
|---|
| 162 | DIV : '/' ;
|
|---|
| 163 | DIVEQ : '/=' ;
|
|---|
| 164 | EQUALS : '==' ;
|
|---|
| 165 | GT : '>' ;
|
|---|
| 166 | GTE : '>=' ;
|
|---|
| 167 | HASH : '#' | '%:' ;
|
|---|
| 168 | HASHHASH : '##' | '%:%:' ;
|
|---|
| 169 | LCURLY : '{' | '<%' ;
|
|---|
| 170 | LPAREN : '(' ;
|
|---|
| 171 | LSQUARE : '[' | '<:' ;
|
|---|
| 172 | LT : '<' ;
|
|---|
| 173 | LTE : '<=' ;
|
|---|
| 174 | MINUSMINUS : '--' ;
|
|---|
| 175 | MOD : '%' ;
|
|---|
| 176 | MODEQ : '%=' ;
|
|---|
| 177 | NEQ : '!=' ;
|
|---|
| 178 | NOT : '!' ;
|
|---|
| 179 | OR : '||' ;
|
|---|
| 180 | PLUS : '+' ;
|
|---|
| 181 | PLUSEQ : '+=' ;
|
|---|
| 182 | PLUSPLUS : '++' ;
|
|---|
| 183 | QMARK : '?' ;
|
|---|
| 184 | RCURLY : '}' | '%>' ;
|
|---|
| 185 | RPAREN : ')' ;
|
|---|
| 186 | RSQUARE : ']' | ':>' ;
|
|---|
| 187 | SEMI : ';' ;
|
|---|
| 188 | SHIFTLEFT : '<<' ;
|
|---|
| 189 | SHIFTLEFTEQ : '<<=' ;
|
|---|
| 190 | SHIFTRIGHT : '>>' ;
|
|---|
| 191 | SHIFTRIGHTEQ : '>>=' ;
|
|---|
| 192 | STAR : '*' ;
|
|---|
| 193 | STAREQ : '*=' ;
|
|---|
| 194 | SUB : '-' ;
|
|---|
| 195 | SUBEQ : '-=' ;
|
|---|
| 196 | TILDE : '~' ;
|
|---|
| 197 |
|
|---|
| 198 | /* CIVL-C and ACSL Punctuators */
|
|---|
| [3ce9d20] | 199 |
|
|---|
| 200 |
|
|---|
| [aad342c] | 201 | AT : '@' ;
|
|---|
| 202 | EQUIV_ACSL : '<==>' ;
|
|---|
| 203 | IMPLIES : '=>' ;
|
|---|
| 204 | IMPLIES_ACSL : '==>' ;
|
|---|
| 205 | // LSLIST and RSLIST enclose a scope list
|
|---|
| 206 | LSLIST : '<|' ;
|
|---|
| 207 | RSLIST : '|>' ;
|
|---|
| 208 | XOR_ACSL : '^^' ;
|
|---|
| 209 |
|
|---|
| 210 | /* CUDA Punctuators */
|
|---|
| 211 | LEXCON : '<<<' ;
|
|---|
| 212 | REXCON : '>>>' ;
|
|---|
| 213 |
|
|---|
| [3ce9d20] | 214 |
|
|---|
| [aad342c] | 215 | /****** Identifiers: C11 Sec. 6.4.2 ******/
|
|---|
| 216 | IDENTIFIER : IdentifierNonDigit
|
|---|
| 217 | (IdentifierNonDigit | Digit)*
|
|---|
| 218 | ;
|
|---|
| 219 |
|
|---|
| 220 | fragment
|
|---|
| 221 | IdentifierNonDigit
|
|---|
| 222 | : NonDigit | UniversalCharacterName ;
|
|---|
| 223 |
|
|---|
| 224 | fragment
|
|---|
| 225 | Zero : '0' ;
|
|---|
| 226 |
|
|---|
| 227 | fragment
|
|---|
| 228 | Digit : Zero | NonZeroDigit ;
|
|---|
| 229 |
|
|---|
| 230 | fragment
|
|---|
| 231 | NonZeroDigit : '1' .. '9' ;
|
|---|
| 232 |
|
|---|
| 233 | fragment
|
|---|
| 234 | NonDigit : 'A'..'Z' | 'a'..'z' | '_' | '$';
|
|---|
| 235 |
|
|---|
| 236 | fragment
|
|---|
| 237 | UniversalCharacterName
|
|---|
| 238 | : '\\' 'u' HexQuad
|
|---|
| 239 | | '\\' 'U' HexQuad HexQuad
|
|---|
| 240 | ;
|
|---|
| 241 |
|
|---|
| 242 | fragment
|
|---|
| 243 | HexQuad : HexadecimalDigit HexadecimalDigit HexadecimalDigit HexadecimalDigit ;
|
|---|
| 244 |
|
|---|
| 245 | fragment
|
|---|
| 246 | HexadecimalDigit
|
|---|
| 247 | : '0'..'9' | 'a'..'f' | 'A'..'F' ;
|
|---|
| 248 |
|
|---|
| 249 | /****** Sec. 6.4.4.1: Integer constants ******/
|
|---|
| 250 | INTEGER_CONSTANT
|
|---|
| 251 | : DecimalConstant IntegerSuffix?
|
|---|
| 252 | | OctalConstant IntegerSuffix?
|
|---|
| 253 | | HexadecimalConstant IntegerSuffix?
|
|---|
| 254 | ;
|
|---|
| 255 |
|
|---|
| 256 | fragment
|
|---|
| 257 | DecimalConstant : NonZeroDigit Digit* ;
|
|---|
| 258 |
|
|---|
| 259 |
|
|---|
| 260 | fragment
|
|---|
| 261 | IntegerSuffix : UnsignedSuffix LongSuffix?
|
|---|
| 262 | | UnsignedSuffix LongLongSuffix
|
|---|
| 263 | | LongSuffix UnsignedSuffix?
|
|---|
| 264 | | LongLongSuffix UnsignedSuffix?
|
|---|
| 265 | ;
|
|---|
| 266 |
|
|---|
| 267 | fragment
|
|---|
| 268 | UnsignedSuffix : 'u' | 'U' ;
|
|---|
| 269 |
|
|---|
| 270 | fragment
|
|---|
| 271 | LongSuffix : 'l' | 'L' ;
|
|---|
| 272 |
|
|---|
| 273 | fragment
|
|---|
| 274 | LongLongSuffix : 'll' | 'LL' ;
|
|---|
| 275 |
|
|---|
| 276 | fragment
|
|---|
| 277 | OctalConstant : Zero OctalDigit* ; // C11 6.4.4.1 octal-constant; the optional suffix is matched once, by INTEGER_CONSTANT
|
|---|
| 278 |
|
|---|
| 279 | fragment
|
|---|
| 280 | HexadecimalConstant
|
|---|
| 281 | : HexPrefix HexadecimalDigit+ ; // C11 6.4.4.1 hexadecimal-constant; the optional suffix is matched once, by INTEGER_CONSTANT
|
|---|
| 282 |
|
|---|
| 283 | fragment
|
|---|
| 284 | HexPrefix : Zero ('x' | 'X') ;
|
|---|
| 285 |
|
|---|
| 286 | /****** Sec. 6.4.4.2: Floating Constants ******/
|
|---|
| 287 |
|
|---|
| 288 | FLOATING_CONSTANT
|
|---|
| 289 | : DecimalFloatingConstant
|
|---|
| 290 | | HexadecimalFloatingConstant
|
|---|
| 291 | ;
|
|---|
| 292 |
|
|---|
| 293 | fragment
|
|---|
| 294 | DecimalFloatingConstant
|
|---|
| 295 | : FractionalConstant ExponentPart? FloatingSuffix?
|
|---|
| 296 | | Digit+ ExponentPart FloatingSuffix?
|
|---|
| 297 | ;
|
|---|
| 298 |
|
|---|
| 299 | fragment
|
|---|
| 300 | FractionalConstant
|
|---|
| 301 | : Digit* DOT Digit+
|
|---|
| 302 | | Digit+ DOT
|
|---|
| 303 | ;
|
|---|
| 304 |
|
|---|
| 305 | fragment
|
|---|
| 306 | ExponentPart : ('e' | 'E') ('+' | '-')? Digit+ ;
|
|---|
| 307 |
|
|---|
| 308 | fragment
|
|---|
| 309 | FloatingSuffix : 'f' | 'l' | 'F' | 'L' ;
|
|---|
| 310 |
|
|---|
| 311 | fragment
|
|---|
| 312 | HexadecimalFloatingConstant
|
|---|
| 313 | : HexPrefix HexFractionalConstant BinaryExponentPart
|
|---|
| 314 | FloatingSuffix?
|
|---|
| 315 | | HexPrefix HexadecimalDigit+ BinaryExponentPart
|
|---|
| 316 | FloatingSuffix?
|
|---|
| 317 | ;
|
|---|
| 318 |
|
|---|
| 319 | fragment
|
|---|
| 320 | HexFractionalConstant
|
|---|
| 321 | : HexadecimalDigit* DOT HexadecimalDigit+
|
|---|
| 322 | | HexadecimalDigit+ DOT
|
|---|
| 323 | ;
|
|---|
| 324 |
|
|---|
| 325 | fragment
|
|---|
| 326 | BinaryExponentPart
|
|---|
| 327 | : ('p' | 'P') ('+' | '-')? Digit+ ;
|
|---|
| 328 |
|
|---|
| 329 |
|
|---|
| 330 | /****** Preprocessing Numbers: C11 Sec 6.4.8 ******/
|
|---|
| 331 |
|
|---|
| 332 | /* PP_NUMBER should be anything that doesn't match the previous
|
|---|
| 333 | * rules but does match this one.
|
|---|
| 334 | */
|
|---|
| 335 | PP_NUMBER : '.'? Digit
|
|---|
| 336 | ( '.'
|
|---|
| 337 | | IdentifierNonDigit
|
|---|
| 338 | | Digit
|
|---|
| 339 | | ('e' | 'E' | 'p' | 'P') ('+' | '-')
|
|---|
| 340 | )*
|
|---|
| 341 | ;
|
|---|
| 342 |
|
|---|
| 343 |
|
|---|
| 344 | /****** Sec. 6.4.4.4: Character Constants ******/
|
|---|
| 345 |
|
|---|
| 346 | CHARACTER_CONSTANT
|
|---|
| 347 | : ('L' | 'U' | 'u')? '\'' CChar+ '\'' ;
|
|---|
| 348 |
|
|---|
| 349 | fragment
|
|---|
| 350 | CChar : ~('\'' | '\\' | '\n') | EscapeSequence ;
|
|---|
| 351 |
|
|---|
| 352 | fragment
|
|---|
| 353 | EscapeSequence : '\\' ( '\'' | '"' | '\?' | '\\' |
|
|---|
| 354 | 'a' | 'b' | 'f' | 'n' |'r' | 't' | 'v'
|
|---|
| 355 | )
|
|---|
| 356 | | OctalEscape
|
|---|
| 357 | | HexEscape
|
|---|
| 358 | ;
|
|---|
| 359 | fragment
|
|---|
| 360 | OctalEscape : '\\' OctalDigit (OctalDigit OctalDigit?)? ;
|
|---|
| 361 |
|
|---|
| 362 | fragment
|
|---|
| 363 | OctalDigit : '0' .. '7';
|
|---|
| 364 |
|
|---|
| 365 | fragment
|
|---|
| 366 | HexEscape : '\\' 'x' HexadecimalDigit+ ;
|
|---|
| 367 |
|
|---|
| 368 |
|
|---|
| 369 | /****** 6.4.5: String Literals *****/
|
|---|
| 370 |
|
|---|
| 371 |
|
|---|
| 372 | STRING_LITERAL : ('u8' | 'u' | 'U' | 'L')? '"' SChar* '"'
|
|---|
| 373 | ;
|
|---|
| 374 |
|
|---|
| 375 | fragment
|
|---|
| 376 | SChar : ~('"' | '\\' | '\n') | EscapeSequence ;
|
|---|
| 377 |
|
|---|
| 378 |
|
|---|
| 379 |
|
|---|
| 380 | /* ***** Comments: C11 Sec 6.4.9 ******/
|
|---|
| 381 |
|
|---|
| [3ce9d20] | 382 | fragment
|
|---|
| 383 | INLINE_COMMENT : '//' INLINE_COMMENT_TAIL ;
|
|---|
| 384 |
|
|---|
| 385 | fragment
|
|---|
| 386 | INLINE_COMMENT_TAIL
|
|---|
| 387 | : NEWLINE
|
|---|
| 388 | | EOF
|
|---|
| 389 | | ~('@' | '\n' | '\r') ( options {greedy=true;} : ~('\n'|'\r') )*
|
|---|
| 390 | | {!parseAnnotations}?=> '@' ( options {greedy=true;} : ~('\n'|'\r') )*
|
|---|
| 391 | | {parseAnnotations}?=> '@'
|
|---|
| 392 | ;
|
|---|
| 393 |
|
|---|
| 394 | // the following rule is never activated but no problem, we capture the token
|
|---|
| 395 | // in INLINE_COMMENT and then change the token type in emit()...
|
|---|
| 396 | INLINE_ANNOTATION_START : '//@' ;
|
|---|
| 397 |
|
|---|
| [aad342c] | 398 | // the following is not quite perfect because in the case of the \n or \r
|
|---|
| 399 | // immediately following the // it counts that white space as part of the
|
|---|
| 400 | // comment, otherwise it doesn't. Would like to make the \n or \r NOT
|
|---|
| 401 | // part of the comment always, but how --- need to look ahead one character?
|
|---|
| 402 |
|
|---|
| 403 | fragment
|
|---|
| [3ce9d20] | 404 | BLOCK_COMMENT : '/*' BLOCK_COMMENT_TAIL ;
|
|---|
| 405 |
|
|---|
| 406 | fragment BLOCK_COMMENT_TAIL
|
|---|
| 407 | : '*/'
|
|---|
| 408 | | ~('@') ( options {greedy=false;} : . )* '*/'
|
|---|
| 409 | | {!parseAnnotations}?=> '@' ( options {greedy=false;} : . )* '*/'
|
|---|
| 410 | | {parseAnnotations}?=> '@'
|
|---|
| 411 | ;
|
|---|
| [aad342c] | 412 |
|
|---|
| 413 | COMMENT : INLINE_COMMENT | BLOCK_COMMENT ;
|
|---|
| 414 |
|
|---|
| [3ce9d20] | 415 | // For some reason, ANNOTATION_START is never invoked. No problem,
|
|---|
| 416 | // we will catch it on emit as a COMMENT and change its type.
|
|---|
| 417 | ANNOTATION_START : {parseAnnotations}?=> '/*' '@' ;
|
|---|
| 418 | ANNOTATION_END : {parseAnnotations}?=> '*/' ;
|
|---|
| 419 |
|
|---|
| 420 |
|
|---|
| [aad342c] | 421 | /* Special keywords starting with backslash reserved for extensions
|
|---|
| 422 | * such as ACSL */
|
|---|
| 423 | EXTENDED_IDENTIFIER
|
|---|
| 424 | :
|
|---|
| 425 | '\\' IdentifierNonDigit (IdentifierNonDigit | Digit)*
|
|---|
| 426 | ;
|
|---|
| 427 |
|
|---|
| 428 | /****** Other characters: C11 Sec. 6.4 ******/
|
|---|
| 429 | OTHER : . ;
|
|---|