source: CIVL/mods/dev.civl.abc/grammar/c/PreprocessorLexer.g

main
Last change on this file was 3ce9d20, checked in by Stephen Siegel <siegel@…>, 19 months ago

Fixed a few preprocessing bugs, first dealing with the regular expression
needed to escape backslashes, second to control when annotations are scanned
as comments vs. ACSL.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5924 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 9.9 KB
Line 
1lexer grammar PreprocessorLexer;
2
3/*
4 * Author: Stephen F. Siegel, University of Delaware
5 * Last changed: June 2012
6 *
7 * This is a grammar for lexical analysis for a preprocessor
8 * file. It follows the C11 Standard. This grammar assumes
9 * that the stream of characters being scanned has already
10 * gone through translation phases 1 and 2. In particular
11 * backslash followed by newline sequences have been removed.
12 *
13 * This lexer grammar does not define tokens for C keywords, nor for
14 * the keywords of the CIVL-C, ACSL, GNU, and CUDA extensions of C.
15 * Those keywords are defined in
16 * dev.civl.abc.front.c.parse.PP2CivlcTokenCConverter.
17 * A private function named `initCKeywordMap` identifies such words
18 * in the preprocessed token stream and assigns each one its
19 * corresponding token type.
20 */
21
22@header
23{
24package dev.civl.abc.front.c.preproc;
25}
26
@members
{

/* Are we currently parsing ACSL annotations? If yes, the comments that
   begin with '@' will be parsed as a sequence of ordinary preprocessor
   tokens. If no, they will be parsed as ordinary comments, i.e., as a
   single token consisting of one big string. This option is controlled
   by the presence of the #pragma CIVL ACSL in the source file. */
public boolean parseAnnotations = false;

/* State of a DFA looking for "#pragma CIVL ACSL", which informs the
   lexer to start scanning annotations as preprocessor tokens rather
   than as one big comment. Start state: 1.

   0: waiting for NEWLINE: anything other than NEWLINE self-loops.
      on NEWLINE: goto 1.
   1: waiting for #: white space (including NEWLINE) self-loops.
      on #: goto 2.
      on anything else: goto 0.
   2: waiting for pragma: non-NEWLINE white space self-loops.
      on NEWLINE: goto 1.
      on pragma: goto 3.
      on anything else: goto 0.
   3: waiting for CIVL: non-NEWLINE white space self-loops.
      on NEWLINE: goto 1.
      on CIVL: goto 4.
      on anything else: goto 0.
   4: waiting for ACSL: non-NEWLINE white space self-loops.
      on NEWLINE: goto 1.
      on ACSL: BINGO. set parseAnnotations to true. Goto 0.
      on anything else: goto 0.

   CIVL and ACSL are recognized without regard to letter case. */
private int annoteState = 1;

/* Turns every lexical error into an immediate RuntimeException carrying
   the message; no error recovery is attempted. */
@Override
public void emitErrorMessage(String msg) {
    throw new RuntimeException(msg);
}

/* Emits the token, first retyping it if needed. While annotations are
   being parsed, the opening delimiter of an annotation arrives here as a
   COMMENT token whose entire text is the delimiter itself; such a token
   is retyped as ANNOTATION_START or INLINE_ANNOTATION_START. */
@Override
public void emit(Token token) {
    if (parseAnnotations && token.getType() == COMMENT) {
        String text = token.getText();

        if ("/*@".equals(text)) {
            token.setType(ANNOTATION_START);
        } else if ("//@".equals(text)) {
            token.setType(INLINE_ANNOTATION_START);
        }
    }
    super.emit(token);
}

/* Tells whether the token is an IDENTIFIER whose text equals the given
   word, ignoring letter case. The comparison does not depend on the
   default locale (an upper-casing of the text would), and a token
   without text simply does not match. */
private static boolean isIdentifierWord(Token token, String word) {
    return token.getType() == IDENTIFIER
        && word.equalsIgnoreCase(token.getText());
}

/* Returns the next token unchanged, and on the way looks for the
   sequence #pragma CIVL ACSL. As soon as that is detected,
   parseAnnotations is set to true. This causes annotations to be parsed
   as preprocessor tokens rather than as text (as a normal comment
   would be). Once parseAnnotations is true the search is skipped. */
@Override
public Token nextToken() {
    Token token = super.nextToken();

    if (parseAnnotations) {
        return token;
    }

    int type = token.getType();

    switch (annoteState) {
    case 0: // skipping the remainder of the current line
        if (type == NEWLINE) {
            annoteState = 1;
        }
        break;
    case 1: // at beginning of line. this is the start state.
        if (type == HASH) {
            annoteState = 2;
        } else if (type != NEWLINE && type != WS) {
            annoteState = 0;
        }
        break;
    case 2: // seen #, waiting for pragma
        if (type == NEWLINE) {
            annoteState = 1;
        } else if (type == PRAGMA) {
            annoteState = 3;
        } else if (type != WS) {
            annoteState = 0;
        }
        break;
    case 3: // seen # pragma, waiting for CIVL
        if (type == NEWLINE) {
            annoteState = 1;
        } else if (isIdentifierWord(token, "CIVL")) {
            annoteState = 4;
        } else if (type != WS) {
            annoteState = 0;
        }
        break;
    case 4: // seen # pragma CIVL, waiting for ACSL
        if (type == NEWLINE) {
            annoteState = 1;
        } else if (isIdentifierWord(token, "ACSL")) {
            parseAnnotations = true;
            // Leave the automaton in the state its description promises,
            // in case parseAnnotations is ever switched off again from
            // outside.
            annoteState = 0;
        } else if (type != WS) {
            annoteState = 0;
        }
        break;
    default:
        assert false; // unreachable
    }
    return token;
}

}
125
/****** White space ******/
// A line terminator: a line feed, optionally preceded by a carriage return.
NEWLINE
    : ('\r')? '\n'
    ;

// Exactly one blank or one horizontal tab.
WS
    : ' '
    | '\t'
    ;
129
/* Words that are used in both C and the preprocessor */
IF       : 'if'      ;
ELSE     : 'else'    ;

/* Words used in preprocessor but not in C */
// Each word below also fits the shape of IDENTIFIER; these rules are
// listed before IDENTIFIER in this grammar.
DEFINE   : 'define'  ;
DEFINED  : 'defined' ;
ELIF     : 'elif'    ;
ENDIF    : 'endif'   ;
ERROR    : 'error'   ;
IFDEF    : 'ifdef'   ;
IFNDEF   : 'ifndef'  ;
INCLUDE  : 'include' ;
LINE     : 'line'    ;
PRAGMA   : 'pragma'  ;
UNDEF    : 'undef'   ;
146
/****** Punctuators: C11 Sec. 6.4.6 ******/
// Where a rule has two alternatives, the second one is the digraph
// spelling of the same punctuator.
ELLIPSIS      : '...'  ;
DOTDOT        : '..'   ;
DOT           : '.'    ;
AMPERSAND     : '&'    ;
AND           : '&&'   ;
ARROW         : '->'   ;
ASSIGN        : '='    ;
BITANDEQ      : '&='   ;
BITOR         : '|'    ;
BITOREQ       : '|='   ;
BITXOR        : '^'    ;
BITXOREQ      : '^='   ;
COLON         : ':'    ;
COMMA         : ','    ;
DIV           : '/'    ;
DIVEQ         : '/='   ;
EQUALS        : '=='   ;
GT            : '>'    ;
GTE           : '>='   ;
HASH
    : '#'
    | '%:'
    ;
HASHHASH
    : '##'
    | '%:%:'
    ;
LCURLY
    : '{'
    | '<%'
    ;
LPAREN        : '('    ;
LSQUARE
    : '['
    | '<:'
    ;
LT            : '<'    ;
LTE           : '<='   ;
MINUSMINUS    : '--'   ;
MOD           : '%'    ;
MODEQ         : '%='   ;
NEQ           : '!='   ;
NOT           : '!'    ;
OR            : '||'   ;
PLUS          : '+'    ;
PLUSEQ        : '+='   ;
PLUSPLUS      : '++'   ;
QMARK         : '?'    ;
RCURLY
    : '}'
    | '%>'
    ;
RPAREN        : ')'    ;
RSQUARE
    : ']'
    | ':>'
    ;
SEMI          : ';'    ;
SHIFTLEFT     : '<<'   ;
SHIFTLEFTEQ   : '<<='  ;
SHIFTRIGHT    : '>>'   ;
SHIFTRIGHTEQ  : '>>='  ;
STAR          : '*'    ;
STAREQ        : '*='   ;
SUB           : '-'    ;
SUBEQ         : '-='   ;
TILDE         : '~'    ;

/* CIVL-C and ACSL Punctuators */

AT            : '@'    ;
EQUIV_ACSL    : '<==>' ;
IMPLIES       : '=>'   ;
IMPLIES_ACSL  : '==>'  ;
// LSLIST and RSLIST enclose a scope list
LSLIST        : '<|'   ;
RSLIST        : '|>'   ;
XOR_ACSL      : '^^'   ;

/* CUDA Punctuators */
LEXCON        : '<<<'  ;
REXCON        : '>>>'  ;
213
214
/****** Identifiers: C11 Sec. 6.4.2 ******/
// One non-digit identifier character followed by any mix of non-digit
// identifier characters and decimal digits.
IDENTIFIER
    : IdentifierNonDigit ( IdentifierNonDigit | Digit )*
    ;

// A character that may begin an identifier: a plain non-digit or a
// universal character name.
fragment
IdentifierNonDigit
    : NonDigit
    | UniversalCharacterName
    ;

fragment
Zero
    : '0'
    ;

// Any decimal digit.
fragment
Digit
    : Zero
    | NonZeroDigit
    ;

fragment
NonZeroDigit
    : '1'..'9'
    ;

// ASCII letters, underscore, and the dollar sign.
fragment
NonDigit
    : 'a'..'z'
    | 'A'..'Z'
    | '_'
    | '$'
    ;

// Backslash, then u with four hex digits or U with eight.
fragment
UniversalCharacterName
    : '\\' ( 'u' HexQuad
           | 'U' HexQuad HexQuad
           )
    ;

fragment
HexQuad
    : HexadecimalDigit HexadecimalDigit
      HexadecimalDigit HexadecimalDigit
    ;

fragment
HexadecimalDigit
    : Digit
    | 'a'..'f'
    | 'A'..'F'
    ;
248
/****** Sec. 6.4.4.1: Integer constants ******/
// A decimal, octal, or hexadecimal constant followed by at most one
// integer suffix. The suffix is attached here and nowhere else: the
// per-radix fragments below match digits only. (When the octal and
// hexadecimal fragments also ended in an optional suffix, this rule
// admitted two suffixes in a row, as in 0x1uu or 0ulul.)
INTEGER_CONSTANT
    : DecimalConstant IntegerSuffix?
    | OctalConstant IntegerSuffix?
    | HexadecimalConstant IntegerSuffix?
    ;

// A nonzero digit followed by any number of decimal digits.
fragment
DecimalConstant
    : NonZeroDigit Digit*
    ;

// An unsigned marker and a long or long long marker, in either order;
// each of the two is optional as long as one is present.
fragment
IntegerSuffix
    : UnsignedSuffix LongSuffix?
    | UnsignedSuffix LongLongSuffix
    | LongSuffix UnsignedSuffix?
    | LongLongSuffix UnsignedSuffix?
    ;

fragment
UnsignedSuffix
    : 'u' | 'U'
    ;

fragment
LongSuffix
    : 'l' | 'L'
    ;

// Both letters must have the same case.
fragment
LongLongSuffix
    : 'll' | 'LL'
    ;

// A zero followed by any number of octal digits; no suffix.
fragment
OctalConstant
    : Zero OctalDigit*
    ;

// A hexadecimal prefix followed by at least one hex digit; no suffix.
fragment
HexadecimalConstant
    : HexPrefix HexadecimalDigit+
    ;

fragment
HexPrefix
    : Zero ('x' | 'X')
    ;
285
/****** Sec. 6.4.4.2: Floating Constants ******/

FLOATING_CONSTANT
    : DecimalFloatingConstant
    | HexadecimalFloatingConstant
    ;

// Either a fractional constant with an optional exponent, or a digit
// sequence with a mandatory exponent; a suffix may follow in both cases.
fragment
DecimalFloatingConstant
    : FractionalConstant ExponentPart? FloatingSuffix?
    | Digit+ ExponentPart FloatingSuffix?
    ;

// Digits around a dot, with digits required on at least one side.
fragment
FractionalConstant
    : Digit* DOT Digit+
    | Digit+ DOT
    ;

// e or E, an optional sign, and at least one decimal digit.
fragment
ExponentPart
    : ('e' | 'E') ('+' | '-')? Digit+
    ;

fragment
FloatingSuffix
    : 'f' | 'F' | 'l' | 'L'
    ;

// A hex prefix, then a hex fractional constant or a plain run of hex
// digits, then a mandatory binary exponent and an optional suffix.
fragment
HexadecimalFloatingConstant
    : HexPrefix
      ( HexFractionalConstant
      | HexadecimalDigit+
      )
      BinaryExponentPart FloatingSuffix?
    ;

// Hex digits around a dot, with digits required on at least one side.
fragment
HexFractionalConstant
    : HexadecimalDigit* DOT HexadecimalDigit+
    | HexadecimalDigit+ DOT
    ;

// p or P, an optional sign, and at least one decimal digit.
fragment
BinaryExponentPart
    : ('p' | 'P') ('+' | '-')? Digit+
    ;
328
329
/****** Preprocessing Numbers: C11 Sec 6.4.8 ******/

/* PP_NUMBER should be anything that doesn't match the previous
 * rules but does match this one.
 */
// Shape: an optional leading dot, one digit, then any run of dots,
// identifier characters, digits, and exponent letters (e, E, p, P)
// immediately followed by a sign.
PP_NUMBER
    : ('.')? Digit
      ( '.'
      | IdentifierNonDigit
      | Digit
      | ( 'e' | 'E' | 'p' | 'P' ) ( '+' | '-' )
      )*
    ;
342
343
/****** Sec. 6.4.4.4: Character Constants ******/

// An optional L, U, or u prefix and one or more characters between
// single quotes.
CHARACTER_CONSTANT
    : ( 'L' | 'U' | 'u' )?
      '\'' CChar+ '\''
    ;

// Anything except a single quote, a backslash, or a line feed, or else
// an escape sequence.
fragment
CChar
    : ~( '\'' | '\\' | '\n' )
    | EscapeSequence
    ;

// A simple one-character escape, an octal escape, or a hex escape.
fragment
EscapeSequence
    : '\\'
      ( '\'' | '"' | '\?' | '\\'
      | 'a' | 'b' | 'f' | 'n' | 'r' | 't' | 'v'
      )
    | OctalEscape
    | HexEscape
    ;

// A backslash followed by one to three octal digits.
fragment
OctalEscape
    : '\\' OctalDigit ( OctalDigit OctalDigit? )?
    ;

fragment
OctalDigit
    : '0'..'7'
    ;

// Backslash, x, and at least one hex digit.
fragment
HexEscape
    : '\\' 'x' HexadecimalDigit+
    ;
367
368
/****** 6.4.5: String Literals *****/

// An optional encoding prefix (u8, u, U, or L) and zero or more
// characters between double quotes.
STRING_LITERAL
    : ( 'u8' | 'u' | 'U' | 'L' )?
      '"' SChar* '"'
    ;

// Anything except a double quote, a backslash, or a line feed, or else
// an escape sequence.
fragment
SChar
    : ~( '"' | '\\' | '\n' )
    | EscapeSequence
    ;
377
378
379
/* ***** Comments: C11 Sec 6.4.9 ******/

// Two slashes followed by the tail described below.
fragment
INLINE_COMMENT
    : '//' INLINE_COMMENT_TAIL
    ;

// What may follow the two slashes:
//   - a line terminator or end of input right away (empty comment);
//   - a first character other than '@', then the rest of the line;
//   - '@' plus the rest of the line while annotations are NOT being
//     parsed (an ordinary comment);
//   - '@' alone while annotations ARE being parsed, so that the comment
//     token is just the annotation opener, which emit() then retypes.
//
// This is not quite perfect: when the line terminator immediately
// follows the two slashes it becomes part of the comment, whereas in the
// other cases it does not. It would be better if the terminator were
// never part of the comment, but that seems to need one more character
// of lookahead.
fragment
INLINE_COMMENT_TAIL
    : NEWLINE
    | EOF
    | ~('@' | '\n' | '\r') ( options {greedy=true;} : ~('\n'|'\r') )*
    | {!parseAnnotations}?=> '@' ( options {greedy=true;} : ~('\n'|'\r') )*
    | {parseAnnotations}?=> '@'
    ;

// The following rule is never activated, but that is no problem: the
// token is captured by INLINE_COMMENT and its type is changed in emit().
INLINE_ANNOTATION_START
    : '//@'
    ;

// Slash-star followed by the tail described below.
fragment
BLOCK_COMMENT
    : '/*' BLOCK_COMMENT_TAIL
    ;

// What may follow the opening slash-star:
//   - the closing delimiter right away (empty comment);
//   - a first character other than '@', then everything up to the
//     nearest closing delimiter;
//   - '@' plus everything up to the nearest closing delimiter while
//     annotations are NOT being parsed (an ordinary comment);
//   - '@' alone while annotations ARE being parsed, so that the comment
//     token is just the annotation opener, which emit() then retypes.
fragment
BLOCK_COMMENT_TAIL
    : '*/'
    | ~('@') ( options {greedy=false;} : . )* '*/'
    | {!parseAnnotations}?=> '@' ( options {greedy=false;} : . )* '*/'
    | {parseAnnotations}?=> '@'
    ;

COMMENT
    : INLINE_COMMENT
    | BLOCK_COMMENT
    ;

// For some reason, ANNOTATION_START is never invoked. No problem,
// we will catch it on emit as a COMMENT and change its type.
ANNOTATION_START
    : {parseAnnotations}?=> '/*' '@'
    ;

// The closing delimiter of a block annotation; only available while
// annotations are being parsed.
ANNOTATION_END
    : {parseAnnotations}?=> '*/'
    ;
419
420
/* Special keywords starting with backslash reserved for extensions
 * such as ACSL */
// A backslash followed by an identifier-shaped word.
EXTENDED_IDENTIFIER
    : '\\' IdentifierNonDigit ( IdentifierNonDigit | Digit )*
    ;

/****** Other characters: C11 Sec. 6.4 ******/
// Catch-all: any single character.
OTHER
    : .
    ;
Note: See TracBrowser for help on using the repository browser.