Context Navigation

PreprocessorLexer.g

main

Last change on this file was 3ce9d20, checked in by Stephen Siegel <siegel@…>, 19 months ago

Fixed a few preprocessing bugs, first dealing with the regular expression
needed to escape backslashes, second to control when annotations are scanned
as comments vs. ACSL.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5924 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 9.9 KB

Line
1	lexer grammar PreprocessorLexer;
2
3	/*
4	* Author: Stephen F. Siegel, University of Delaware
5	* Last changed: June 2012
6	*
7	* This is a grammar for lexical analysis for a preprocessor
8	* file. It follows the C11 Standard. This grammar assumes
9	* that the stream of characters being scanned has already
10	* gone through translation phases 1 and 2. In particular
11	* backslash followed by newline sequences have been removed.
12	*
13	* This lexer grammar does not define the C keywords, nor the keywords
14	* of the CIVL-C, ACSL, GNU and CUDA extensions of C.
15	* Those keywords are defined in
16	* dev.civl.abc.front.c.parse.PP2CivlcTokenCConverter,
17	* where a private function named `initCKeywordMap` identifies such
18	* tokens in preprocessed streams and sets them to their corresponding
19	* token types.
20	*/
21
22	@header
23	{
24	package dev.civl.abc.front.c.preproc;
25	}
26
27	@members
28	{
29
30	/* Are we currently parsing ACSL annotations? If yes, the comments that
31	begin with '@' will be parsed as a sequence of ordinary preprocessor
32	tokens. If no, they will be parsed as ordinary comments, i.e., as a
33	single token consisting of one big string. This option is controlled
34	by the presence of the #pragma CIVL ACSL in the source file. */
35	public boolean parseAnnotations = false;
36
37	/* States in a DFA looking for "#pragma CIVL ACSL" which informs the
38	lexer to start scanning annotations as preprocessor tokens rather
39	than as one big comment. Start state: 1.
40
41	0: waiting for NEWLINE: anything other than NEWLINE self loops.
42	on NEWLINE goto 1.
43	1: waiting for #: whitespace self-loops,
44	on # goto 2.
45	anything else: goto 0.
46	2: waiting for pragma: non-NEWLINE white space self-loops.
47	on NEWLINE: goto 1.
48	on pragma: goto 3.
49	on anything else: goto 0
50	3: waiting for CIVL: non-NEWLINE white space self-loops.
51	on NEWLINE: goto 1.
52	on CIVL: goto 4
53	on anything else: goto 0
54	4: waiting for ACSL: non-NEWLINE white space self-loops.
55	on NEWLINE: goto 1
56	on ACSL: BINGO. set parseAnnotations to true. Goto 0.
57	on anything else: goto 0.
58	*/
59	private int annoteState = 1;
60
61	@Override
62	public void emitErrorMessage(String msg) { // don't try to recover!
63	throw new RuntimeException(msg);
64	}
65
66	@Override
67	public void emit(Token token) {
68	if (parseAnnotations && token.getType() == COMMENT) {
69	String text = token.getText();
70	if ("/*@".equals(text))
71	token.setType(ANNOTATION_START);
72	else if ("//@".equals(text))
73	token.setType(INLINE_ANNOTATION_START);
74	}
75	super.emit(token);
76	//System.out.println("Token: "+token); // DEBUGGING....
77	}
78
79	/* Looks for the sequence #pragma CIVL ACSL. As soon as that is detected
80	sets parseAnnotations to true. This causes annotations to be parsed
81	as preprocessor tokens rather than as text (as a normal comment would be).
82	*/
83	@Override
84	public Token nextToken() {
85	Token token = super.nextToken();
86	if (parseAnnotations)
87	return token;
88	int type = token.getType();
89	switch (annoteState) {
90	case 0:
91	if (type == NEWLINE) annoteState = 1;
92	break;
93	case 1: // at beginning of line. this is the start state.
94	if (type == HASH) annoteState = 2;
95	else if (type != NEWLINE && type != WS) annoteState = 0;
96	break;
97	case 2:
98	if (type == NEWLINE) annoteState = 1;
99	else if (type == PRAGMA) annoteState = 3;
100	else if (type != WS) annoteState = 0;
101	break;
102	case 3:
103	if (type == NEWLINE) annoteState = 1;
104	else if (type == IDENTIFIER &&
105	"CIVL".equals(token.getText().toUpperCase()))
106	annoteState = 4;
107	else if (type != WS) annoteState = 0;
108	break;
109	case 4:
110	if (type == NEWLINE) annoteState = 1;
111	else if (type == IDENTIFIER &&
112	"ACSL".equals(token.getText().toUpperCase())) {
113	parseAnnotations = true;
114	//System.out.println("PARSING ANNOTATIONS NOW.");
115	}
116	else if (type != WS) annoteState = 0;
117	break;
118	default:
119	assert false; // unreachable
120	}
121	return token;
122	}
123
124	}
125
/**** White space ****/

/* End of line: a line feed, optionally preceded by a carriage return. */
NEWLINE : '\r'? '\n' ;

/* Exactly one space or one horizontal tab per token. */
WS      : ' ' | '\t' ;
129
/* Words shared by C and the preprocessor */
IF       : 'if' ;
ELSE     : 'else' ;

/* Words that belong to the preprocessor only (not C keywords) */
DEFINE   : 'define' ;
DEFINED  : 'defined' ;
ELIF     : 'elif' ;
ENDIF    : 'endif' ;
ERROR    : 'error' ;
IFDEF    : 'ifdef' ;
IFNDEF   : 'ifndef' ;
INCLUDE  : 'include' ;
LINE     : 'line' ;
PRAGMA   : 'pragma' ;
UNDEF    : 'undef' ;
146
/**** Punctuators: C11 Sec. 6.4.6 ****/

/* Where a rule lists two spellings, the second is the digraph form. */
ELLIPSIS     : '...' ;
DOTDOT       : '..' ;
DOT          : '.' ;
AMPERSAND    : '&' ;
AND          : '&&' ;
ARROW        : '->' ;
ASSIGN       : '=' ;
BITANDEQ     : '&=' ;
BITOR        : '|' ;
BITOREQ      : '|=' ;
BITXOR       : '^' ;
BITXOREQ     : '^=' ;
COLON        : ':' ;
COMMA        : ',' ;
DIV          : '/' ;
DIVEQ        : '/=' ;
EQUALS       : '==' ;
GT           : '>' ;
GTE          : '>=' ;
HASH         : '#' | '%:' ;
HASHHASH     : '##' | '%:%:' ;
LCURLY       : '{' | '<%' ;
LPAREN       : '(' ;
LSQUARE      : '[' | '<:' ;
LT           : '<' ;
LTE          : '<=' ;
MINUSMINUS   : '--' ;
MOD          : '%' ;
MODEQ        : '%=' ;
NEQ          : '!=' ;
NOT          : '!' ;
OR           : '||' ;
PLUS         : '+' ;
PLUSEQ       : '+=' ;
PLUSPLUS     : '++' ;
QMARK        : '?' ;
RCURLY       : '}' | '%>' ;
RPAREN       : ')' ;
RSQUARE      : ']' | ':>' ;
SEMI         : ';' ;
SHIFTLEFT    : '<<' ;
SHIFTLEFTEQ  : '<<=' ;
SHIFTRIGHT   : '>>' ;
SHIFTRIGHTEQ : '>>=' ;
STAR         : '*' ;
STAREQ       : '*=' ;
SUB          : '-' ;
SUBEQ        : '-=' ;
TILDE        : '~' ;
197
/* CIVL-C and ACSL Punctuators */

AT           : '@' ;
EQUIV_ACSL   : '<==>' ;
IMPLIES      : '=>' ;
IMPLIES_ACSL : '==>' ;
// LSLIST and RSLIST enclose a scope list
LSLIST       : '<|' ;
RSLIST       : '|>' ;
XOR_ACSL     : '^^' ;

/* CUDA Punctuators */
LEXCON       : '<<<' ;
REXCON       : '>>>' ;
213
214
/**** Identifiers: C11 Sec. 6.4.2 ****/

/* A non-digit followed by any number of non-digits and digits. */
IDENTIFIER
    : IdentifierNonDigit (IdentifierNonDigit | Digit)*
    ;

fragment
IdentifierNonDigit : NonDigit | UniversalCharacterName ;

fragment
Zero : '0' ;

fragment
Digit : Zero | NonZeroDigit ;

fragment
NonZeroDigit : '1' .. '9' ;

/* Letters and underscore; '$' is accepted as well. */
fragment
NonDigit : 'A'..'Z' | 'a'..'z' | '_' | '$' ;

/* Backslash-u with four hex digits, or backslash-U with eight. */
fragment
UniversalCharacterName
    : '\\' 'u' HexQuad
    | '\\' 'U' HexQuad HexQuad
    ;

fragment
HexQuad : HexadecimalDigit HexadecimalDigit HexadecimalDigit HexadecimalDigit ;

fragment
HexadecimalDigit : '0'..'9' | 'a'..'f' | 'A'..'F' ;
248
/**** Sec. 6.4.4.1: Integer constants ****/

/* A decimal, octal or hexadecimal constant followed by at most one
 * integer suffix.  The suffix is attached here only; the per-base
 * fragments below match the digits alone, so that text with two
 * suffixes (for example 0uu) is not an INTEGER_CONSTANT.
 */
INTEGER_CONSTANT
    : DecimalConstant IntegerSuffix?
    | OctalConstant IntegerSuffix?
    | HexadecimalConstant IntegerSuffix?
    ;

fragment
DecimalConstant : NonZeroDigit Digit* ;

/* Unsigned and long/long-long markers, in either order. */
fragment
IntegerSuffix
    : UnsignedSuffix LongSuffix?
    | UnsignedSuffix LongLongSuffix
    | LongSuffix UnsignedSuffix?
    | LongLongSuffix UnsignedSuffix?
    ;

fragment
UnsignedSuffix : 'u' | 'U' ;

fragment
LongSuffix : 'l' | 'L' ;

fragment
LongLongSuffix : 'll' | 'LL' ;

/* A leading zero followed by octal digits; a lone 0 is octal. */
fragment
OctalConstant : Zero OctalDigit* ;

fragment
HexadecimalConstant : HexPrefix HexadecimalDigit+ ;

fragment
HexPrefix : Zero ('x' | 'X') ;
285
/**** Sec. 6.4.4.2: Floating Constants ****/

FLOATING_CONSTANT
    : DecimalFloatingConstant
    | HexadecimalFloatingConstant
    ;

/* Either a fractional constant with optional exponent, or a digit
 * sequence with a mandatory exponent; a suffix may follow either form.
 */
fragment
DecimalFloatingConstant
    : FractionalConstant ExponentPart? FloatingSuffix?
    | Digit+ ExponentPart FloatingSuffix?
    ;

fragment
FractionalConstant
    : Digit* DOT Digit+
    | Digit+ DOT
    ;

fragment
ExponentPart : ('e' | 'E') ('+' | '-')? Digit+ ;

fragment
FloatingSuffix : 'f' | 'l' | 'F' | 'L' ;

/* Hexadecimal forms always require the binary exponent part. */
fragment
HexadecimalFloatingConstant
    : HexPrefix HexFractionalConstant BinaryExponentPart FloatingSuffix?
    | HexPrefix HexadecimalDigit+ BinaryExponentPart FloatingSuffix?
    ;

fragment
HexFractionalConstant
    : HexadecimalDigit* DOT HexadecimalDigit+
    | HexadecimalDigit+ DOT
    ;

fragment
BinaryExponentPart : ('p' | 'P') ('+' | '-')? Digit+ ;
328
329
/**** Preprocessing Numbers: C11 Sec 6.4.8 ****/

/* PP_NUMBER should be anything that doesn't match the previous
 * rules but does match this one: an optional dot and a digit, followed
 * by any mix of dots, identifier characters, digits, and exponent
 * letters (e, E, p, P) carrying a sign.
 */
PP_NUMBER
    : '.'? Digit
      ( '.'
      | IdentifierNonDigit
      | Digit
      | ('e' | 'E' | 'p' | 'P') ('+' | '-')
      )*
    ;
342
343
/**** Sec. 6.4.4.4: Character Constants ****/

/* One or more characters between single quotes, with an optional
 * L, U or u prefix.
 */
CHARACTER_CONSTANT
    : ('L' | 'U' | 'u')? '\'' CChar+ '\'' ;

/* Any character other than a single quote, backslash or line feed,
 * or else an escape sequence.
 */
fragment
CChar : ~('\'' | '\\' | '\n') | EscapeSequence ;

/* Simple, octal and hexadecimal escapes; shared with string literals. */
fragment
EscapeSequence
    : '\\' ( '\'' | '"' | '\?' | '\\'
           | 'a' | 'b' | 'f' | 'n' | 'r' | 't' | 'v'
           )
    | OctalEscape
    | HexEscape
    ;

/* A backslash followed by one to three octal digits. */
fragment
OctalEscape : '\\' OctalDigit (OctalDigit OctalDigit?)? ;

fragment
OctalDigit : '0' .. '7' ;

fragment
HexEscape : '\\' 'x' HexadecimalDigit+ ;
367
368
/**** 6.4.5: String Literals ***/

/* Zero or more characters between double quotes, with an optional
 * u8, u, U or L prefix.
 */
STRING_LITERAL
    : ('u8' | 'u' | 'U' | 'L')? '"' SChar* '"'
    ;

/* Any character other than a double quote, backslash or line feed,
 * or else an escape sequence.
 */
fragment
SChar : ~('"' | '\\' | '\n') | EscapeSequence ;
377
378
379
/* *** Comments: C11 Sec 6.4.9 ****/

fragment
INLINE_COMMENT : '//' INLINE_COMMENT_TAIL ;

// What may follow the two slashes.  If the first character is '@', the
// parseAnnotations flag decides: when it is false the rest of the line is
// consumed as an ordinary comment; when it is true the token ends right
// after the '@', and emit() then re-types that token.
fragment
INLINE_COMMENT_TAIL
    : NEWLINE
    | EOF
    | ~('@' | '\n' | '\r') ( options {greedy=true;} : ~('\n'|'\r') )*
    | {!parseAnnotations}?=> '@' ( options {greedy=true;} : ~('\n'|'\r') )*
    | {parseAnnotations}?=> '@'
    ;

// the following rule is never activated but no problem, we capture the token
// in INLINE_COMMENT and then change the token type in emit()...
INLINE_ANNOTATION_START : '//@' ;

// Note on INLINE_COMMENT above: it is not quite perfect because in the case
// of the \n or \r immediately following the // it counts that white space as
// part of the comment, otherwise it doesn't. Would like to make the \n or \r
// NOT part of the comment always, but how --- need to look ahead one character?

fragment
BLOCK_COMMENT : '/*' BLOCK_COMMENT_TAIL ;

// What may follow the block-comment opener.  The '@' case is handled the
// same way as for inline comments: an ordinary comment running to the
// closing delimiter when parseAnnotations is false, or a token that ends
// right after the '@' when it is true.
fragment
BLOCK_COMMENT_TAIL
    : '*/'
    | ~('@') ( options {greedy=false;} : . )* '*/'
    | {!parseAnnotations}?=> '@' ( options {greedy=false;} : . )* '*/'
    | {parseAnnotations}?=> '@'
    ;

COMMENT : INLINE_COMMENT | BLOCK_COMMENT ;

// For some reason, ANNOTATION_START is never invoked. No problem,
// we will catch it on emit as a COMMENT and change its type.
ANNOTATION_START : {parseAnnotations}?=> '/*' '@' ;
ANNOTATION_END   : {parseAnnotations}?=> '*/' ;
419
420
/* Special keywords starting with backslash reserved for extensions
 * such as ACSL: a backslash followed by an identifier-shaped name.
 */
EXTENDED_IDENTIFIER
    : '\\' IdentifierNonDigit (IdentifierNonDigit | Digit)*
    ;

/**** Other characters: C11 Sec. 6.4 ****/

/* Any single character. */
OTHER : . ;

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format