Context Navigation

PreprocessorLexer.g

main

Last change on this file was 3ce9d20, checked in by Stephen Siegel <siegel@…>, 19 months ago

Fixed a few preprocessing bugs, first dealing with the regular expression
needed to escape backslashes, second to control when annotations are scanned
as comments vs. ACSL.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5924 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 9.9 KB

Rev	Line
[aad342c]	1	lexer grammar PreprocessorLexer;
	2
	3	/*
	4	* Author: Stephen F. Siegel, University of Delaware
	5	* Last changed: June 2012
	6	*
	7	* This is a grammar for lexical analysis for a preprocessor
	8	* file. It follows the C11 Standard. This grammar assumes
	9	* that the stream of characters being scanned has already
	10	* gone through translation phases 1 and 2. In particular
	11	* backslash followed by newline sequences have been removed.
	12	*
	13	* This lexer grammar does not define the C keywords, nor those of
	14	* the CIVL-C, ACSL, GNU and CUDA extensions of C.
	15	* Those keywords are defined in
	16	* dev.civl.abc.front.c.parse.PP2CivlcTokenCConverter.
	17	* A private function named `initCKeywordMap` identifies such
[3ce9d20]	18	* tokens in the preprocessed stream and sets their corresponding
[aad342c]	19	* token types.
	20	*/
	21
	22	@header
	23	{
	24	package dev.civl.abc.front.c.preproc;
	25	}
	26
	27	@members
	28	{
[3ce9d20]	29
	30	/* Are we currently parsing ACSL annotations? If yes, the comments that
	31	begin with '@' will be parsed as a sequence of ordinary preprocessor
	32	tokens. If no, they will be parsed as ordinary comments, i.e., as a
	33	single token consisting of one big string. This option is controlled
	34	by the presence of the #pragma CIVL ACSL in the source file. */
	35	public boolean parseAnnotations = false;
	36
	37	/* States in a DFA looking for "#pragma CIVL ACSL" which informs the
	38	lexer to start scanning annotations as preprocessor tokens rather
	39	than as one big comment. Start state: 1.
	40
	41	0: waiting for NEWLINE: anything other than NEWLINE self loops.
	42	on NEWLINE goto 1.
	43	1: waiting for #: whitespace self-loops,
	44	on # goto 2.
	45	anything else: goto 0.
	46	2: waiting for pragma: non-NEWLINE white space self-loops.
	47	on NEWLINE: goto 1.
	48	on pragma: goto 3.
	49	on anything else: goto 0
	50	3: waiting for CIVL: non-NEWLINE white space self-loops.
	51	on NEWLINE: goto 1.
	52	on CIVL: goto 4
	53	on anything else: goto 0
	54	4: waiting for ACSL: non-NEWLINE white space self-loops.
	55	on NEWLINE: goto 1
	56	on ACSL: BINGO. set parseAnnotations to true. Goto 0.
	57	on anything else: goto 0.
	58	*/
	59	private int annoteState = 1;
	60
[aad342c]	61	@Override
	62	public void emitErrorMessage(String msg) { // don't try to recover: fail on the first reported error
	63	throw new RuntimeException(msg); // the reported message becomes the exception message
	64	}
[3ce9d20]	65
	66	@Override
	67	public void emit(Token token) { // retags annotation openers, then delegates to super.emit
	68	if (parseAnnotations && token.getType() == COMMENT) { // only while annotation scanning is enabled
	69	String text = token.getText();
	70	if ("/*@".equals(text)) // COMMENT whose entire text is the block-annotation opener
	71	token.setType(ANNOTATION_START);
	72	else if ("//@".equals(text)) // COMMENT whose entire text is the line-annotation opener
	73	token.setType(INLINE_ANNOTATION_START);
	74	}
	75	super.emit(token); // every token is forwarded, retagged or not
	76	//System.out.println("Token: "+token); // DEBUGGING....
	77	}
	78
	79	/* Looks for the sequence #pragma CIVL ACSL. As soon as that is detected
	80	sets parseAnnotations to true. This causes annotations to be parsed
	81	as preprocessor tokens rather than as text (as a normal comment would be).
	82	*/
	83	@Override
	84	public Token nextToken() { // returns super.nextToken() unchanged; only tracks the pragma state
	85	Token token = super.nextToken();
	86	if (parseAnnotations) // pragma already seen: no further tracking is done
	87	return token;
	88	int type = token.getType();
	89	switch (annoteState) {
	90	case 0: // skipping the rest of a line that cannot be the pragma
	91	if (type == NEWLINE) annoteState = 1;
	92	break;
	93	case 1: // at beginning of line. this is the start state.
	94	if (type == HASH) annoteState = 2;
	95	else if (type != NEWLINE && type != WS) annoteState = 0;
	96	break;
	97	case 2: // seen '#', waiting for 'pragma'
	98	if (type == NEWLINE) annoteState = 1;
	99	else if (type == PRAGMA) annoteState = 3;
	100	else if (type != WS) annoteState = 0;
	101	break;
	102	case 3: // seen '#pragma', waiting for CIVL
	103	if (type == NEWLINE) annoteState = 1;
	104	else if (type == IDENTIFIER &&
	105	"CIVL".equalsIgnoreCase(token.getText())) // case-insensitive, independent of the default locale
	106	annoteState = 4;
	107	else if (type != WS) annoteState = 0;
	108	break;
	109	case 4: // seen '#pragma CIVL', waiting for ACSL
	110	if (type == NEWLINE) annoteState = 1;
	111	else if (type == IDENTIFIER &&
	112	"ACSL".equalsIgnoreCase(token.getText())) { // case-insensitive, independent of the default locale
	113	parseAnnotations = true;
	114	annoteState = 0; // documented transition: after ACSL go to state 0
	115	}
	116	else if (type != WS) annoteState = 0;
	117	break;
	118	default:
	119	assert false; // unreachable
	120	}
	121	return token;
	122	}
	123
[aad342c]	124	}
	125
	126	/**** White space ****/
	127	NEWLINE : '\r'? '\n' ;
	128	WS : ' ' \| '\t' ;
	129
	130	/* Words that are used in both C and the preprocessor */
	131	IF : 'if' ;
	132	ELSE : 'else' ;
	133
	134	/* Words used in preprocessor but not in C */
	135	DEFINE : 'define' ;
	136	DEFINED : 'defined' ;
	137	ELIF : 'elif' ;
	138	ENDIF : 'endif' ;
	139	ERROR : 'error' ;
	140	IFDEF : 'ifdef' ;
	141	IFNDEF : 'ifndef' ;
	142	INCLUDE : 'include' ;
	143	LINE : 'line' ;
	144	PRAGMA : 'pragma' ;
	145	UNDEF : 'undef' ;
	146
	147	/**** Punctuators: C11 Sec. 6.4.6 ****/
	148	ELLIPSIS : '...' ;
	149	DOTDOT : '..' ;
	150	DOT : '.' ;
	151	AMPERSAND : '&' ;
	152	AND : '&&' ;
	153	ARROW : '->' ;
	154	ASSIGN : '=' ;
	155	BITANDEQ : '&=' ;
	156	BITOR : '\|' ;
	157	BITOREQ : '\|=' ;
	158	BITXOR : '^' ;
	159	BITXOREQ : '^=' ;
	160	COLON : ':' ;
	161	COMMA : ',' ;
	162	DIV : '/' ;
	163	DIVEQ : '/=' ;
	164	EQUALS : '==' ;
	165	GT : '>' ;
	166	GTE : '>=' ;
	167	HASH : '#' \| '%:' ;
	168	HASHHASH : '##' \| '%:%:' ;
	169	LCURLY : '{' \| '<%' ;
	170	LPAREN : '(' ;
	171	LSQUARE : '[' \| '<:' ;
	172	LT : '<' ;
	173	LTE : '<=' ;
	174	MINUSMINUS : '--' ;
	175	MOD : '%' ;
	176	MODEQ : '%=' ;
	177	NEQ : '!=' ;
	178	NOT : '!' ;
	179	OR : '\|\|' ;
	180	PLUS : '+' ;
	181	PLUSEQ : '+=' ;
	182	PLUSPLUS : '++' ;
	183	QMARK : '?' ;
	184	RCURLY : '}' \| '%>' ;
	185	RPAREN : ')' ;
	186	RSQUARE : ']' \| ':>' ;
	187	SEMI : ';' ;
	188	SHIFTLEFT : '<<' ;
	189	SHIFTLEFTEQ : '<<=' ;
	190	SHIFTRIGHT : '>>' ;
	191	SHIFTRIGHTEQ : '>>=' ;
	192	STAR : '*' ;
	193	STAREQ : '*=' ;
	194	SUB : '-' ;
	195	SUBEQ : '-=' ;
	196	TILDE : '~' ;
	197
	198	/* CIVL-C and ACSL Punctuators */
[3ce9d20]	199
	200
[aad342c]	201	AT : '@' ;
	202	EQUIV_ACSL : '<==>' ;
	203	IMPLIES : '=>' ;
	204	IMPLIES_ACSL : '==>' ;
	205	// LSLIST and RSLIST enclose a scope list
	206	LSLIST : '<\|' ;
	207	RSLIST : '\|>' ;
	208	XOR_ACSL : '^^' ;
	209
	210	/* CUDA Punctuators */
	211	LEXCON : '<<<' ;
	212	REXCON : '>>>' ;
	213
[3ce9d20]	214
[aad342c]	215	/**** Identifiers: C11 Sec. 6.4.2 ****/
	216	IDENTIFIER : IdentifierNonDigit
	217	(IdentifierNonDigit \| Digit)*
	218	;
	219
	220	fragment
	221	IdentifierNonDigit
	222	: NonDigit \| UniversalCharacterName ;
	223
	224	fragment
	225	Zero : '0' ;
	226
	227	fragment
	228	Digit : Zero \| NonZeroDigit ;
	229
	230	fragment
	231	NonZeroDigit : '1' .. '9' ;
	232
	233	fragment
	234	NonDigit : 'A'..'Z' \| 'a'..'z' \| '_' \| '$';
	235
	236	fragment
	237	UniversalCharacterName
	238	: '\\' 'u' HexQuad
	239	\| '\\' 'U' HexQuad HexQuad
	240	;
	241
	242	fragment
	243	HexQuad : HexadecimalDigit HexadecimalDigit HexadecimalDigit HexadecimalDigit ;
	244
	245	fragment
	246	HexadecimalDigit
	247	: '0'..'9' \| 'a'..'f' \| 'A'..'F' ;
	248
	249	/**** Sec. 6.4.4.1: Integer constants ****/
	250	INTEGER_CONSTANT
	251	: DecimalConstant IntegerSuffix?
	252	\| OctalConstant IntegerSuffix?
	253	\| HexadecimalConstant IntegerSuffix?
	254	;
	255
	256	fragment
	257	DecimalConstant : NonZeroDigit Digit* ;
	258
	259
	260	fragment
	261	IntegerSuffix : UnsignedSuffix LongSuffix?
	262	\| UnsignedSuffix LongLongSuffix
	263	\| LongSuffix UnsignedSuffix?
	264	\| LongLongSuffix UnsignedSuffix?
	265	;
	266
	267	fragment
	268	UnsignedSuffix : 'u' \| 'U' ;
	269
	270	fragment
	271	LongSuffix : 'l' \| 'L' ;
	272
	273	fragment
	274	LongLongSuffix : 'll' \| 'LL' ;
	275
	276	fragment
	277	OctalConstant : Zero OctalDigit* ; // no suffix here: INTEGER_CONSTANT appends IntegerSuffix? (C11 6.4.4.1)
	278
	279	fragment
	280	HexadecimalConstant
	281	: HexPrefix HexadecimalDigit+ ; // no suffix here: INTEGER_CONSTANT appends IntegerSuffix? (C11 6.4.4.1)
	282
	283	fragment
	284	HexPrefix : Zero ('x' \| 'X') ;
	285
	286	/**** Sec. 6.4.4.2: Floating Constants ****/
	287
	288	FLOATING_CONSTANT
	289	: DecimalFloatingConstant
	290	\| HexadecimalFloatingConstant
	291	;
	292
	293	fragment
	294	DecimalFloatingConstant
	295	: FractionalConstant ExponentPart? FloatingSuffix?
	296	\| Digit+ ExponentPart FloatingSuffix?
	297	;
	298
	299	fragment
	300	FractionalConstant
	301	: Digit* DOT Digit+
	302	\| Digit+ DOT
	303	;
	304
	305	fragment
	306	ExponentPart : ('e' \| 'E') ('+' \| '-')? Digit+ ;
	307
	308	fragment
	309	FloatingSuffix : 'f' \| 'l' \| 'F' \| 'L' ;
	310
	311	fragment
	312	HexadecimalFloatingConstant
	313	: HexPrefix HexFractionalConstant BinaryExponentPart
	314	FloatingSuffix?
	315	\| HexPrefix HexadecimalDigit+ BinaryExponentPart
	316	FloatingSuffix?
	317	;
	318
	319	fragment
	320	HexFractionalConstant
	321	: HexadecimalDigit* DOT HexadecimalDigit+
	322	\| HexadecimalDigit+ DOT
	323	;
	324
	325	fragment
	326	BinaryExponentPart
	327	: ('p' \| 'P') ('+' \| '-')? Digit+ ;
	328
	329
	330	/**** Preprocessing Numbers: C11 Sec 6.4.8 ****/
	331
	332	/* PP_NUMBER should be anything that doesn't match the previous
	333	* rules but does match this one.
	334	*/
	335	PP_NUMBER : '.'? Digit
	336	( '.'
	337	\| IdentifierNonDigit
	338	\| Digit
	339	\| ('e' \| 'E' \| 'p' \| 'P') ('+' \| '-')
	340	)*
	341	;
	342
	343
	344	/**** Sec. 6.4.4.4: Character Constants ****/
	345
	346	CHARACTER_CONSTANT
	347	: ('L' \| 'U' \| 'u')? '\'' CChar+ '\'' ;
	348
	349	fragment
	350	CChar : ~('\'' \| '\\' \| '\n') \| EscapeSequence ;
	351
	352	fragment
	353	EscapeSequence : '\\' ( '\'' \| '"' \| '\?' \| '\\' \|
	354	'a' \| 'b' \| 'f' \| 'n' \|'r' \| 't' \| 'v'
	355	)
	356	\| OctalEscape
	357	\| HexEscape
	358	;
	359	fragment
	360	OctalEscape : '\\' OctalDigit (OctalDigit OctalDigit?)? ;
	361
	362	fragment
	363	OctalDigit : '0' .. '7';
	364
	365	fragment
	366	HexEscape : '\\' 'x' HexadecimalDigit+ ;
	367
	368
	369	/**** 6.4.5: String Literals ***/
	370
	371
	372	STRING_LITERAL : ('u8' \| 'u' \| 'U' \| 'L')? '"' SChar* '"'
	373	;
	374
	375	fragment
	376	SChar : ~('"' \| '\\' \| '\n') \| EscapeSequence ;
	377
	378
	379
	380	/* *** Comments: C11 Sec 6.4.9 ****/
	381
[3ce9d20]	382	fragment
	383	INLINE_COMMENT : '//' INLINE_COMMENT_TAIL ;
	384
	385	fragment
	386	INLINE_COMMENT_TAIL
	387	: NEWLINE
	388	\| EOF
	389	\| ~('@' \| '\n' \| '\r') ( options {greedy=true;} : ~('\n'\|'\r') )*
	390	\| {!parseAnnotations}?=> '@' ( options {greedy=true;} : ~('\n'\|'\r') )*
	391	\| {parseAnnotations}?=> '@'
	392	;
	393
	394	// the following rule is never activated but no problem, we capture the token
	395	// in INLINE_COMMENT and then change the token type in emit()...
	396	INLINE_ANNOTATION_START : '//@' ;
	397
[aad342c]	398	// the following is not quite perfect because in the case of the \n or \r
	399	// immediately following the // it counts that white space as part of the
	400	// comment, otherwise it doesn't. Would like to make the \n or \r NOT
	401	// part of the comment always, but how --- need to look ahead one character?
	402
	403	fragment
[3ce9d20]	404	BLOCK_COMMENT : '/*' BLOCK_COMMENT_TAIL ;
	405
	406	fragment BLOCK_COMMENT_TAIL
	407	: '*/'
	408	\| ~('@') ( options {greedy=false;} : . )* '*/'
	409	\| {!parseAnnotations}?=> '@' ( options {greedy=false;} : . )* '*/'
	410	\| {parseAnnotations}?=> '@'
	411	;
[aad342c]	412
	413	COMMENT : INLINE_COMMENT \| BLOCK_COMMENT ;
	414
[3ce9d20]	415	// For some reason, ANNOTATION_START is never invoked. No problem,
	416	// we will catch it on emit as a COMMENT and change its type.
	417	ANNOTATION_START : {parseAnnotations}?=> '/*' '@' ;
	418	ANNOTATION_END : {parseAnnotations}?=> '*/' ;
	419
	420
[aad342c]	421	/* Special keywords starting with backslash reserved for extensions
	422	* such as ACSL */
	423	EXTENDED_IDENTIFIER
	424	:
	425	'\\' IdentifierNonDigit (IdentifierNonDigit \| Digit)*
	426	;
	427
	428	/**** Other characters: C11 Sec. 6.4 ****/
	429	OTHER : . ;

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format