// CommonCharacterFactory.java

package edu.udel.cis.vsl.abc.token.common;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import edu.udel.cis.vsl.abc.token.IF.CivlcToken;
import edu.udel.cis.vsl.abc.token.IF.CivlcToken.TokenVocabulary;
import edu.udel.cis.vsl.abc.token.IF.CharacterToken;
import edu.udel.cis.vsl.abc.token.IF.ExecutionCharacter;
import edu.udel.cis.vsl.abc.token.IF.SyntaxException;
import edu.udel.cis.vsl.abc.token.IF.TokenFactory;
import edu.udel.cis.vsl.abc.token.IF.UnsourcedException;
import edu.udel.cis.vsl.abc.token.IF.ExecutionCharacter.CharacterKind;
import edu.udel.cis.vsl.abc.token.IF.StringLiteral.StringKind;

/**
 * A helper class for parsing character constants and string literals and
 * for creating the corresponding tokens.
 * 
 * @author siegel
 * 
 */
public class CommonCharacterFactory {

	// Types...

	/**
	 * A type which extracts the "core" part of the text of a string literal
	 * token (the text minus the optional prefix and double quotes) and which
	 * records the kind of string (which is determined by the prefix).
	 * 
	 * Declared static because it never uses the enclosing factory instance.
	 * 
	 * @author siegel
	 */
	private static final class StringAnalysis {
		/** The string literal token that was analyzed. */
		final CivlcToken token;

		/** The kind of string, as determined by the prefix of the text. */
		final StringKind kind;

		/** The token text minus the prefix and enclosing double quotes. */
		final String core;

		StringAnalysis(CivlcToken token, StringKind kind, String core) {
			this.token = token;
			this.kind = kind;
			this.core = core;
		}
	}

	/**
	 * A type used to help parse the characters from a string, one at a time.
	 * It is a cursor: {@link #index} is the position of the next unread
	 * character and is advanced by {@code parseNextCharacter}.
	 */
	private static final class CharacterParseData {
		/** The text being parsed. */
		final String string;

		/** The length of {@link #string}. */
		final int totalLength;

		/** The character kind given to every character that is parsed. */
		final CharacterKind kind;

		/** The index in {@link #string} of the next unread character. */
		int index;

		CharacterParseData(String string, CharacterKind kind) {
			this.string = string;
			this.kind = kind;
			this.index = 0;
			this.totalLength = string.length();
		}
	}

	// Fields...

	/**
	 * The tokenFactory which this class is helping.
	 */
	private final TokenFactory tokenFactory;

	/**
	 * A map used to implement the Flyweight Pattern on ExecutionCharacters:
	 * each character maps to its canonic instance.
	 */
	private final Map<ExecutionCharacter, ExecutionCharacter> characterMap = new HashMap<ExecutionCharacter, ExecutionCharacter>();

	// Constructors...

	/**
	 * Constructs new CommonCharacterFactory associated to given tokenFactory.
	 * 
	 * @param tokenFactory
	 *                         any TokenFactory
	 */
	public CommonCharacterFactory(TokenFactory tokenFactory) {
		this.tokenFactory = tokenFactory;
	}

	// Exported methods...

	/**
	 * Returns canonic instance of execution character with given parameters.
	 * 
	 * @param kind
	 *                       the character kind
	 * @param codePoint
	 *                       the Unicode code point for this character
	 * @param characters
	 *                       the (1 or 2) Java character(s) representing this
	 *                       code point
	 * @return the execution character specified by the given 3 parameters
	 */
	public ExecutionCharacter executionCharacter(CharacterKind kind,
			int codePoint, char[] characters) {
		ExecutionCharacter result = new CommonExecutionCharacter(kind,
				codePoint, characters);
		ExecutionCharacter old = characterMap.get(result);

		if (old == null) {
			// first time this character is seen: it becomes the canonic one
			characterMap.put(result, result);
			return result;
		}
		return old;
	}

	/**
	 * Returns the canonic execution character of the given kind with code
	 * point 0.
	 * 
	 * @param kind
	 *                 the character kind
	 * @return the null character of the given kind
	 */
	public ExecutionCharacter nullChar(CharacterKind kind) {
		return executionCharacter(kind, 0, new char[]{'\0'});
	}

	/**
	 * Given a CivlcToken of type CHARACTER_CONSTANT, this returns an instance
	 * of CommonCharacterToken formed by analyzing the given token. The text
	 * from the given token is analyzed to extract its (optional) prefix, used
	 * to determine the "character kind", escape sequences are analyzed, and so
	 * on.
	 * 
	 * If the name of the token's source file, converted to upper case,
	 * contains ".F", the text between the single quotes is not parsed: it is
	 * wrapped as is in one execution character with code point -1.
	 * 
	 * @param token
	 *                  a token of type PreprocessorParser.CHARACTER_CONSTANT;
	 * @return a CharacterToken based on data in the given token
	 * @throws SyntaxException
	 *                             if the text of the character constant is not
	 *                             well formed
	 */
	public CharacterToken characterToken(CivlcToken token)
			throws SyntaxException {
		String filename = token.getSourceFile().getName();
		String text = token.getText();
		CharacterKind kind;
		String stripped;
		int length;
		ExecutionCharacter executionCharacter;
		CharacterParseData data;

		// guard: there is no first character to inspect in an empty text
		if (text.isEmpty())
			throw tokenFactory.newSyntaxException(
					"Malformed character constant: " + text, token);
		switch (text.charAt(0)) {
			case 'L' :
				kind = CharacterKind.WCHAR;
				stripped = text.substring(1);
				break;
			case 'u' :
				kind = CharacterKind.CHAR16;
				stripped = text.substring(1);
				break;
			case 'U' :
				kind = CharacterKind.CHAR32;
				stripped = text.substring(1);
				break;
			default :
				// no prefix
				kind = CharacterKind.CHAR;
				stripped = text;
		}
		length = stripped.length();
		// need an open quote, at least one character, and a close quote
		if (length < 3 || stripped.charAt(0) != '\''
				|| stripped.charAt(length - 1) != '\'')
			throw tokenFactory.newSyntaxException(
					"Malformed character constant: " + stripped, token);
		stripped = stripped.substring(1, length - 1);
		if (filename.toUpperCase().contains(".F")) {
			// Fortran uses ' for wrapping a sequence of chars as a string.
			// NOTE(review): this matches ".F" anywhere in the file name,
			// while stringAnalyze tests the token vocabulary instead --
			// confirm the file-name test is intended.
			return new CommonCharacterToken(token,
					executionCharacter(kind, -1, stripped.toCharArray()));
		}
		data = new CharacterParseData(stripped, kind);
		try {
			executionCharacter = parseNextCharacter(data);
		} catch (UnsourcedException e) {
			// keep the specific cause, as the stringLiteral methods do
			throw tokenFactory.newSyntaxException(e, token);
		}
		// a character constant must consist of exactly one character
		if (data.index != data.totalLength)
			throw tokenFactory
					.newSyntaxException("Malformed character constant", token);
		return new CommonCharacterToken(token, executionCharacter);
	}

	/**
	 * Given a CivlcToken of type STRING_LITERAL, this method analyzes the text
	 * of the string literal and uses the result to create a StringLiteral
	 * object. The StringLiteral object stores the "string kind" of the string
	 * literal (determined by the optional prefix in the text) and the sequence
	 * of execution characters specified by the core part of the text.
	 * 
	 * The null character is appended to the execution character sequence.
	 * 
	 * @param token
	 *                  a token of type STRING_LITERAL
	 * @return a StringLiteral object with kind and execution characters
	 *         determined by the text of the given token
	 * @throws SyntaxException
	 *                             if the text of given token does not have the
	 *                             proper syntax for a string literal
	 */
	public CommonStringLiteral stringLiteral(CivlcToken token)
			throws SyntaxException {
		StringAnalysis analysis = stringAnalyze(token);
		StringKind stringKind = analysis.kind;
		CharacterKind characterKind = characterKind(stringKind);
		ArrayList<ExecutionCharacter> characters = new ArrayList<ExecutionCharacter>();

		try {
			extractCharacters(characterKind, analysis.core, characters);
		} catch (UnsourcedException e) {
			throw tokenFactory.newSyntaxException(e, token);
		}
		characters.add(nullChar(characterKind));
		return new CommonStringLiteral(stringKind, characters);
	}

	/**
	 * Given a sequence of adjacent tokens, all of type STRING_LITERAL, this
	 * forms the StringLiteral object obtained by (1) analyzing each token to
	 * extract a sequence of execution characters, and (2) then concatenating
	 * the sequences to form one big sequence of execution characters. The
	 * string kind of the result is determined by the kinds of the constituents
	 * as specified in the C11 Standard.
	 * 
	 * The null character is appended to the final execution character sequence.
	 * 
	 * @param tokens
	 *                   a non-empty list of tokens, each of type
	 *                   STRING_LITERAL
	 * @return a StringLiteral object representing the concatenation of the
	 *         StringLiteral objects resulting from each token
	 * @throws SyntaxException
	 *                                      if any of the token texts are not
	 *                                      proper string literals, or the
	 *                                      string kinds do not match up
	 * @throws IllegalArgumentException
	 *                                      if the list of tokens is empty
	 */
	public CommonStringLiteral stringLiteral(List<CivlcToken> tokens)
			throws SyntaxException {
		List<StringAnalysis> analyses = new ArrayList<StringAnalysis>();
		ArrayList<ExecutionCharacter> characters = new ArrayList<ExecutionCharacter>();
		StringKind stringKind;
		CharacterKind characterKind;

		// an empty list has no string kind: fail clearly instead of with a
		// NullPointerException when the kind is later switched on
		if (tokens.isEmpty())
			throw new IllegalArgumentException(
					"At least one string literal token is required");
		for (CivlcToken token : tokens)
			analyses.add(stringAnalyze(token));
		stringKind = concatKind(analyses);
		characterKind = characterKind(stringKind);
		for (StringAnalysis analysis : analyses) {
			try {
				extractCharacters(characterKind, analysis.core, characters);
			} catch (UnsourcedException e) {
				throw tokenFactory.newSyntaxException(e, analysis.token);
			}
		}
		characters.add(nullChar(characterKind));
		return new CommonStringLiteral(stringKind, characters);
	}

	// Private methods...

	/**
	 * Parses all characters of the given text, appending the resulting
	 * execution characters to the given list.
	 * 
	 * @param kind
	 *                 the character kind given to every parsed character
	 * @param text
	 *                 the core text of a string literal
	 * @param list
	 *                 the list to which the parsed characters are appended
	 * @throws UnsourcedException
	 *                                if some character or escape sequence in
	 *                                the text is malformed
	 */
	private void extractCharacters(CharacterKind kind, String text,
			List<ExecutionCharacter> list) throws UnsourcedException {
		CharacterParseData data = new CharacterParseData(text, kind);

		while (data.index < data.totalLength)
			list.add(parseNextCharacter(data));
	}

	/**
	 * Splits the text of a string literal token into prefix and core, and
	 * determines the string kind from the prefix: no prefix (CHAR), L (WCHAR),
	 * U (CHAR32), u (CHAR16) or u8 (UTF_8). For a token of the Fortran
	 * vocabulary whose text is enclosed in single quotes, those quotes are
	 * first replaced by double quotes.
	 * 
	 * @param token
	 *                  a string literal token
	 * @return the analysis holding the token, its string kind and core text
	 * @throws SyntaxException
	 *                             if the text is too short, has an illegal
	 *                             prefix, or lacks the opening or closing
	 *                             double quote
	 */
	private StringAnalysis stringAnalyze(CivlcToken token)
			throws SyntaxException {
		String text = token.getText();
		int length = text.length();
		char prefix0;
		int quotePos;
		StringKind kind;

		// Check the length first: the quote replacement below takes
		// substring(1, length - 1), which is out of bounds for the
		// one-character text "'".
		if (length < 2)
			throw tokenFactory.newSyntaxException("Malformed string literal",
					token);
		if (token.getTokenVocab() == TokenVocabulary.FORTRAN && //
				text.startsWith("'") && text.endsWith("'"))
			// same length as before, so "length" stays valid
			text = "\"" + text.substring(1, length - 1) + "\"";
		if (text.charAt(length - 1) != '"')
			throw tokenFactory.newSyntaxException("Malformed string literal",
					token);
		prefix0 = text.charAt(0);
		switch (prefix0) {
			case '"' :
				kind = StringKind.CHAR;
				quotePos = 0;
				break;
			case 'L' :
				kind = StringKind.WCHAR;
				quotePos = 1;
				break;
			case 'U' :
				kind = StringKind.CHAR32;
				quotePos = 1;
				break;
			case 'u' : {
				// length >= 2 was checked above, so index 1 exists
				char prefix1 = text.charAt(1);

				switch (prefix1) {
					case '"' :
						kind = StringKind.CHAR16;
						quotePos = 1;
						break;
					case '8' :
						kind = StringKind.UTF_8;
						quotePos = 2;
						break;
					default :
						throw tokenFactory.newSyntaxException(
								"Illegal string prefix", token);
				}
				break;
			}
			default :
				throw tokenFactory.newSyntaxException("Illegal string prefix",
						token);
		}
		// the open and close quotes must be two distinct characters
		if (length < quotePos + 2)
			throw tokenFactory.newSyntaxException("Malformed string literal",
					token);
		if (text.charAt(quotePos) != '"')
			throw tokenFactory.newSyntaxException(
					"String literal missing open quote", token);
		return new StringAnalysis(token, kind,
				text.substring(quotePos + 1, length - 1));
	}

	/**
	 * Computes the string kind of the concatenation of the given analyzed
	 * string literals. Equal kinds combine to that kind, CHAR combined with
	 * any other kind yields the other kind, and any other combination of two
	 * different kinds is an error.
	 * 
	 * @param analyses
	 *                     the analyses of the adjacent string literals, in
	 *                     order
	 * @return the kind of the concatenation, or <code>null</code> if the
	 *         given list is empty
	 * @throws SyntaxException
	 *                             if two of the kinds are incompatible
	 */
	private StringKind concatKind(List<StringAnalysis> analyses)
			throws SyntaxException {
		StringKind kind = null;

		for (StringAnalysis analysis : analyses) {
			StringKind newKind = analysis.kind;

			if (kind == null || kind == StringKind.CHAR) {
				// nothing seen yet, or only unprefixed strings so far
				kind = newKind;
			} else if (newKind != kind && newKind != StringKind.CHAR) {
				throw tokenFactory.newSyntaxException(
						"Adjacent string literals have incompatible types: "
								+ kind + ", " + newKind,
						analysis.token);
			}
		}
		return kind;
	}

	/**
	 * Returns the character kind of the elements of a string of the given
	 * kind. Both CHAR and UTF_8 strings consist of CHAR characters.
	 * 
	 * @param stringKind
	 *                       a non-null string kind
	 * @return the corresponding character kind
	 */
	private CharacterKind characterKind(StringKind stringKind) {
		switch (stringKind) {
			case CHAR :
			case UTF_8 :
				return CharacterKind.CHAR;
			case WCHAR :
				return CharacterKind.WCHAR;
			case CHAR16 :
				return CharacterKind.CHAR16;
			case CHAR32 :
				return CharacterKind.CHAR32;
			default :
				throw new RuntimeException("unreachable");
		}
	}

	/**
	 * Determines whether the given character is a hexadecimal digit
	 * (0-9, a-f, A-F).
	 */
	private boolean isHex(char c) {
		return '0' <= c && c <= '9' || 'a' <= c && c <= 'f'
				|| 'A' <= c && c <= 'F';
	}

	/**
	 * Determines whether the given character is an octal digit (0-7).
	 */
	private boolean isOctal(char c) {
		return '0' <= c && c <= '7';
	}

	/**
	 * Parses the next character from the given parse data, starting at
	 * <code>data.index</code>, and advances <code>data.index</code> past the
	 * consumed text. The index is left unchanged if an exception is thrown.
	 * 
	 * Precondition: <code>data.index &lt; data.totalLength</code>.
	 * 
	 * @param data
	 *                 the text being parsed and the current position in it
	 * @return the canonic execution character for the parsed character
	 * @throws UnsourcedException
	 *                                if the text ends right after a backslash,
	 *                                if an escape sequence is unknown, or if a
	 *                                hexadecimal escape has no digits or does
	 *                                not denote a valid Unicode code point
	 */
	private ExecutionCharacter parseNextCharacter(CharacterParseData data)
			throws UnsourcedException {
		String text = data.string;
		int index = data.index;
		int totalLength = data.totalLength;
		char first = text.charAt(index);
		int codePoint;
		char[] characters;

		// cases: (1) single char (2) escape sequence (3) \ followed by
		// 1, 2, or 3 octal digits, or (4) \x followed by sequence
		// of hex digits. Read maximal substring that matches
		// one of those patterns.

		index++; // consumed first character
		if (first == '\\') {
			char second;

			if (index >= totalLength)
				throw tokenFactory.newUnsourcedException(
						"Malformed character constant: " + text);
			second = text.charAt(index);
			index++; // consume second
			if (second == 'x') { // hex: sequence of hex digits
				int start = index;
				String hexString;

				// consume all hex digits...
				while (index < totalLength && isHex(text.charAt(index)))
					index++;
				hexString = text.substring(start, index);
				try {
					// fails if there are no digits or the value overflows
					codePoint = Integer.parseInt(hexString, 16);
				} catch (NumberFormatException e) {
					throw tokenFactory.newUnsourcedException(
							"Malformed character constant: " + text);
				}
				// Character.toChars throws IllegalArgumentException on an
				// invalid code point; report it as a malformed constant
				if (!Character.isValidCodePoint(codePoint))
					throw tokenFactory.newUnsourcedException(
							"Malformed character constant: " + text);
				characters = Character.toChars(codePoint);
			} else if (isOctal(second)) { // octal: 1, 2, or 3 digits
				int start = index - 1; // position of the first octal digit
				String octalString;

				if (index < totalLength && isOctal(text.charAt(index))) {
					index++; // consume second octal digit
					if (index < totalLength && isOctal(text.charAt(index)))
						index++; // consume third octal digit
				}
				octalString = text.substring(start, index);
				try {
					codePoint = Integer.parseInt(octalString, 8);
				} catch (NumberFormatException e) {
					throw tokenFactory.newUnsourcedException(
							"Malformed character constant: " + text);
				}
				// at most 0777, so always a valid code point
				characters = Character.toChars(codePoint);
			} else { // escape sequence
				characters = new char[1];
				switch (second) {
					case '\'' :
						characters[0] = '\'';
						break;
					case '"' :
						characters[0] = '"';
						break;
					case '?' :
						characters[0] = '?';
						break;
					case '\\' :
						characters[0] = '\\';
						break;
					case 'a' :
						characters[0] = '\007'; // alert
						break;
					case 'b' :
						characters[0] = '\b'; // backspace
						break;
					case 'f' :
						characters[0] = '\f'; // formfeed
						break;
					case 'n' :
						characters[0] = '\n'; // newline
						break;
					case 'r' :
						characters[0] = '\r'; // return
						break;
					case 't' :
						characters[0] = '\t'; // tab
						break;
					case 'v' :
						characters[0] = '\013'; // vertical tab
						break;
					default :
						throw tokenFactory.newUnsourcedException(
								"Unknown escape sequence in character");
				}
				codePoint = (int) characters[0];
			}
		} else {
			characters = new char[]{first};
			codePoint = (int) first;
		}
		data.index = index;
		return executionCharacter(data.kind, codePoint, characters);
	}

}