Introduced token annotations
parent 2a509b158f
commit a5089e978e
langutils.d: 31 lines changed
@@ -266,7 +266,7 @@ enum TokenType: uint
     UnsignedIntLiteral, /// 123u
     UnsignedLongLiteral, /// 123uL
     NUMBERS_END,
-    STRINGS_BEGIN,
+    STRINGS_BEGIN, // note: an alternative way to pass information about the string postfix is to use TokenAnnotation
     DStringLiteral, /// "32-bit character string"d
     StringLiteral, /// "a string"
     WStringLiteral, /// "16-bit character string"w
@@ -402,9 +402,38 @@ static this()
     ];
 }
 
+/**
+ * Token annotations are used to pass metadata about tokens.
+ */
+enum TokenAnnotation
+{
+    None = 0x0, // no annotations by default
+
+    // validity
+    Invalid = 0x1, // token lexing failed
+    Unterminated = 0x2 | Invalid, // lexing failed because the token was not terminated correctly // todo: what other reasons could there be to fail?
+
+    // character or string literals
+    TextLiteral = 0x4, // either a character literal or a string literal
+    SomeString = 0x8 | TextLiteral, // string, wstring or dstring literal (this annotation is used alone when no string postfix is specified)
+    SomeCharacter = 0x10 | TextLiteral, // char, wchar or dchar literal (depending on its value)
+    NarrowText = 0x20 | TextLiteral, // string or wstring, but not dstring; char or wchar, but not dchar
+    TextC = 0x40 | NarrowText, // string (c postfix) or char with value < 0x80, except EoL, EoF and escaped Unicode literals starting with \u or \U
+    TextW = 0x80 | NarrowText, // wstring (w postfix) or wchar with value in [0x80..0xD800) U [0xE000..0xFFFE), excluding [0x2028..0x2029], or an escaped Unicode literal \uXXXX
+    TextD = 0x100 | TextLiteral, // dstring (d postfix) or dchar (escaped Unicode literal \UXXXXXXXX or any value that is not a char or wchar)
+    WysiwygString = 0x200 | SomeString, // example usage: WysiwygString | TextC
+    AlternateWysiwygString = 0x400 | SomeString,
+    DoubleQuotedString = 0x800 | SomeString,
+    HexString = 0x1000 | SomeString,
+    // note: the specification doesn't include the postfixes (c, w, d) for DelimitedString and TokenString, but DMD supports them
+    DelimitedString = 0x2000 | SomeString, // note: the delimiter is included in the token value along with the double quotes
+    TokenString = 0x4000 | SomeString,
+}
+
 struct Token
 {
     TokenType type;
+    TokenAnnotation annotations;
     string value;
     uint lineNumber;
     size_t startIndex;
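Because each annotation value embeds the bits of its parent category (WysiwygString contains SomeString, which in turn contains TextLiteral), annotations should be tested with a masked comparison rather than plain equality. A minimal sketch of such a check against the declarations introduced above; the hasAnnotation helper, the module name and the cast are illustrative assumptions, not part of this commit:

import langutils; // assuming the module name matches the file name

/// Hypothetical helper: true if every bit of `flag` is set on the token.
bool hasAnnotation(in Token token, TokenAnnotation flag)
{
    return (token.annotations & flag) == flag;
}

unittest
{
    Token token;
    token.type = TokenType.StringLiteral;
    // A wysiwyg string with a c postfix, per the "example usage: WysiwygString | TextC" comment.
    // The cast is needed because combining two members yields the base type (uint).
    token.annotations = cast(TokenAnnotation)(TokenAnnotation.WysiwygString | TokenAnnotation.TextC);

    assert(hasAnnotation(token, TokenAnnotation.WysiwygString));
    assert(hasAnnotation(token, TokenAnnotation.SomeString));  // implied by WysiwygString
    assert(hasAnnotation(token, TokenAnnotation.TextLiteral)); // implied by SomeString
    assert(!hasAnnotation(token, TokenAnnotation.HexString));  // the 0x1000 bit is not set
}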
tokenizer.d: 39 lines changed
@@ -168,7 +168,7 @@ body
 }
 
 /**
- * Lexes the various crazy D string literals such as q{}, q"WTF is this? WTF",
+ * Lexes the various crazy D string literals such as q"{}", q"WTF is this? WTF",
  * and q"<>".
  * Params:
  *     inputString = the source code to examine
@@ -228,16 +228,28 @@ string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
     }
     if (!isEoF(inputString, endIndex) && inputString[endIndex] == '"')
         ++endIndex;
+    // note: the specification doesn't mention postfixes for delimited and token string literals, but DMD supports them
+    if (!isEoF(inputString, endIndex) && (inputString[endIndex] == 'w'
+        || inputString[endIndex] == 'd' || inputString[endIndex] == 'c'))
+    {
+        ++endIndex; // todo: add a token annotation according to the postfix
+    }
     return inputString[startIndex .. endIndex];
 }
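The "todo: add a token annotation according to the postfix" above could be resolved with a small helper that maps the postfix character to an annotation. The mapping below (c to TextC, w to TextW, d to TextD) follows the comments on the TokenAnnotation enum in langutils.d; the name annotationForPostfix and the idea of returning the value to the caller are assumptions, not the project's implementation:

import langutils; // assuming the module name matches the file name

/// Hypothetical helper: map a string postfix character to its annotation.
/// Returns TokenAnnotation.None for anything that is not a recognised postfix.
TokenAnnotation annotationForPostfix(dchar postfix)
{
    switch (postfix)
    {
        case 'c': return TokenAnnotation.TextC;
        case 'w': return TokenAnnotation.TextW;
        case 'd': return TokenAnnotation.TextD;
        default:  return TokenAnnotation.None;
    }
}

Since lexDelimitedString only returns the token text, the result would still have to reach currentToken.annotations, for example through a ref TokenAnnotation out-parameter or by re-inspecting the postfix in the tokenize() switch.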
 string lexTokenString(S)(ref S inputString, ref size_t endIndex, ref uint lineNumber)
 {
     /+auto r = byDToken(range, IterationStyle.EVERYTHING);
     string s = getBraceContent(r);
     range.popFrontN(s.length);
     return s;+/
+    //// note: the specification doesn't mention postfixes for delimited and token string literals, but DMD supports them
+    //if (!isEoF(inputString, endIndex) && (inputString[endIndex] == 'w'
+    //    || inputString[endIndex] == 'd' || inputString[endIndex] == 'c'))
+    //{
+    //    ++endIndex; // todo: add a token annotation according to the postfix
+    //}
     return "";
 }
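lexTokenString itself is still a stub that returns an empty string. Below is a deliberately simplified sketch of how the stub might be filled in, assuming endIndex points at the opening brace and reusing the isEoF helper from this file; a correct implementation has to lex the body as D tokens so that braces inside nested string literals and comments are not miscounted, which this plain brace counter does not do:

// Simplified sketch: scan from the opening '{' to the matching '}' by counting
// braces, tracking line breaks along the way, then accept an optional postfix.
string lexTokenString(S)(ref S inputString, ref size_t endIndex, ref uint lineNumber)
{
    size_t startIndex = endIndex; // assumes inputString[endIndex] == '{'
    int depth = 0;
    while (!isEoF(inputString, endIndex))
    {
        auto c = inputString[endIndex];
        if (c == '\n')
            ++lineNumber;
        else if (c == '{')
            ++depth;
        else if (c == '}')
        {
            --depth;
            if (depth == 0)
            {
                ++endIndex; // consume the closing brace
                break;
            }
        }
        ++endIndex;
    }
    // if we ran off the end, a real implementation could mark the token
    // with TokenAnnotation.Unterminated here
    // same postfix handling as lexDelimitedString above
    if (!isEoF(inputString, endIndex) && (inputString[endIndex] == 'c'
        || inputString[endIndex] == 'w' || inputString[endIndex] == 'd'))
    {
        ++endIndex;
    }
    return inputString[startIndex .. endIndex];
}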
@@ -562,7 +574,7 @@ nothrow void lexHex(S)(ref S inputString, ref size_t startIndex,
 
 
 /**
- * Returns: true if ch marks the ending of one token and the beginning of
+ * Returns: true if ch marks the ending of one token and the beginning of
  * another, false otherwise
  */
 pure nothrow bool isSeparating(C)(C ch) if (isSomeChar!C)
@@ -746,25 +758,38 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
         currentToken.value = lexString(inputString, endIndex,
             lineNumber, inputString[endIndex], false);
         currentToken.type = TokenType.StringLiteral;
+        currentToken.annotations |= TokenAnnotation.WysiwygString;
         break;
     case '`':
         currentToken.lineNumber = lineNumber;
         currentToken.value = lexString(inputString, endIndex, lineNumber,
             inputString[endIndex], false);
         currentToken.type = TokenType.StringLiteral;
+        currentToken.annotations |= TokenAnnotation.AlternateWysiwygString;
         break;
-    case 'x':
+    case 'x':
         ++endIndex;
         if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
             goto default;
         else
             goto case '"'; // BUG: this is incorrect! according to the specification, hex data should be lexed differently than "normal" strings
         currentToken.lineNumber = lineNumber;
         currentToken.value = lexString(inputString, endIndex, lineNumber,
             inputString[endIndex]); // todo: create a lexHexString function
         currentToken.type = TokenType.StringLiteral;
+        currentToken.annotations |= TokenAnnotation.HexString;
         break;
     case '\'':
         currentToken.lineNumber = lineNumber;
         currentToken.value = lexString(inputString, endIndex, lineNumber,
             inputString[endIndex]); // todo: create a dedicated function for lexing character literals
         currentToken.type = TokenType.StringLiteral;
+        currentToken.annotations |= TokenAnnotation.SomeCharacter;
         break;
     case '"':
         currentToken.lineNumber = lineNumber;
         currentToken.value = lexString(inputString, endIndex, lineNumber,
             inputString[endIndex]);
         currentToken.type = TokenType.StringLiteral;
+        currentToken.annotations |= TokenAnnotation.DoubleQuotedString;
         break;
     case 'q':
         currentToken.value = "q";
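The BUG and todo notes in the case 'x' block above ask for a dedicated hex string lexer. A possible sketch, assuming endIndex points at the opening double quote (the x has already been consumed) and that only hex digits and whitespace may appear between the quotes, as the language specification describes for hex strings; the name lexHexString comes from the todo, but the body is an illustration, not the project's implementation:

import std.ascii : isHexDigit, isWhite;

// Sketch: consume x"..." content, allowing only hex digits and whitespace,
// and return the slice including both quotes. Reuses isEoF from this file.
string lexHexString(S)(ref S inputString, ref size_t endIndex, ref uint lineNumber)
{
    size_t startIndex = endIndex;
    ++endIndex; // skip the opening double quote
    while (!isEoF(inputString, endIndex) && inputString[endIndex] != '"')
    {
        auto c = inputString[endIndex];
        if (c == '\n')
            ++lineNumber;
        else if (!isHexDigit(c) && !isWhite(c))
            break; // invalid character; a real implementation could set TokenAnnotation.Invalid
        ++endIndex;
    }
    if (!isEoF(inputString, endIndex) && inputString[endIndex] == '"')
        ++endIndex; // consume the closing quote
    return inputString[startIndex .. endIndex];
}

The caller in tokenize() would still have to decide whether the leading x stays in the token value, much like the q prefix is kept for delimited strings below.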
@@ -778,12 +803,14 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
             currentToken.value ~= lexDelimitedString(inputString,
                 endIndex, lineNumber);
             currentToken.type = TokenType.StringLiteral;
+            currentToken.annotations |= TokenAnnotation.DelimitedString;
             break outerSwitch;
         case '{':
             currentToken.lineNumber = lineNumber;
             currentToken.value ~= lexTokenString(inputString,
                 endIndex, lineNumber);
             currentToken.type = TokenType.StringLiteral;
+            currentToken.annotations |= TokenAnnotation.TokenString;
             break outerSwitch;
         default:
             break;
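Taken together, the annotations let a caller tell the literal forms apart without re-parsing the token text. A hypothetical usage sketch; it assumes langutils.d and tokenizer.d declare modules of the same names and that tokenize() has the signature shown in the hunk header above:

import std.stdio;
import langutils;  // Token, TokenType, TokenAnnotation
import tokenizer;  // tokenize()

void main()
{
    // The flags embed their parent bits (every string-kind value contains
    // SomeString | TextLiteral), so test with a masked comparison.
    static bool has(Token t, TokenAnnotation a) { return (t.annotations & a) == a; }

    string source = `auto a = "plain"; auto b = r"wysiwyg";`;
    foreach (token; tokenize(source))
    {
        if (token.type != TokenType.StringLiteral)
            continue;
        if (has(token, TokenAnnotation.WysiwygString))
            writeln("wysiwyg string: ", token.value);
        else if (has(token, TokenAnnotation.DoubleQuotedString))
            writeln("double-quoted string: ", token.value);
    }
}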