Introduced token annotations

This commit is contained in:
Roman D. Boiko 2012-04-28 19:30:01 +03:00
parent 2a509b158f
commit a5089e978e
2 changed files with 63 additions and 7 deletions

View File

@ -266,7 +266,7 @@ enum TokenType: uint
UnsignedIntLiteral, /// 123u UnsignedIntLiteral, /// 123u
UnsignedLongLiteral, /// 123uL UnsignedLongLiteral, /// 123uL
NUMBERS_END, NUMBERS_END,
STRINGS_BEGIN, STRINGS_BEGIN, // note: alternative way to pass information about string postfix is to use TokenAnnotation
DStringLiteral, /// "32-bit character string"d DStringLiteral, /// "32-bit character string"d
StringLiteral, /// "a string" StringLiteral, /// "a string"
WStringLiteral, /// "16-bit character string"w WStringLiteral, /// "16-bit character string"w
@ -402,9 +402,38 @@ static this()
]; ];
} }
/**
 * Bit flags carrying extra lexical metadata about a token: validity,
 * string-vs-character kind, encoding postfix, and quoting style.
 *
 * Flags compose: each more specific flag ORs in the more general flag(s)
 * it implies, so a test such as `annotations & TokenAnnotation.TextLiteral`
 * matches any string or character literal, and
 * `annotations & TokenAnnotation.Invalid` matches any failed token
 * (including Unterminated).
 */
enum TokenAnnotation
{
None = 0x0, // default: no metadata recorded for this token
// --- validity ---
Invalid = 0x1, // lexing of this token failed
Unterminated = 0x2 | Invalid, // lexing failed specifically because the token was not terminated correctly (implies Invalid) // todo: what could be other reasons to fail?
// --- character or string literals ---
TextLiteral = 0x4, // any character literal or string literal
SomeString = 0x8 | TextLiteral, // string, wstring or dstring literal (used alone when no string postfix is specified)
SomeCharacter = 0x10 | TextLiteral, // char, wchar or dchar literal (depending on its value)
NarrowText = 0x20 | TextLiteral, // string or wstring, but not dstring; char or wchar, but not dchar
TextC = 0x40 | NarrowText, // string ('c' postfix) or char with value < 0x80, except EoL, EoF and escaped Unicode literals starting with \u or \U
TextW = 0x80 | NarrowText, // wstring ('w' postfix) or wchar whose value is in [0xE000..0xFFFE) U [0x80..0xD800) \ [0x2028..0x2029], or an escaped Unicode literal \uXXXX
TextD = 0x100 | TextLiteral, // dstring ('d' postfix) or dchar (escaped Unicode literal \Uxxxxxxxx, or any value that fits neither char nor wchar); deliberately does NOT imply NarrowText
// --- quoting style (each implies SomeString) ---
WysiwygString = 0x200 | SomeString, // wysiwyg string literal; example usage: WysiwygString | TextC
AlternateWysiwygString = 0x400 | SomeString, // backquoted `...` wysiwyg form
DoubleQuotedString = 0x800 | SomeString, // conventional "..." literal
HexString = 0x1000 | SomeString, // x"..." hex-data literal
// note: the language specification doesn't include postfixes (c, w, d) for DelimitedString and TokenString, but DMD supports them
DelimitedString = 0x2000 | SomeString, // q"..." form; note: the delimiter is included in the token value along with the double quotes
TokenString = 0x4000 | SomeString, // q{...} token-string form
}
struct Token struct Token
{ {
TokenType type; TokenType type;
TokenAnnotation annotations;
string value; string value;
uint lineNumber; uint lineNumber;
size_t startIndex; size_t startIndex;

View File

@ -168,7 +168,7 @@ body
} }
/** /**
* Lexes the various crazy D string literals such as q{}, q"WTF is this? WTF", * Lexes the various crazy D string literals such as q"{}", q"WTF is this? WTF",
* and q"<>". * and q"<>".
* Params: * Params:
* inputString = the source code to examine * inputString = the source code to examine
@ -228,16 +228,28 @@ string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
} }
if (!isEoF(inputString, endIndex) && inputString[endIndex] == '"') if (!isEoF(inputString, endIndex) && inputString[endIndex] == '"')
++endIndex; ++endIndex;
// note: specification doesn't mention postfixes for delimited and token string literals, but DMD supports them
if (!isEoF(inputString, endIndex) && (inputString[endIndex] == 'w'
|| inputString[endIndex] == 'd' || inputString[endIndex] == 'c'))
{
++endIndex; // todo: add token annotation according to postfix
}
return inputString[startIndex .. endIndex]; return inputString[startIndex .. endIndex];
} }
string lexTokenString(S)(ref S inputString, ref size_t endIndex, ref uint lineNumber) string lexTokenString(S)(ref S inputString, ref size_t endIndex, ref uint lineNumber)
{ {
/+auto r = byDToken(range, IterationStyle.EVERYTHING); /+auto r = byDToken(range, IterationStyle.EVERYTHING);
string s = getBraceContent(r); string s = getBraceContent(r);
range.popFrontN(s.length); range.popFrontN(s.length);
return s;+/ return s;+/
//// note: specification doesn't mention postfixes for delimited and token string literals, but DMD supports them
//if (!isEoF(inputString, endIndex) && (inputString[endIndex] == 'w'
// || inputString[endIndex] == 'd' || inputString[endIndex] == 'c'))
//{
// ++endIndex; // todo: add token annotation according to postfix
//}
return ""; return "";
} }
@ -562,7 +574,7 @@ nothrow void lexHex(S)(ref S inputString, ref size_t startIndex,
/** /**
* Returns: true if ch marks the ending of one token and the beginning of * Returns: true if ch marks the ending of one token and the beginning of
* another, false otherwise * another, false otherwise
*/ */
pure nothrow bool isSeparating(C)(C ch) if (isSomeChar!C) pure nothrow bool isSeparating(C)(C ch) if (isSomeChar!C)
@ -746,25 +758,38 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
currentToken.value = lexString(inputString, endIndex, currentToken.value = lexString(inputString, endIndex,
lineNumber, inputString[endIndex], false); lineNumber, inputString[endIndex], false);
currentToken.type = TokenType.StringLiteral; currentToken.type = TokenType.StringLiteral;
currentToken.annotations |= TokenAnnotation.WysiwygString;
break; break;
case '`': case '`':
currentToken.lineNumber = lineNumber; currentToken.lineNumber = lineNumber;
currentToken.value = lexString(inputString, endIndex, lineNumber, currentToken.value = lexString(inputString, endIndex, lineNumber,
inputString[endIndex], false); inputString[endIndex], false);
currentToken.type = TokenType.StringLiteral; currentToken.type = TokenType.StringLiteral;
currentToken.annotations |= TokenAnnotation.AlternateWysiwygString;
break; break;
case 'x': case 'x':
++endIndex; ++endIndex;
if (isEoF(inputString, endIndex) || inputString[endIndex] != '"') if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
goto default; goto default;
else currentToken.lineNumber = lineNumber;
goto case '"'; // BUG: this is incorrect! according to specification, hex data should be lexed differently than "normal" strings currentToken.value = lexString(inputString, endIndex, lineNumber,
inputString[endIndex]); // todo: create lexHexString function
currentToken.type = TokenType.StringLiteral;
currentToken.annotations |= TokenAnnotation.HexString;
break;
case '\'': case '\'':
currentToken.lineNumber = lineNumber;
currentToken.value = lexString(inputString, endIndex, lineNumber,
inputString[endIndex]); // todo: create dedicated function for lexing character literals
currentToken.type = TokenType.StringLiteral;
currentToken.annotations |= TokenAnnotation.SomeCharacter;
break;
case '"': case '"':
currentToken.lineNumber = lineNumber; currentToken.lineNumber = lineNumber;
currentToken.value = lexString(inputString, endIndex, lineNumber, currentToken.value = lexString(inputString, endIndex, lineNumber,
inputString[endIndex]); inputString[endIndex]);
currentToken.type = TokenType.StringLiteral; currentToken.type = TokenType.StringLiteral;
currentToken.annotations |= TokenAnnotation.DoubleQuotedString;
break; break;
case 'q': case 'q':
currentToken.value = "q"; currentToken.value = "q";
@ -778,12 +803,14 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
currentToken.value ~= lexDelimitedString(inputString, currentToken.value ~= lexDelimitedString(inputString,
endIndex, lineNumber); endIndex, lineNumber);
currentToken.type = TokenType.StringLiteral; currentToken.type = TokenType.StringLiteral;
currentToken.annotations |= TokenAnnotation.DelimitedString;
break outerSwitch; break outerSwitch;
case '{': case '{':
currentToken.lineNumber = lineNumber; currentToken.lineNumber = lineNumber;
currentToken.value ~= lexTokenString(inputString, currentToken.value ~= lexTokenString(inputString,
endIndex, lineNumber); endIndex, lineNumber);
currentToken.type = TokenType.StringLiteral; currentToken.type = TokenType.StringLiteral;
currentToken.annotations |= TokenAnnotation.TokenString;
break outerSwitch; break outerSwitch;
default: default:
break; break;