Introduced token annotations
parent 2a509b158f · commit a5089e978e

langutils.d (31 lines changed)
@@ -266,7 +266,7 @@ enum TokenType: uint
 	UnsignedIntLiteral, /// 123u
 	UnsignedLongLiteral, /// 123uL
 	NUMBERS_END,
-	STRINGS_BEGIN,
+	STRINGS_BEGIN, // note: an alternative way to pass information about the string postfix is to use TokenAnnotation
 	DStringLiteral, /// "32-bit character string"d
 	StringLiteral, /// "a string"
 	WStringLiteral, /// "16-bit character string"w
@@ -402,9 +402,38 @@ static this()
 	];
 }
 
+/**
+ * Token annotations are useful to pass meta-information about tokens
+ */
+enum TokenAnnotation
+{
+	None = 0x0, // no annotations by default
+
+	// validity
+	Invalid = 0x1, // token lexing failed
+	Unterminated = 0x2 | Invalid, // lexing failed because the token has not been terminated correctly // todo: what could be other reasons to fail?
+
+	// character or string literals
+	TextLiteral = 0x4, // either a character literal or a string literal
+	SomeString = 0x8 | TextLiteral, // string, wstring or dstring literal (this annotation is used alone when the string postfix is not specified)
+	SomeCharacter = 0x10 | TextLiteral, // char, wchar or dchar literal (depending on its value)
+	NarrowText = 0x20 | TextLiteral, // string or wstring, but not dstring; char or wchar, but not dchar
+	TextC = 0x40 | NarrowText, // string (c postfix) or char with value < 0x80, except EoL, EoF and escaped Unicode literals starting with \u or \U
+	TextW = 0x80 | NarrowText, // wstring (w postfix) or wchar whose value is in [0xE000..0xFFFE) U [0x80..0xD800) \ [0x2028..0x2029], or an escaped Unicode literal \uXXXX
+	TextD = 0x100 | TextLiteral, // dstring (d postfix) or dchar (escaped Unicode literal \Uxxxxxxxx or any value that is not char or wchar)
+	WysiwygString = 0x200 | SomeString, // example usage: WysiwygString | TextC
+	AlternateWysiwygString = 0x400 | SomeString,
+	DoubleQuotedString = 0x800 | SomeString,
+	HexString = 0x1000 | SomeString,
+	// note: the specification doesn't include postfixes (c, w, d) for DelimitedString and TokenString, but DMD supports them
+	DelimitedString = 0x2000 | SomeString, // note: the delimiter is included in the token value along with the double quotes
+	TokenString = 0x4000 | SomeString,
+}
+
 struct Token
 {
 	TokenType type;
+	TokenAnnotation annotations;
 	string value;
 	uint lineNumber;
 	size_t startIndex;
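The annotation values are bit flags that embed their parent categories: WysiwygString, for example, carries the SomeString and TextLiteral bits, so a consumer can match a whole family of literals with a single mask. A minimal sketch of such a check, assuming the Token and TokenAnnotation definitions above (the hasAnnotation helper is hypothetical and not part of this commit):

// Hypothetical helper, not part of this commit: true when every bit of
// `flag` is also set in `annotations`.
bool hasAnnotation(TokenAnnotation annotations, TokenAnnotation flag)
{
	return (annotations & flag) == flag;
}

// WysiwygString (0x200 | SomeString) still satisfies the broader SomeString
// and TextLiteral masks, because its value includes their bits.
static assert((TokenAnnotation.WysiwygString & TokenAnnotation.SomeString)
	== TokenAnnotation.SomeString);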
tokenizer.d (35 lines changed)
@@ -168,7 +168,7 @@ body
 }
 
 /**
- * Lexes the various crazy D string literals such as q{}, q"WTF is this? WTF",
+ * Lexes the various crazy D string literals such as q"{}", q"WTF is this? WTF",
  * and q"<>".
  * Params:
  *     inputString = the source code to examine
@@ -228,16 +228,28 @@ string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
 	}
 	if (!isEoF(inputString, endIndex) && inputString[endIndex] == '"')
 		++endIndex;
+	// note: specification doesn't mention postfixes for delimited and token string literals, but DMD supports them
+	if (!isEoF(inputString, endIndex) && (inputString[endIndex] == 'w'
+		|| inputString[endIndex] == 'd' || inputString[endIndex] == 'c'))
+	{
+		++endIndex; // todo: add token annotation according to postfix
+	}
 	return inputString[startIndex .. endIndex];
 }
 
 
 string lexTokenString(S)(ref S inputString, ref size_t endIndex, ref uint lineNumber)
 {
 	/+auto r = byDToken(range, IterationStyle.EVERYTHING);
 	string s = getBraceContent(r);
 	range.popFrontN(s.length);
 	return s;+/
 
+	//// note: specification doesn't mention postfixes for delimited and token string literals, but DMD supports them
+	//if (!isEoF(inputString, endIndex) && (inputString[endIndex] == 'w'
+	//	|| inputString[endIndex] == 'd' || inputString[endIndex] == 'c'))
+	//{
+	//	++endIndex; // todo: add token annotation according to postfix
+	//}
 	return "";
 }
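The todo in the postfix branch above leaves open how the annotation would actually be attached, since lexDelimitedString only returns the consumed slice. One possibility, sketched under the assumption that the caller receives the result from a small helper; annotationForPostfix is hypothetical and not part of this commit:

// Hypothetical helper, not part of this commit: map a string postfix
// character to the matching width annotation from langutils.d.
TokenAnnotation annotationForPostfix(char postfix)
{
	switch (postfix)
	{
		case 'c': return TokenAnnotation.TextC;
		case 'w': return TokenAnnotation.TextW;
		case 'd': return TokenAnnotation.TextD;
		default:  return TokenAnnotation.None;
	}
}

The tokenize() cases below could then combine such a result with DelimitedString or TokenString when the postfix is present.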
@@ -746,25 +758,38 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
 			currentToken.value = lexString(inputString, endIndex,
 				lineNumber, inputString[endIndex], false);
 			currentToken.type = TokenType.StringLiteral;
+			currentToken.annotations |= TokenAnnotation.WysiwygString;
 			break;
 		case '`':
 			currentToken.lineNumber = lineNumber;
 			currentToken.value = lexString(inputString, endIndex, lineNumber,
 				inputString[endIndex], false);
 			currentToken.type = TokenType.StringLiteral;
+			currentToken.annotations |= TokenAnnotation.AlternateWysiwygString;
 			break;
 		case 'x':
 			++endIndex;
 			if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
 				goto default;
-			else
-				goto case '"'; // BUG: this is incorrect! according to specification, hex data should be lexed differently than "normal" strings
+			currentToken.lineNumber = lineNumber;
+			currentToken.value = lexString(inputString, endIndex, lineNumber,
+				inputString[endIndex]); // todo: create lexHexString function
+			currentToken.type = TokenType.StringLiteral;
+			currentToken.annotations |= TokenAnnotation.HexString;
+			break;
 		case '\'':
+			currentToken.lineNumber = lineNumber;
+			currentToken.value = lexString(inputString, endIndex, lineNumber,
+				inputString[endIndex]); // todo: create dedicated function for lexing character literals
+			currentToken.type = TokenType.StringLiteral;
+			currentToken.annotations |= TokenAnnotation.SomeCharacter;
+			break;
 		case '"':
 			currentToken.lineNumber = lineNumber;
 			currentToken.value = lexString(inputString, endIndex, lineNumber,
 				inputString[endIndex]);
 			currentToken.type = TokenType.StringLiteral;
+			currentToken.annotations |= TokenAnnotation.DoubleQuotedString;
 			break;
 		case 'q':
 			currentToken.value = "q";
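The 'x' case above drops the removed BUG note but still funnels hex strings through lexString; its todo asks for a dedicated lexHexString. A rough sketch of what that function might look like, assuming the same ref-parameter conventions as lexString and the existing isEoF helper; the hex-digit validation a real implementation would need is left out:

// Hypothetical sketch, not part of this commit: scan from the opening '"'
// (where endIndex points when the 'x' case reaches this call) to the
// closing '"', tracking line numbers, and return the consumed slice.
string lexHexString(S)(ref S inputString, ref size_t endIndex, ref uint lineNumber)
{
	auto startIndex = endIndex;
	++endIndex; // skip the opening double quote
	while (!isEoF(inputString, endIndex) && inputString[endIndex] != '"')
	{
		if (inputString[endIndex] == '\n')
			++lineNumber;
		++endIndex; // accepts any character here; hex-digit checks omitted
	}
	if (!isEoF(inputString, endIndex))
		++endIndex; // consume the closing double quote
	return inputString[startIndex .. endIndex];
}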
@@ -778,12 +803,14 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
 				currentToken.value ~= lexDelimitedString(inputString,
 					endIndex, lineNumber);
 				currentToken.type = TokenType.StringLiteral;
+				currentToken.annotations |= TokenAnnotation.DelimitedString;
 				break outerSwitch;
 			case '{':
 				currentToken.lineNumber = lineNumber;
 				currentToken.value ~= lexTokenString(inputString,
 					endIndex, lineNumber);
 				currentToken.type = TokenType.StringLiteral;
+				currentToken.annotations |= TokenAnnotation.TokenString;
 				break outerSwitch;
 			default:
 				break;
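With these cases in place, every string-like literal leaves tokenize() with at least one annotation bit set. A rough usage sketch, not from this commit; the literal forms and the output formatting are illustrative only:

import std.stdio;

void main()
{
	auto source = "r\"wysiwyg\" `alternate` x\"DEADBEEF\" 'c' \"plain\" q\"(delimited)\"";
	foreach (t; tokenize(source))
	{
		// any text literal carries at least the TextLiteral bit
		if (t.annotations & TokenAnnotation.TextLiteral)
			writefln("%-18s annotations = 0x%04x", t.value, cast(uint) t.annotations);
	}
}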