// Written in the D programming language

/**
* This module contains a range-based lexer for the D programming language.
*
* Examples:
*
* Generate HTML markup of D code.
* ---
* import std.stdio;
* import std.array;
* import std.file;
* import std.d.lexer;
*
* void writeSpan(string cssClass, string value)
* {
*     stdout.write(`<span class="`, cssClass, `">`,
*         value.replace("&", "&amp;").replace("<", "&lt;"), `</span>`);
* }
*
* void highlight(R)(R tokens)
* {
*     stdout.writeln(q"[<!DOCTYPE html>
* <html>
* <head>
* <meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
* </head>
* <body>
* <pre>]");
*
*     foreach (Token t; tokens)
*     {
*         if (t.type > TokenType.TYPES_BEGIN && t.type < TokenType.TYPES_END)
*             writeSpan("type", t.value);
*         else if (t.type > TokenType.KEYWORDS_BEGIN && t.type < TokenType.KEYWORDS_END)
*             writeSpan("kwrd", t.value);
*         else if (t.type == TokenType.Comment)
*             writeSpan("com", t.value);
*         else if (t.type > TokenType.STRINGS_BEGIN && t.type < TokenType.STRINGS_END)
*             writeSpan("str", t.value);
*         else if (t.type > TokenType.NUMBERS_BEGIN && t.type < TokenType.NUMBERS_END)
*             writeSpan("num", t.value);
*         else if (t.type > TokenType.OPERATORS_BEGIN && t.type < TokenType.OPERATORS_END)
*             writeSpan("op", t.value);
*         else
*             stdout.write(t.value.replace("<", "&lt;"));
*     }
*     stdout.writeln("</pre>\n</body></html>");
* }
*
* void main(string[] args)
* {
*     args[1].readText().byToken(IterationStyle.Everything, StringStyle.Source).highlight();
* }
* ---
*
* Copyright: Brian Schott 2013
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0)
* Authors: Brian Schott
* Source: $(PHOBOSSRC std/d/_lexer.d)
*/
module std.d.lexer;
import std.range;
import std.traits;
import std.algorithm;
import std.conv;
import std.uni;
import std.ascii;
import std.exception;
import std.d.entities;
public:
/**
 * Represents a D token
 */
struct Token
{
    /// The token type.
    TokenType type;

    /// The representation of the token in the original source code.
    string value;

    /// The number of the line the token is on.
    uint lineNumber;

    /// The character index of the start of the token in the original text.
    uint startIndex;

    /**
     * Check to see if the token is of the same type and has the same string
     * representation as the given token. Line number and start index are
     * not part of the comparison.
     */
    bool opEquals(ref const(Token) other) const
    {
        return other.type == type && other.value == value;
    }

    /**
     * Checks to see if the token's string representation is equal to the given
     * string.
     */
    bool opEquals(string value) const { return this.value == value; }

    /**
     * Checks to see if the token is of the given type.
     */
    bool opEquals(TokenType type) const
    {
        // The parameter shadows the field, so the field must be qualified;
        // an unqualified "type == type" compares the argument with itself.
        return this.type == type;
    }

    /**
     * Comparison operator orders tokens by start index.
     * Returns: a negative value if the token starts before i, a positive
     * value if it starts after i, and zero if it starts exactly at i.
     */
    int opCmp(size_t i) const
    {
        if (startIndex < i) return -1;
        if (startIndex > i) return 1;
        return 0;
    }
}
/**
 * Configure the behavior of the byToken() function. The members are bit
 * flags and may be combined with the | operator.
 */
enum IterationStyle
{
    /// Only include code, not whitespace or comments
    CodeOnly = 0,
    /// Includes comments
    IncludeComments = 0b0001,
    /// Includes whitespace
    IncludeWhitespace = 0b0010,
    /// Include $(LINK2 http://dlang.org/lex.html#specialtokens, special tokens)
    IncludeSpecialTokens = 0b0100,
    /// Include everything: comments, whitespace and special token sequences
    Everything = IncludeComments | IncludeWhitespace | IncludeSpecialTokens
}
/**
 * Selects how string literals are represented in the tokens produced by the
 * lexer. The members are bit flags.
 */
enum StringStyle : uint
{
    /**
     * Escape sequences are replaced by the characters they stand for and the
     * enclosing quote characters are left out. Useful for creating a
     * compiler or interpreter.
     */
    Default = 0,

    /**
     * Escape sequences are left unprocessed. An escaped quote character does
     * not terminate the string, but it is not replaced with the quote
     * character in the token either.
     */
    NotEscaped = 1 << 0,

    /**
     * The opening and closing quote characters are kept, together with any
     * prefix or suffix $(LPAREN)e.g.: $(D_STRING "abcde"w) keeps the
     * $(D_STRING 'w') character as well as both quotes$(RPAREN)
     */
    IncludeQuotes = 1 << 4,

    /**
     * The string is reproduced exactly as written in the source, quotes
     * included. Useful for syntax highlighting.
     */
    Source = NotEscaped | IncludeQuotes,
}
/**
 * Lexes the given range of characters and exposes the result as a range of
 * D tokens.
 * Params:
 *     range = the characters to lex
 *     iterationStyle = which kinds of tokens to report, see IterationStyle
 *     stringStyle = how string literals are represented, see StringStyle
 * Returns:
 *     an input range of tokens, already positioned on the first token
 */
TokenRange!(R) byToken(R)(R range, const IterationStyle iterationStyle = IterationStyle.CodeOnly,
    const StringStyle stringStyle = StringStyle.Default) if (isForwardRange!(R) && isSomeChar!(ElementType!(R)))
{
    auto tokens = new TokenRange!(R)(range);
    // Configure the lexer before the first token is produced
    tokens.lineNumber = 1;
    tokens.iterStyle = iterationStyle;
    tokens.stringStyle = stringStyle;
    // Prime the range so that front() is valid immediately
    tokens.popFront();
    return tokens;
}
/**
 * Range of tokens. Avoid creating instances of this manually. Use
 * $(DDOC_PSYMBOL byToken$(LPAREN)$(RPAREN)) instead, as it does some initialization work.
 */
class TokenRange(R) : InputRange!(Token)
{
    this(ref R range)
    {
        this.range = range;
    }

    /**
     * Returns: true if the range is empty
     */
    override bool empty() const @property
    {
        return _empty;
    }

    /**
     * Returns: the current token
     * Throws: an exception (via enforce) if the range is empty
     */
    override Token front() const @property
    {
        enforce(!_empty, "Cannot call front() on empty token range");
        return current;
    }

    /**
     * Returns the current token and then removes it from the range
     */
    override Token moveFront()
    {
        auto r = front();
        popFront();
        return r;
    }

    /**
     * foreach support: calls dg with every remaining token until the range
     * is exhausted or dg returns a non-zero value.
     */
    override int opApply(int delegate(Token) dg)
    {
        int result = 0;
        while (!empty)
        {
            result = dg(front);
            if (result)
                break;
            popFront();
        }
        return result;
    }

    /**
     * foreach support with a running counter: calls dg with the zero-based
     * position of the token within this iteration and the token itself.
     */
    override int opApply(int delegate(size_t, Token) dg)
    {
        int result = 0;
        size_t i = 0;
        while (!empty)
        {
            result = dg(i, front);
            if (result)
                break;
            popFront();
            // The counter has to follow the range, otherwise every token
            // is reported with position 0.
            ++i;
        }
        return result;
    }

    /**
     * Advances to the next token that is selected by the iteration style,
     * skipping comments, whitespace and special token sequences that were
     * not requested.
     */
    override void popFront()
    {
        // Filter out tokens we don't care about
        loop: do
        {
            advance();
            switch (current.type)
            {
            case TokenType.Comment:
                if (iterStyle & IterationStyle.IncludeComments)
                    break loop;
                break;
            case TokenType.Whitespace:
                if (iterStyle & IterationStyle.IncludeWhitespace)
                    break loop;
                break;
            case TokenType.SpecialTokenSequence:
                if (iterStyle & IterationStyle.IncludeSpecialTokens)
                    break loop;
                break;
            default:
                break loop;
            }
        }
        while (!empty());
    }

private:

    /*
     * Advances the range to the next token
     */
    void advance()
    {
        // isEoF also recognizes the \0 and 0x1A end-of-file markers. Testing
        // only range.empty would leave the lexer stuck on such a marker,
        // producing empty tokens forever.
        if (range.isEoF())
        {
            _empty = true;
            return;
        }

        current = Token.init;
        current.lineNumber = lineNumber;
        current.startIndex = index;

        if (std.uni.isWhite(range.front))
        {
            current = lexWhitespace(range, index, lineNumber);
            return;
        }
        outer: switch (range.front)
        {
        mixin(generateCaseTrie(
            "=", "TokenType.Assign",
            "&", "TokenType.BitAnd",
            "&=", "TokenType.BitAndEquals",
            "|", "TokenType.BitOr",
            "|=", "TokenType.BitOrEquals",
            "~=", "TokenType.CatEquals",
            ":", "TokenType.Colon",
            ",", "TokenType.Comma",
            "$", "TokenType.Dollar",
            ".", "TokenType.Dot",
            "==", "TokenType.Equals",
            "=>", "TokenType.GoesTo",
            ">", "TokenType.Greater",
            ">=", "TokenType.GreaterEqual",
            "&&", "TokenType.LogicAnd",
            "{", "TokenType.LBrace",
            "[", "TokenType.LBracket",
            "<", "TokenType.Less",
            "<=", "TokenType.LessEqual",
            "<>=", "TokenType.LessEqualGreater",
            "<>", "TokenType.LessOrGreater",
            "||", "TokenType.LogicOr",
            "(", "TokenType.LParen",
            "-", "TokenType.Minus",
            "-=", "TokenType.MinusEquals",
            "%", "TokenType.Mod",
            "%=", "TokenType.ModEquals",
            "*=", "TokenType.MulEquals",
            "!", "TokenType.Not",
            "!=", "TokenType.NotEquals",
            "!>", "TokenType.NotGreater",
            "!>=", "TokenType.NotGreaterEqual",
            "!<", "TokenType.NotLess",
            "!<=", "TokenType.NotLessEqual",
            "!<>", "TokenType.NotLessEqualGreater",
            "+", "TokenType.Plus",
            "+=", "TokenType.PlusEquals",
            "^^", "TokenType.Pow",
            "^^=", "TokenType.PowEquals",
            "}", "TokenType.RBrace",
            "]", "TokenType.RBracket",
            ")", "TokenType.RParen",
            ";", "TokenType.Semicolon",
            "<<", "TokenType.ShiftLeft",
            "<<=", "TokenType.ShiftLeftEqual",
            ">>", "TokenType.ShiftRight",
            ">>=", "TokenType.ShiftRightEqual",
            "..", "TokenType.Slice",
            "*", "TokenType.Star",
            "?", "TokenType.Ternary",
            "~", "TokenType.Tilde",
            "--", "TokenType.Decrement",
            "!<>=", "TokenType.Unordered",
            ">>>", "TokenType.UnsignedShiftRight",
            ">>>=", "TokenType.UnsignedShiftRightEqual",
            "++", "TokenType.Increment",
            "...", "TokenType.Vararg",
            "^", "TokenType.Xor",
            "^=", "TokenType.XorEquals",
            "@", "TokenType.At",
        ));
        case '0': .. case '9':
            current = lexNumber(range, index, lineNumber);
            break;
        case '\'':
        case '"':
            current = lexString(range, index, lineNumber, stringStyle);
            break;
        case '`':
            current = lexString(range, index, lineNumber, stringStyle);
            break;
        case 'q':
            // 'q' only starts a string when followed by '{' or '"';
            // otherwise it is the start of an ordinary identifier.
            auto r = range.save;
            r.popFront();
            if (!r.isEoF() && r.front == '{')
            {
                current = lexTokenString(range, index, lineNumber, stringStyle);
                break;
            }
            else if (!r.isEoF() && r.front == '"')
            {
                current = lexDelimitedString(range, index, lineNumber,
                    stringStyle);
                break;
            }
            else
                goto default;
        case '/':
            // Look one character ahead to tell comments, "/=" and "/" apart
            auto r = range.save();
            r.popFront();
            if (r.isEoF())
            {
                current.type = TokenType.Div;
                current.value = "/";
                range.popFront();
                ++index;
                break;
            }
            switch (r.front)
            {
            case '/':
            case '*':
            case '+':
                current = lexComment(range, index, lineNumber);
                break outer;
            case '=':
                current.type = TokenType.DivEquals;
                current.value = "/=";
                range.popFront();
                range.popFront();
                index += 2;
                break outer;
            default:
                current.type = TokenType.Div;
                current.value = "/";
                ++index;
                range.popFront();
                break outer;
            }
        case 'r':
            // r"..." is a string literal, anything else is an identifier
            auto r = range.save();
            r.popFront();
            if (!r.isEoF() && r.front == '"')
            {
                current = lexString(range, index, lineNumber, stringStyle);
                break;
            }
            else
                goto default;
        case 'x':
            // x"..." is a hex string literal, anything else is an identifier
            auto r = range.save();
            r.popFront();
            if (!r.isEoF() && r.front == '"')
            {
                // NOTE(review): stringStyle is not forwarded here, so hex
                // strings are always lexed with StringStyle.Default.
                current = lexHexString(range, index, lineNumber);
                break;
            }
            else
                goto default;
        case '#':
            string special = lexSpecialTokenSequence(range, index, lineNumber);
            if (special)
            {
                current.type = TokenType.SpecialTokenSequence;
                current.value = special;
            }
            else
            {
                // Not a special token sequence: report a lone '#'
                current.type = TokenType.Hash;
                current.value = "#";
                range.popFront();
                ++index;
            }
            break;
        default:
            auto app = appender!(ElementType!(R)[])();
            while(!range.isEoF() && !isSeparating(range.front))
            {
                app.put(range.front);
                range.popFront();
                ++index;
            }
            if (app.data.length == 0)
            {
                // The first character is a separator that no other case
                // handles. Consume it anyway so that the lexer always makes
                // progress instead of returning empty tokens forever.
                app.put(range.front);
                range.popFront();
                ++index;
            }
            current.value = to!string(app.data);
            current.type = lookupTokenType(current.value);
            break;
        }
    }

    Token current;
    uint lineNumber;
    uint index;
    R range;
    bool _empty;
    IterationStyle iterStyle;
    StringStyle stringStyle;
}
unittest
{
    import std.stdio;

    // Smoke test: lex a snippet containing a comment, '#' sequences and a
    // declaration with the default styles, and print every token.
    auto source = "/**comment*/\n#lin #line 10 \"test.d\"\nint a;//test\n";
    foreach (token; byToken(source))
    {
        writeln(token);
    }
}
/**
* Listing of all the tokens in the D language.
*
* Token types are arranged so that it is easy to group tokens while iterating
* over them. For example:
* ---
* assert(TokenType.Increment < TokenType.OPERATORS_END);
* assert(TokenType.Increment > TokenType.OPERATORS_BEGIN);
* ---
* The values ending in _BEGIN and _END are not tokens themselves; they only
* delimit groups of token types. The groups are documented below:
*
* $(BOOKTABLE ,
* $(TR $(TH Begin) $(TH End) $(TH Content) $(TH Examples))
* $(TR $(TD OPERATORS_BEGIN) $(TD OPERATORS_END) $(TD operators) $(TD +, -, <<=))
* $(TR $(TD TYPES_BEGIN) $(TD TYPES_END) $(TD types) $(TD bool, char, double))
* $(TR $(TD KEYWORDS_BEGIN) $(TD KEYWORDS_END) $(TD keywords) $(TD class, if, assert))
* $(TR $(TD ATTRIBUTES_BEGIN) $(TD ATTRIBUTES_END) $(TD attributes) $(TD align, synchronized, __gshared))
* $(TR $(TD PROTECTION_BEGIN) $(TD PROTECTION_END) $(TD protection) $(TD public, protected))
* $(TR $(TD CONSTANTS_BEGIN) $(TD CONSTANTS_END) $(TD special keywords and constants) $(TD __FILE__, __LINE__))
* $(TR $(TD LITERALS_BEGIN) $(TD LITERALS_END) $(TD string and numeric literals) $(TD "str", 123))
* $(TR $(TD NUMBERS_BEGIN) $(TD NUMBERS_END) $(TD numeric literals) $(TD 0x123p+9, 0b0110))
* $(TR $(TD STRINGS_BEGIN) $(TD STRINGS_END) $(TD string literals) $(TD `123`c, q{tokens;}, "abcde"))
* $(TR $(TD MISC_BEGIN) $(TD MISC_END) $(TD anything else) $(TD whitespace, comments, identifiers))
* )
* Note that several of the above ranges overlap: the type, attribute and
* protection groups lie inside the keyword group, and the number and string
* groups lie inside the literal group.
*/
enum TokenType: uint
{
// Operators
OPERATORS_BEGIN, /// Marks the start of the operator group
Assign, /// $(D_KEYWORD =)
At, /// $(D_KEYWORD @)
BitAnd, /// $(D_KEYWORD &)
BitAndEquals, /// $(D_KEYWORD &=)
BitOr, /// $(D_KEYWORD |)
BitOrEquals, /// $(D_KEYWORD |=)
CatEquals, /// $(D_KEYWORD ~=)
Colon, /// $(D_KEYWORD :)
Comma, /// $(D_KEYWORD ,)
Decrement, /// $(D_KEYWORD --)
Div, /// $(D_KEYWORD /)
DivEquals, /// $(D_KEYWORD /=)
Dollar, /// $(D_KEYWORD $)
Dot, /// $(D_KEYWORD .)
Equals, /// $(D_KEYWORD ==)
GoesTo, /// $(D_KEYWORD =>)
Greater, /// $(D_KEYWORD >)
GreaterEqual, /// $(D_KEYWORD >=)
Hash, /// $(D_KEYWORD #)
Increment, /// $(D_KEYWORD ++)
LBrace, /// $(D_KEYWORD {)
LBracket, /// $(D_KEYWORD [)
Less, /// $(D_KEYWORD <)
LessEqual, /// $(D_KEYWORD <=)
LessEqualGreater, /// $(D_KEYWORD <>=)
LessOrGreater, /// $(D_KEYWORD <>)
LogicAnd, /// $(D_KEYWORD &&)
LogicOr, /// $(D_KEYWORD ||)
LParen, /// $(D_KEYWORD $(LPAREN))
Minus, /// $(D_KEYWORD -)
MinusEquals, /// $(D_KEYWORD -=)
Mod, /// $(D_KEYWORD %)
ModEquals, /// $(D_KEYWORD %=)
MulEquals, /// $(D_KEYWORD *=)
Not, /// $(D_KEYWORD !)
NotEquals, /// $(D_KEYWORD !=)
NotGreater, /// $(D_KEYWORD !>)
NotGreaterEqual, /// $(D_KEYWORD !>=)
NotLess, /// $(D_KEYWORD !<)
NotLessEqual, /// $(D_KEYWORD !<=)
NotLessEqualGreater, /// $(D_KEYWORD !<>)
Plus, /// $(D_KEYWORD +)
PlusEquals, /// $(D_KEYWORD +=)
Pow, /// $(D_KEYWORD ^^)
PowEquals, /// $(D_KEYWORD ^^=)
RBrace, /// $(D_KEYWORD })
RBracket, /// $(D_KEYWORD ])
RParen, /// $(D_KEYWORD $(RPAREN))
Semicolon, /// $(D_KEYWORD ;)
ShiftLeft, /// $(D_KEYWORD <<)
ShiftLeftEqual, /// $(D_KEYWORD <<=)
ShiftRight, /// $(D_KEYWORD >>)
ShiftRightEqual, /// $(D_KEYWORD >>=)
Slice, /// $(D_KEYWORD ..)
Star, /// $(D_KEYWORD *)
Ternary, /// $(D_KEYWORD ?)
Tilde, /// $(D_KEYWORD ~)
Unordered, /// $(D_KEYWORD !<>=)
UnsignedShiftRight, /// $(D_KEYWORD >>>)
UnsignedShiftRightEqual, /// $(D_KEYWORD >>>=)
Vararg, /// $(D_KEYWORD ...)
Xor, /// $(D_KEYWORD ^)
XorEquals, /// $(D_KEYWORD ^=)
OPERATORS_END, /// Marks the end of the operator group
// Keywords
KEYWORDS_BEGIN, /// Marks the start of the keyword group
TYPES_BEGIN, /// Marks the start of the type group
Bool, /// $(D_KEYWORD bool)
Byte, /// $(D_KEYWORD byte)
Cdouble, /// $(D_KEYWORD cdouble)
Cent, /// $(D_KEYWORD cent)
Cfloat, /// $(D_KEYWORD cfloat)
Char, /// $(D_KEYWORD char)
Creal, /// $(D_KEYWORD creal)
Dchar, /// $(D_KEYWORD dchar)
Double, /// $(D_KEYWORD double)
DString, /// $(D_KEYWORD dstring)
Float, /// $(D_KEYWORD float)
Function, /// $(D_KEYWORD function)
Idouble, /// $(D_KEYWORD idouble)
Ifloat, /// $(D_KEYWORD ifloat)
Int, /// $(D_KEYWORD int)
Ireal, /// $(D_KEYWORD ireal)
Long, /// $(D_KEYWORD long)
Real, /// $(D_KEYWORD real)
Short, /// $(D_KEYWORD short)
String, /// $(D_KEYWORD string)
Ubyte, /// $(D_KEYWORD ubyte)
Ucent, /// $(D_KEYWORD ucent)
Uint, /// $(D_KEYWORD uint)
Ulong, /// $(D_KEYWORD ulong)
Ushort, /// $(D_KEYWORD ushort)
Void, /// $(D_KEYWORD void)
Wchar, /// $(D_KEYWORD wchar)
WString, /// $(D_KEYWORD wstring)
TYPES_END, /// Marks the end of the type group
ATTRIBUTES_BEGIN, /// Marks the start of the attribute group
Align, /// $(D_KEYWORD align)
Deprecated, /// $(D_KEYWORD deprecated)
Extern, /// $(D_KEYWORD extern)
Pragma, /// $(D_KEYWORD pragma)
PROTECTION_BEGIN, /// Marks the start of the protection attribute group
Export, /// $(D_KEYWORD export)
Package, /// $(D_KEYWORD package)
Private, /// $(D_KEYWORD private)
Protected, /// $(D_KEYWORD protected)
Public, /// $(D_KEYWORD public)
PROTECTION_END, /// Marks the end of the protection attribute group
Abstract, /// $(D_KEYWORD abstract)
Auto, /// $(D_KEYWORD auto)
Const, /// $(D_KEYWORD const)
Final, /// $(D_KEYWORD final)
Gshared, /// $(D_KEYWORD __gshared)
Immutable, /// $(D_KEYWORD immutable)
Inout, /// $(D_KEYWORD inout)
Scope, /// $(D_KEYWORD scope)
Shared, /// $(D_KEYWORD shared)
Static, /// $(D_KEYWORD static)
Synchronized, /// $(D_KEYWORD synchronized)
ATTRIBUTES_END, /// Marks the end of the attribute group
Alias, /// $(D_KEYWORD alias)
Asm, /// $(D_KEYWORD asm)
Assert, /// $(D_KEYWORD assert)
Body, /// $(D_KEYWORD body)
Break, /// $(D_KEYWORD break)
Case, /// $(D_KEYWORD case)
Cast, /// $(D_KEYWORD cast)
Catch, /// $(D_KEYWORD catch)
Class, /// $(D_KEYWORD class)
Continue, /// $(D_KEYWORD continue)
Debug, /// $(D_KEYWORD debug)
Default, /// $(D_KEYWORD default)
Delegate, /// $(D_KEYWORD delegate)
Delete, /// $(D_KEYWORD delete)
Do, /// $(D_KEYWORD do)
Else, /// $(D_KEYWORD else)
Enum, /// $(D_KEYWORD enum)
False, /// $(D_KEYWORD false)
Finally, /// $(D_KEYWORD finally)
Foreach, /// $(D_KEYWORD foreach)
Foreach_reverse, /// $(D_KEYWORD foreach_reverse)
For, /// $(D_KEYWORD for)
Goto, /// $(D_KEYWORD goto)
If, /// $(D_KEYWORD if)
Import, /// $(D_KEYWORD import)
In, /// $(D_KEYWORD in)
Interface, /// $(D_KEYWORD interface)
Invariant, /// $(D_KEYWORD invariant)
Is, /// $(D_KEYWORD is)
Lazy, /// $(D_KEYWORD lazy)
Macro, /// $(D_KEYWORD macro)
Mixin, /// $(D_KEYWORD mixin)
Module, /// $(D_KEYWORD module)
New, /// $(D_KEYWORD new)
Nothrow, /// $(D_KEYWORD nothrow)
Null, /// $(D_KEYWORD null)
Out, /// $(D_KEYWORD out)
Override, /// $(D_KEYWORD override)
Pure, /// $(D_KEYWORD pure)
Ref, /// $(D_KEYWORD ref)
Return, /// $(D_KEYWORD return)
Struct, /// $(D_KEYWORD struct)
Super, /// $(D_KEYWORD super)
Switch, /// $(D_KEYWORD switch)
Template, /// $(D_KEYWORD template)
This, /// $(D_KEYWORD this)
Throw, /// $(D_KEYWORD throw)
True, /// $(D_KEYWORD true)
Try, /// $(D_KEYWORD try)
Typedef, /// $(D_KEYWORD typedef)
Typeid, /// $(D_KEYWORD typeid)
Typeof, /// $(D_KEYWORD typeof)
Union, /// $(D_KEYWORD union)
Unittest, /// $(D_KEYWORD unittest)
Version, /// $(D_KEYWORD version)
Volatile, /// $(D_KEYWORD volatile)
While, /// $(D_KEYWORD while)
With, /// $(D_KEYWORD with)
KEYWORDS_END, /// Marks the end of the keyword group
// Constants
CONSTANTS_BEGIN, /// Marks the start of the constants group
File, /// $(D_KEYWORD __FILE__)
Line, /// $(D_KEYWORD __LINE__)
Thread, /// $(D_KEYWORD __thread)
Traits, /// $(D_KEYWORD __traits)
CONSTANTS_END, /// Marks the end of the constants group
// Misc
MISC_BEGIN, /// Marks the start of the miscellaneous group
Comment, /// $(D_COMMENT /** comment */) or $(D_COMMENT // comment) or $(D_COMMENT ///comment)
Identifier, /// anything else
ScriptLine, /// Line at the beginning of the source file that starts with #!
Whitespace, /// whitespace
SpecialTokenSequence, /// #line 10 "file.d"
MISC_END, /// Marks the end of the miscellaneous group
// Literals
LITERALS_BEGIN, /// Marks the start of the literal group
NUMBERS_BEGIN, /// Marks the start of the numeric literal group
DoubleLiteral, /// 123.456
FloatLiteral, /// 123.456f or 0x123_45p-3
IDoubleLiteral, /// 123.456i
IFloatLiteral, /// 123.456fi
IntLiteral, /// 123 or 0b1101010101
LongLiteral, /// 123L
RealLiteral, /// 123.456L
IRealLiteral, /// 123.456Li
UnsignedIntLiteral, /// 123u
UnsignedLongLiteral, /// 123uL
NUMBERS_END, /// Marks the end of the numeric literal group
STRINGS_BEGIN, /// Marks the start of the string literal group
DStringLiteral, /// $(D_STRING "32-bit character string"d)
StringLiteral, /// $(D_STRING "an 8-bit string")
WStringLiteral, /// $(D_STRING "16-bit character string"w)
STRINGS_END, /// Marks the end of the string literal group
LITERALS_END, /// Marks the end of the literal group
}
// Implementation details follow
private:
/*
 * Returns: true if the first character of the range is a line feed or a
 * carriage return. The range must not be empty.
 */
private pure bool isNewline(R)(R range)
{
    if (range.front == '\n')
        return true;
    return range.front == '\r';
}
/*
 * Returns: true if the range is exhausted or if its first character is one
 * of the end-of-file markers 0 and 0x1a.
 */
pure bool isEoF(R)(R range)
{
    if (range.empty)
        return true;
    if (range.front == 0)
        return true;
    return range.front == 0x1a;
}
/*
 * Removes one line terminator ("\r", "\n" or "\r\n") from the front of the
 * range.
 * Params:
 *     range = the input; advanced past the terminator
 *     index = incremented once for every character removed
 * Returns: the removed characters; empty if the range does not start with a
 * line terminator
 */
C[] popNewline(R, C = ElementType!R)(ref R range, ref uint index)
    if (isSomeChar!C && isForwardRange!R)
{
    C[] chars;
    if (!range.empty && range.front == '\r')
    {
        chars ~= range.front;
        range.popFront();
        ++index;
    }
    // The range may have ended with the carriage return, so emptiness has to
    // be checked again before looking at the next character.
    if (!range.empty && range.front == '\n')
    {
        chars ~= range.front;
        range.popFront();
        ++index;
    }
    return chars;
}

unittest
{
    uint i;
    auto s = "\r\ntest";
    assert (popNewline(s, i) == "\r\n");
    assert (s == "test");
    assert (i == 2);
}

unittest
{
    // A carriage return at the very end of the input must not read past it
    uint i;
    auto s = "\r";
    assert (popNewline(s, i) == "\r");
    assert (s == "");
    assert (i == 1);
}
/*
 * Lexes a run of whitespace.
 * Params:
 *     range = the input; advanced past the whitespace
 *     index = incremented for every character consumed
 *     lineNumber = incremented for every line terminator consumed
 * Returns: a Whitespace token holding the consumed characters
 */
Token lexWhitespace(R, C = ElementType!R)(ref R range, ref uint index,
    ref uint lineNumber) if (isForwardRange!R && isSomeChar!C)
{
    Token result;
    result.startIndex = index;
    result.lineNumber = lineNumber;
    result.type = TokenType.Whitespace;

    auto buffer = appender!(C[])();
    for (;;)
    {
        if (isEoF(range) || !std.uni.isWhite(range.front))
            break;
        if (!isNewline(range))
        {
            buffer.put(range.front);
            range.popFront();
            ++index;
            continue;
        }
        // Line terminators are consumed as a unit so "\r\n" counts once
        ++lineNumber;
        buffer.put(popNewline(range, index));
    }
    result.value = to!string(buffer.data);
    return result;
}

unittest
{
    import std.stdio;
    uint line = 1;
    uint position;
    auto text = " \n \r\n \tabcde";
    auto token = lexWhitespace(text, position, line);
    assert (token.value == " \n \r\n \t");
    assert (text == "abcde");
    assert (line == 3);
}
/*
 * Lexes a line comment, a block comment or a nesting block comment.
 * Params:
 *     input = the input, positioned on the leading '/'; advanced past the
 *         comment
 *     index = incremented for every character consumed
 *     lineNumber = incremented for every line terminator inside the comment
 * Returns: a Comment token holding the complete comment text. If the '/' is
 * not followed by '/', '*' or '+', a default-initialized token is returned
 * and only the '/' has been consumed. An unterminated comment extends to the
 * end of the input.
 */
Token lexComment(R, C = ElementType!R)(ref R input, ref uint index, ref uint lineNumber)
    if (isSomeChar!C && isForwardRange!R)
in
{
    assert (input.front == '/');
}
body
{
    Token t;
    t.lineNumber = lineNumber;
    t.type = TokenType.Comment;
    t.startIndex = index;
    auto app = appender!(C[])();

    // Moves the current character into the token text, keeping index in step
    void consume()
    {
        app.put(input.front);
        input.popFront();
        ++index;
    }

    consume(); // the leading '/'
    if (isEoF(input))
    {
        Token errorToken;
        return errorToken;
    }
    switch(input.front)
    {
    case '/':
        while (!isEoF(input) && !isNewline(input))
            consume();
        break;
    case '*':
        // Take the opening '*' first so that it cannot be mistaken for the
        // start of the closing "*/" (as in "/*/").
        consume();
        while (!isEoF(input))
        {
            if (isNewline(input))
            {
                app.put(popNewline(input, index));
                ++lineNumber;
            }
            else if (input.front == '*')
            {
                consume();
                if (!isEoF(input) && input.front == '/')
                {
                    consume();
                    break;
                }
            }
            else
                consume();
        }
        break;
    case '+':
        // Same reasoning as above for "/+/"
        consume();
        int depth = 1;
        while (depth > 0 && !isEoF(input))
        {
            if (isNewline(input))
            {
                app.put(popNewline(input, index));
                lineNumber++;
            }
            else if (input.front == '+')
            {
                consume();
                if (!isEoF(input) && input.front == '/')
                {
                    consume();
                    --depth;
                }
            }
            else if (input.front == '/')
            {
                consume();
                if (!isEoF(input) && input.front == '+')
                {
                    consume();
                    ++depth;
                }
            }
            else
                consume();
        }
        break;
    default:
        Token errorToken;
        return errorToken;
    }
    t.value = to!string(app.data);
    return t;
}

unittest
{
    uint index;
    uint lineNumber = 1;
    auto chars = "//this is a comment\r\nthis is not";
    auto comment = lexComment(chars, index, lineNumber);
    assert (chars == "\r\nthis is not");
    assert (comment.value == "//this is a comment");
    assert (index == 19);
}

unittest
{
    uint index = 0;
    uint lineNumber = 1;
    auto chars = "/* this is a\n\tcomment\r\n */this is not";
    auto comment = lexComment(chars, index, lineNumber);
    assert (chars == "this is not");
    assert (comment.value == "/* this is a\n\tcomment\r\n */");
    assert (lineNumber == 3);
}

unittest
{
    uint index;
    uint lineNumber = 1;
    auto chars = "/+this is a /+c/+omm+/ent+/ \r\nthis+/ is not";
    auto comment = lexComment(chars, index, lineNumber);
    assert (chars == " is not");
    assert (comment.value == "/+this is a /+c/+omm+/ent+/ \r\nthis+/");
    assert (lineNumber == 2);
}

unittest
{
    uint i;
    uint l;
    auto chars = "/(";
    auto comment = lexComment(chars, i, l);
    assert (comment == "");
}

unittest
{
    // The opening marker must not double as the closing one
    uint index;
    uint lineNumber = 1;
    auto chars = "/*/ still comment */x";
    auto comment = lexComment(chars, index, lineNumber);
    assert (comment.value == "/*/ still comment */");
    assert (chars == "x");
    assert (index == 20);
}

unittest
{
    // An unterminated comment runs to the end of the input without error
    uint index;
    uint lineNumber = 1;
    auto chars = "/* abc *";
    auto comment = lexComment(chars, index, lineNumber);
    assert (comment.value == "/* abc *");
    assert (chars == "");
}
/*
 * Removes leading characters accepted by isInterestingDigit from the input.
 * Params:
 *     input = the input; advanced past the accepted characters
 *     index = incremented for every character removed
 *     upTo = the maximum number of characters to remove
 * Returns: the removed characters; empty if the input is exhausted or does
 * not start with an accepted character
 */
string popDigitChars(R, C = ElementType!R, alias isInterestingDigit)(ref R input, ref uint index,
    uint upTo) if (isSomeChar!C && isForwardRange!R)
{
    auto app = appender!(C[])();
    // Stop at the end of the input as well as at the character limit
    for (uint i = 0; i != upTo && !input.empty; ++i)
    {
        if (!isInterestingDigit(input.front))
            break;
        app.put(input.front);
        input.popFront();
        ++index;
    }
    return to!string(app.data);
}
/*
 * Removes at most upTo leading hexadecimal digits from the input and returns
 * them.
 */
string popHexChars(R)(ref R input, ref uint index, uint upTo)
{
    alias ElementType!R Elem;
    return popDigitChars!(R, Elem, isHexDigit)(input, index, upTo);
}

/*
 * Removes at most upTo leading octal digits from the input and returns them.
 */
string popOctalChars(R)(ref R input, ref uint index, uint upTo)
{
    alias ElementType!R Elem;
    return popDigitChars!(R, Elem, isOctalDigit)(input, index, upTo);
}

unittest
{
    uint position;

    // No limit: stops at the first character that is not a hex digit
    auto hexText = "124ac82d3fqwerty";
    auto hexDigits = popHexChars(hexText, position, uint.max);
    assert (hexText == "qwerty");
    assert (hexDigits == "124ac82d3f");

    // Limited to four digits
    auto limitedText = "08a7c2e3";
    auto limitedDigits = popHexChars(limitedText, position, 4);
    assert (limitedDigits.length == 4);
    assert (limitedDigits == "08a7");
    assert (limitedText == "c2e3");

    // Octal: '8' is the first character that is not accepted
    auto octalText = "00123832";
    auto octalDigits = popOctalChars(octalText, position, uint.max);
    assert (octalText == "832");
    assert (octalDigits == "00123");
}
/*
 * Decodes one escape sequence.
 * Params:
 *     input = the input, positioned on the backslash; advanced past the
 *         escape sequence
 *     index = incremented for the characters consumed by this function
 * Returns: the text the escape sequence stands for. An unrecognized or
 * incomplete escape yields a single backslash; an unknown or unterminated
 * named character entity yields the empty string.
 */
string interpretEscapeSequence(R, C = ElementType!R)(ref R input, ref uint index)
    if (isSomeChar!C && isForwardRange!R)
in
{
    assert(input.front == '\\');
}
body
{
    // Converts the digits of a numeric escape into the character they denote.
    // An escape without any digits is treated like an unknown escape.
    static string decodeDigits(string digits, uint radix)
    {
        if (digits.length == 0)
            return "\\";
        return to!string(cast(dchar) parse!uint(digits, radix));
    }

    input.popFront(); // the backslash
    ++index;
    // A backslash may be the very last character of the input
    if (input.empty)
        return "\\";
    switch (input.front)
    {
    case '\'':
    case '\"':
    case '?':
    case '\\':
    case 0:
    case 0x1a:
        auto f = input.front;
        input.popFront();
        ++index;
        return to!string(f);
    case 'a': input.popFront(); ++index; return "\a";
    case 'b': input.popFront(); ++index; return "\b";
    case 'f': input.popFront(); ++index; return "\f";
    case 'n': input.popFront(); ++index; return "\n";
    case 'r': input.popFront(); ++index; return "\r";
    case 't': input.popFront(); ++index; return "\t";
    case 'v': input.popFront(); ++index; return "\v";
    case 'x':
        input.popFront();
        ++index;
        return decodeDigits(popHexChars(input, index, 2), 16);
    case '0': .. case '7':
        return decodeDigits(popOctalChars(input, index, 3), 8);
    case 'u':
        input.popFront();
        ++index;
        return decodeDigits(popHexChars(input, index, 4), 16);
    case 'U':
        input.popFront();
        ++index;
        return decodeDigits(popHexChars(input, index, 8), 16);
    case '&':
        input.popFront();
        ++index;
        auto entity = appender!(char[])();
        while (!input.isEoF() && input.front != ';')
        {
            entity.put(input.front);
            input.popFront();
            ++index;
        }
        if (!isEoF(input))
        {
            auto decoded = to!string(entity.data) in characterEntities;
            input.popFront(); // the terminating ';'
            ++index;
            if (decoded !is null)
                return to!string(*decoded);
        }
        return "";
    default:
        input.popFront();
        ++index;
        // This is an error
        return "\\";
    }
}

unittest
{
    uint i;
    auto vals = [
        "\\&amp;": "&",
        "\\n": "\n",
        "\\?": "?",
        "\\u0033": "\u0033",
        "\\U00000076": "v",
        "\\075": "=",
        "\\'": "'",
        "\\a": "\a",
        "\\b": "\b",
        "\\f": "\f",
        "\\r": "\r",
        "\\t": "\t",
        "\\v": "\v",
        "\\y": "\\",
        "\\x20": " ",
        "\\&eeeeeeror;": "",
    ];
    foreach (k, v; vals)
        assert (interpretEscapeSequence(k, i) == v);
}

unittest
{
    // Incomplete escapes at the end of the input must not read past it
    uint i;
    auto lone = "\\";
    assert (interpretEscapeSequence(lone, i) == "\\");
    assert (i == 1);
    auto noDigits = "\\x";
    assert (interpretEscapeSequence(noDigits, i) == "\\");
    assert (noDigits == "");
}
/*
 * Lexes a hex string literal such as x"20 41".
 * Params:
 *     input = the input, positioned on the 'x' of the opening x"; advanced
 *         past the literal and its optional c, w or d suffix
 *     index = incremented for every character consumed
 *     lineNumber = incremented for every line terminator inside the literal
 *     style = with NotEscaped the text between the quotes is returned as
 *         written, otherwise each pair of hex digits is decoded into one
 *         character and everything else between the quotes is ignored; with
 *         IncludeQuotes the x" prefix, the closing quote and the suffix are
 *         part of the token value
 * Returns: a StringLiteral, WStringLiteral or DStringLiteral token
 */
Token lexHexString(R, C = ElementType!R)(ref R input, ref uint index, ref uint lineNumber,
    const StringStyle style = StringStyle.Default)
in
{
    assert (input.front == 'x');
}
body
{
    Token t;
    t.lineNumber = lineNumber;
    t.startIndex = index;
    t.type = TokenType.StringLiteral;
    immutable bool raw = (style & StringStyle.NotEscaped) != 0;
    immutable bool quoted = (style & StringStyle.IncludeQuotes) != 0;

    // Raw mode: everything between the quotes. Otherwise: hex digits only,
    // so that quotes, whitespace and newlines never reach the decoder.
    auto content = appender!(dchar[])();
    input.popFront(); // x
    input.popFront(); // "
    index += 2;
    bool closed = false;
    while (!input.isEoF())
    {
        if (isNewline(input))
        {
            auto newline = popNewline(input, index);
            ++lineNumber;
            if (raw)
                content.put(newline);
        }
        else if (input.front == '"')
        {
            input.popFront();
            ++index;
            closed = true;
            break;
        }
        else
        {
            // Whitespace and stray characters are always consumed, so the
            // loop cannot stall; they are only kept in raw mode.
            if (raw || isHexDigit(input.front))
                content.put(input.front);
            input.popFront();
            ++index;
        }
    }

    dchar suffix;
    bool hasSuffix = false;
    if (!input.isEoF())
    {
        switch (input.front)
        {
        case 'w':
            t.type = TokenType.WStringLiteral;
            goto case 'c';
        case 'd':
            t.type = TokenType.DStringLiteral;
            goto case 'c';
        case 'c':
            suffix = input.front;
            hasSuffix = true;
            input.popFront();
            ++index;
            break;
        default:
            break;
        }
    }

    auto result = appender!(char[])();
    if (quoted)
        result.put("x\"");
    if (raw)
        result.put(content.data);
    else
    {
        foreach (pair; std.range.chunks(content.data, 2))
            result.put(to!string(cast(dchar) parse!uint(pair, 16)));
    }
    if (quoted && closed)
        result.put('"');
    if (quoted && hasSuffix)
        result.put(suffix);
    t.value = to!string(result.data);
    return t;
}

unittest
{
    uint i;
    uint l;
    auto a = `x"204041"`;
    auto ar = lexHexString(a, i, l);
    assert (ar == " @A");
    assert (ar == TokenType.StringLiteral);
    auto b = `x"20"w`;
    auto br = lexHexString(b, i, l);
    assert (br == " ");
    assert (br == TokenType.WStringLiteral);
    auto c = `x"6d"`;
    auto cr = lexHexString(c, i, l, StringStyle.NotEscaped);
    assert (cr == "6d");
    auto d = `x"5e5f"d`;
    auto dr = lexHexString(d, i, l, StringStyle.NotEscaped | StringStyle.IncludeQuotes);
    assert (dr == `x"5e5f"d`);
    assert (dr == TokenType.DStringLiteral);
    // Whitespace between digits is legal and must not stall the lexer
    auto e = `x"20 41"`;
    auto er = lexHexString(e, i, l);
    assert (er.value == " A");
    assert (e == "");
    // Quotes may be requested together with decoding
    auto f = `x"41"c`;
    auto fr = lexHexString(f, i, l, StringStyle.IncludeQuotes);
    assert (fr.value == `x"A"c`);
}
/*
 * Lexes a character literal, a double quoted string, or a WYSIWYG string
 * (r"..." or `...`).
 * Params:
 *     input = the input, positioned on the opening quote or on the 'r'
 *         prefix; advanced past the literal and its optional c, w or d suffix
 *     index = incremented for every character consumed
 *     lineNumber = incremented for every line terminator inside the literal
 *     style = see StringStyle
 * Returns: a StringLiteral, WStringLiteral or DStringLiteral token. An
 * unterminated literal extends to the end of the input.
 */
Token lexString(R)(ref R input, ref uint index, ref uint lineNumber,
    const StringStyle style = StringStyle.Default)
in
{
    assert (input.front == '\'' || input.front == '"' || input.front == '`' || input.front == 'r');
}
body
{
    Token t;
    t.lineNumber = lineNumber;
    t.startIndex = index;
    t.type = TokenType.StringLiteral;
    auto app = appender!(char[])();
    bool isWysiwyg = input.front == 'r' || input.front == '`';
    if (input.front == 'r')
    {
        if (style & StringStyle.IncludeQuotes)
            app.put('r');
        input.popFront();
        ++index; // the prefix is part of the source text as well
    }
    auto quote = input.front;
    input.popFront();
    ++index;

    if (style & StringStyle.IncludeQuotes)
        app.put(quote);
    while (!isEoF(input))
    {
        if (isNewline(input))
        {
            app.put(popNewline(input, index));
            lineNumber++;
        }
        else if (input.front == '\\' && !isWysiwyg)
        {
            // Backslashes have no special meaning in WYSIWYG strings; those
            // are handled by the generic branch below.
            if (style & StringStyle.NotEscaped)
            {
                auto r = input.save();
                r.popFront();
                if (!r.isEoF() && (r.front == quote || r.front == '\\'))
                {
                    // Keep the pair together so that an escaped quote or
                    // backslash does not end or confuse the literal
                    app.put('\\');
                    app.put(r.front);
                    input.popFront();
                    input.popFront();
                    index += 2;
                }
                else
                {
                    app.put('\\');
                    input.popFront();
                    ++index;
                }
            }
            else
                app.put(interpretEscapeSequence(input, index));
        }
        else if (input.front == quote)
        {
            if (style & StringStyle.IncludeQuotes)
                app.put(quote);
            input.popFront();
            ++index;
            break;
        }
        else
        {
            app.put(input.front);
            input.popFront();
            ++index;
        }
    }
    if (!input.isEoF())
    {
        switch (input.front)
        {
        case 'w':
            t.type = TokenType.WStringLiteral;
            goto case 'c';
        case 'd':
            t.type = TokenType.DStringLiteral;
            goto case 'c';
        case 'c':
            if (style & StringStyle.IncludeQuotes)
                app.put(input.front);
            input.popFront();
            ++index;
            break;
        default:
            break;
        }
    }
    t.value = to!string(app.data);
    return t;
}

unittest
{
    uint l = 1;
    uint i;
    auto a = `"abcde"`;
    assert (lexString(a, i, l) == "abcde");
    auto b = "\"ab\\ncd\"";
    assert (lexString(b, i, l) == "ab\ncd");
    auto c = "`abc\\ndef`";
    assert (lexString(c, i, l, StringStyle.NotEscaped) == "abc\\ndef");
    auto d = `"12345"w`;
    assert (lexString(d, i, l).type == TokenType.WStringLiteral);
    auto e = `"abc"c`;
    assert (lexString(e, i, l).type == TokenType.StringLiteral);
    auto f = `"abc"d`;
    assert (lexString(f, i, l).type == TokenType.DStringLiteral);
    auto g = "\"a\nb\"";
    assert (lexString(g, i, l) == "a\nb");
}

unittest
{
    // WYSIWYG strings keep backslashes verbatim even when escapes are
    // otherwise processed, and the 'r' prefix counts towards the index
    uint l = 1;
    uint i;
    auto a = `r"a\nb"`;
    auto ar = lexString(a, i, l);
    assert (ar.value == `a\nb`);
    assert (a == "");
    assert (i == 7);
    // A backslash at the very end of the input must not read past it
    uint j;
    auto b = "\"abc\\";
    auto br = lexString(b, j, l, StringStyle.NotEscaped);
    assert (br.value == "abc\\");
}
/*
 * Lexes a delimited string literal: either bracket-delimited, e.g.
 * q"[text]", or heredoc style, i.e. q"IDENT, a newline, the text, and
 * IDENT".
 * Params:
 *     input = the input, positioned on the 'q'; advanced past the literal
 *         and its optional c, w or d suffix
 *     index = incremented for every character consumed
 *     lineNumber = incremented for every line terminator inside the literal
 *     stringStyle = with IncludeQuotes the token value is the literal as
 *         written, otherwise only the text between the delimiters
 * Returns: a StringLiteral, WStringLiteral or DStringLiteral token. An
 * unterminated literal extends to the end of the input.
 */
Token lexDelimitedString(R)(ref R input, ref uint index,
    ref uint lineNumber, const StringStyle stringStyle = StringStyle.Default)
in
{
    assert(input.front == 'q');
}
body
{
    immutable bool quoted = (stringStyle & StringStyle.IncludeQuotes) != 0;
    auto app = appender!(ElementType!R[])();
    Token t;
    t.startIndex = index;
    t.lineNumber = lineNumber;
    t.type = TokenType.StringLiteral;

    input.popFront(); // q
    input.popFront(); // "
    index += 2;
    if (quoted)
    {
        app.put('q');
        app.put('"');
    }
    // Nothing follows the opening q": return what there is
    if (input.isEoF())
    {
        t.value = to!string(app.data);
        return t;
    }

    // Number of trailing elements of app that belong to the closing
    // delimiter and are dropped when quotes are not requested
    size_t trim = 0;

    bool heredoc;
    ElementType!R open;
    ElementType!R close;

    switch (input.front)
    {
    case '[': open = '['; close = ']'; break;
    case '{': open = '{'; close = '}'; break;
    case '(': open = '('; close = ')'; break;
    case '<': open = '<'; close = '>'; break;
    default: heredoc = true; break;
    }

    if (heredoc)
    {
        auto hereOpen = appender!(ElementType!(R)[])();
        while (!input.isEoF() && !std.uni.isWhite(input.front))
        {
            hereOpen.put(input.front());
            input.popFront();
            ++index;
        }
        // The identifier and the newline after it are part of the literal
        // as written, so they belong in the value when quotes are included
        if (quoted)
            app.put(hereOpen.data);
        if (!input.isEoF() && input.isNewline())
        {
            ++lineNumber;
            auto newline = input.popNewline(index);
            if (quoted)
                app.put(newline);
        }
        // else
        // this is an error
        while (!input.isEoF())
        {
            if (isNewline(input))
            {
                ++lineNumber;
                app.put(input.popNewline(index));
            }
            else if (input.front == '"' && app.data.endsWith(hereOpen.data))
            {
                app.put('"');
                ++index;
                input.popFront();
                // The closing identifier and quote were collected along
                // with the text; remember how much to cut off again
                trim = hereOpen.data.length + 1;
                break;
            }
            else
            {
                app.put(input.front);
                ++index;
                input.popFront();
            }
        }
    }
    else
    {
        if (quoted)
            app.put(input.front);
        input.popFront(); // the opening delimiter
        ++index;
        int depth = 1;
        while (depth > 0 && !input.isEoF())
        {
            if (isNewline(input))
            {
                ++lineNumber;
                app.put(popNewline(input, index));
            }
            else
            {
                if (input.front == close)
                {
                    --depth;
                    if (depth == 0)
                    {
                        input.popFront(); // the closing delimiter
                        ++index;
                        if (quoted)
                            app.put(close);
                        // Only consume the closing quote if it is there
                        if (!input.isEoF() && input.front == '"')
                        {
                            if (quoted)
                                app.put('"');
                            input.popFront();
                            ++index;
                        }
                        break;
                    }
                }
                else if (input.front == open)
                    ++depth;
                app.put(input.front);
                input.popFront();
                ++index;
            }
        }
    }
    if (!input.isEoF())
    {
        switch (input.front)
        {
        case 'w':
            t.type = TokenType.WStringLiteral;
            goto case 'c';
        case 'd':
            t.type = TokenType.DStringLiteral;
            goto case 'c';
        case 'c':
            if (quoted)
                app.put(input.front);
            input.popFront();
            ++index;
            break;
        default:
            break;
        }
    }
    // The value is assembled last so that a suffix is never lost
    auto data = app.data;
    t.value = to!string(quoted ? data : data[0 .. data.length - trim]);
    return t;
}
unittest
{
uint i;
uint l;
auto a = `q"{abc{}de}"`;
auto ar = lexDelimitedString(a, i, l);
assert (ar == "abc{}de");
assert (ar == TokenType.StringLiteral);
auto b = "q\"abcde\n123\nabcde\"w";
auto br = lexDelimitedString(b, i, l);
assert (br == "123\n");
assert (br == TokenType.WStringLiteral);
auto c = `q"[