// Written in the D programming language /** * This module contains a range-based _lexer for the D programming language. * * For performance reasons the _lexer contained in this module operates only on * ASCII and UTF-8 encoded source code. If the use of other encodings is * desired, the source code must be converted to UTF-8 before passing it to this * _lexer. * * To use the _lexer, create a LexerConfig struct * --- * LexerConfig config; * config.iterStyle = IterationStyle.everything; * config.tokenStyle = IterationStyle.source; * config.versionNumber = 2061; * config.vendorString = "Lexer Example"; * --- * Once you have configured the _lexer, call byToken$(LPAREN)$(RPAREN) on your * source code, passing in the configuration. * --- * auto source = "import std.stdio;"c; * auto tokens = byToken(source, config); * --- * The result of byToken$(LPAREN)$(RPAREN) is a forward range of tokens that can * be used easily with the algorithms from std.algorithm or iterated over with * $(D_KEYWORD foreach) * --- * assert (tokens.front.type == TokenType.import_); * assert (tokens.front.value == "import"); * assert (tokens.front.line == 1); * assert (tokens.front.startIndex == 0); * --- * * Examples: * * Generate HTML markup of D code. * --- * module highlighter; * * import std.stdio; * import std.array; * import std.d.lexer; * * void writeSpan(string cssClass, string value) * { * stdout.write(``, value.replace("&", "&").replace("<", "<"), ``); * } * * * // http://ethanschoonover.com/solarized * void highlight(R)(R tokens) * { * stdout.writeln(q"[ * *
* * * * *]");
*
* foreach (Token t; tokens)
* {
* if (isType(t.type))
* writeSpan("type", t.value);
* else if (isKeyword(t.type))
* writeSpan("kwrd", t.value);
* else if (t.type == TokenType.comment)
* writeSpan("com", t.value);
* else if (isStringLiteral(t.type))
* writeSpan("str", t.value);
* else if (isNumberLiteral(t.type))
* writeSpan("num", t.value);
* else if (isOperator(t.type))
* writeSpan("op", t.value);
* else
* stdout.write(t.value.replace("<", "<"));
* }
* stdout.writeln("\n");
* }
*
* void main(string[] args)
* {
* LexerConfig config;
* config.tokenStyle = TokenStyle.source;
* config.iterStyle = IterationStyle.everything;
* config.fileName = args[1];
* auto f = File(args[1]);
* (cast(ubyte[]) f.byLine(KeepTerminator.yes).join()).byToken(config).highlight();
* }
* ---
*
* Copyright: Brian Schott 2013
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors: Brian Schott
* Source: $(PHOBOSSRC std/d/_lexer.d)
*/
module std.d.lexer;
import std.algorithm;
import std.ascii;
import std.conv;
import std.d.entities;
import std.datetime;
import std.exception;
import std.range;
import std.string;
import std.traits;
import std.uni;
import std.utf;
import std.regex;
import std.container;
public:
/**
 * Represents a single D token.
 */
struct Token
{
    /**
     * The token type.
     */
    TokenType type;

    /**
     * The representation of the token in the original source code.
     */
    string value;

    /**
     * The number of the line the token is on.
     */
    uint line;

    /**
     * The column number of the start of the token in the original source.
     * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
     */
    uint column;

    /**
     * The index of the start of the token in the original source.
     * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
     */
    uint startIndex;

    /**
     * Check to see if the token is of the same type and has the same string
     * representation as the given token.
     */
    bool opEquals(ref const(Token) other) const
    {
        return other.type == type && other.value == value;
    }

    /**
     * Checks to see if the token's string representation is equal to the given
     * string.
     */
    bool opEquals(string value) const { return this.value == value; }

    /**
     * Checks to see if the token is of the given type.
     */
    // Fixed: previously compared the parameter to itself (`type == type`),
    // which made this overload always return true.
    bool opEquals(TokenType type) const { return this.type == type; }

    /**
     * Comparison operator orders tokens by start index.
     */
    int opCmp(size_t i) const
    {
        if (startIndex < i) return -1;
        if (startIndex > i) return 1;
        return 0;
    }
}
/**
 * Configure the behavior of the byToken() function. These flags may be
 * combined using a bitwise or.
 */
enum IterationStyle
{
    /// Only include code, not whitespace or comments
    codeOnly = 0,
    /// Includes comments
    includeComments = 0b0001,
    /// Includes whitespace
    includeWhitespace = 0b0010,
    /// Include $(LINK2 http://dlang.org/lex.html#specialtokens, special tokens)
    includeSpecialTokens = 0b0100,
    /// Do not stop iteration on reaching the ___EOF__ token
    ignoreEOF = 0b1000,
    /// Include everything
    // Fixed: includeSpecialTokens was previously missing, so "everything"
    // silently dropped special token sequences (e.g. #line directives),
    // which broke faithful source reproduction for syntax highlighting.
    everything = includeComments | includeWhitespace | includeSpecialTokens | ignoreEOF
}
/**
 * Configuration of the token lexing style. These flags may be combined with a
 * bitwise or.
 */
enum TokenStyle : uint
{
    /**
     * Escape sequences will be replaced with their equivalent characters and
     * enclosing quote characters will not be included in string tokens.
     * Special tokens such as __VENDOR__ will be replaced with their
     * equivalent strings. Useful for creating a compiler or interpreter.
     */
    default_ = 0b0000,
    /**
     * Escape sequences will not be processed. An escaped quote character will
     * not terminate string lexing, but it will not be replaced with the quote
     * character in the token.
     */
    notEscaped = 0b0001,
    /**
     * Strings will include their opening and closing quote characters as well
     * as any prefixes or suffixes $(LPAREN)e.g.: $(D_STRING "abcde"w) will
     * include the $(D_STRING 'w') character as well as the opening and closing
     * quotes$(RPAREN)
     */
    includeQuotes = 0b0010,
    /**
     * Do not replace the value field of special tokens such as ___DATE__
     * with their string equivalents.
     */
    doNotReplaceSpecial = 0b0100,
    /**
     * Strings will be read exactly as they appeared in the source, including
     * their opening and closing quote characters. Useful for syntax
     * highlighting.
     */
    source = notEscaped | includeQuotes | doNotReplaceSpecial
}
/**
 * Lexer configuration.
 */
struct LexerConfig
{
    /**
     * Iteration style. Controls which token categories byToken() yields.
     */
    IterationStyle iterStyle = IterationStyle.codeOnly;

    /**
     * Token style. Controls how token values are rendered.
     */
    // Fixed: previously written as `tokenStyle.default_` (member lookup
    // through the field being declared); use the type name for clarity.
    TokenStyle tokenStyle = TokenStyle.default_;

    /**
     * Replacement for the ___VERSION__ token. Defaults to 100.
     */
    uint versionNumber = 100;

    /**
     * Replacement for the ___VENDOR__ token. Defaults to $(D_STRING "std.d.lexer")
     */
    string vendorString = "std.d.lexer";

    /**
     * Name used when creating error messages that are sent to errorFunc. This
     * is needed because the lexer operates on any forward range of ASCII
     * characters or UTF-8 code units and does not know what to call its input
     * source. Defaults to the empty string.
     */
    string fileName = "";

    /**
     * This function is called when an error is encountered during lexing.
     * Parameters are file name, code unit index, line number, column,
     * and error message.
     */
    void delegate(string, uint, uint, uint, string) errorFunc;

    /**
     * Initial size of the lexer's internal token buffer in bytes. The lexer
     * will grow this buffer if necessary.
     */
    size_t bufferSize = 1024 * 4;
}
/**
 * Iterate over the given range of characters by D tokens.
 * Params:
 *     range = the range of characters
 *     config = the lexer configuration
 * Returns:
 *     an input range of tokens
 */
TokenRange!(R) byToken(R)(R range, LexerConfig config) if (isForwardRange!(R))
{
    auto r = TokenRange!(R)(range);
    r.config = config;
    // The constructor ran with a default-initialized config, so the internal
    // buffer was sized from the default bufferSize. Honor the caller's
    // requested size by re-allocating if it differs. (Access to the private
    // member is legal here because byToken lives in the same module.)
    if (r.buffer.length != config.bufferSize)
        r.buffer = new ubyte[config.bufferSize];
    r.lineNumber = 1;
    r.popFront();
    return r;
}
/**
* Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate.
*/
struct TokenRange(R) if (isForwardRange!(R))
{
/**
 * Returns: true if the range has been exhausted and no more tokens are
 * available.
 */
bool empty() const @property
{
    return _empty;
}
/**
 * Returns: the current token.
 * Throws: Exception if called when the range is empty.
 */
Token front() const @property
{
    enforce(!_empty, "Cannot call front() on empty token range");
    return current;
}
/**
 * Returns the current token and advances the range past it.
 */
Token moveFront()
{
    Token tok = front;
    popFront();
    return tok;
}
/**
 * Range operation: foreach over tokens.
 */
int opApply(int delegate(Token) dg)
{
    // Walk the range, handing each token to the loop body; a non-zero
    // return from the delegate terminates iteration early.
    for (; !empty; popFront())
    {
        immutable rc = dg(front);
        if (rc != 0)
            return rc;
    }
    return 0;
}
/**
 * Range operation: foreach over (index, token) pairs.
 */
int opApply(int delegate(size_t, Token) dg)
{
    int result = 0;
    // Fixed: the index was previously declared `int` and, more importantly,
    // never incremented, so the delegate always received 0.
    size_t i = 0;
    while (!empty)
    {
        result = dg(i, front);
        if (result)
            break;
        ++i;
        popFront();
    }
    return result;
}
/**
 * Removes the current token from the range.
 */
void popFront()
{
    // Keep lexing tokens until we produce one that the configured
    // iteration style does not filter out, or the input is exhausted.
    do
    {
        advance();
        if (current.type == TokenType.whitespace)
        {
            if (config.iterStyle & IterationStyle.includeWhitespace)
                return;
        }
        else if (current.type == TokenType.comment)
        {
            if (config.iterStyle & IterationStyle.includeComments)
                return;
        }
        else if (current.type == TokenType.specialTokenSequence)
        {
            if (config.iterStyle & IterationStyle.includeSpecialTokens)
                return;
        }
        else
            return;
    }
    while (!empty());
}
private:
/*
 * Constructs the token range over the given character source.
 * NOTE(review): `config` is still default-initialized at this point --
 * byToken() assigns the caller's config only after construction -- so the
 * buffer here is sized from LexerConfig's default bufferSize, not the
 * caller's value. Verify callers account for this.
 */
this(ref R range)
{
    this.range = range;
    buffer = new ubyte[config.bufferSize];
    // `cache` is declared elsewhere in this struct; presumably a
    // string-interning cache -- confirm against the rest of the file.
    cache.initialize();
}
/*
 * Advances the range to the next token, filling in the fields of `current`.
 * Dispatches on the first character: whitespace, operators (via the
 * generated trie), comments, numbers, the various string-literal forms,
 * special token sequences, and finally identifiers/keywords.
 */
void advance()
{
    if (range.empty)
    {
        _empty = true;
        return;
    }
    bufferIndex = 0;
    current.line = lineNumber;
    current.startIndex = index;
    current.column = column;
    current.value = null;
    if (std.ascii.isWhite(range.front))
    {
        lexWhitespace();
        return;
    }
    outer: switch (range.front)
    {
    // Operator tokens are matched by a compile-time generated trie.
    // Note: keywords, built-in type names, and special tokens such as
    // __LINE__ are deliberately NOT part of this trie; they are handled
    // by lookupTokenType() in the default case below.
    mixin(generateCaseTrie(
        "=", "TokenType.assign",
        "@", "TokenType.at",
        "&", "TokenType.bitAnd",
        "&=", "TokenType.bitAndEquals",
        "|", "TokenType.bitOr",
        "|=", "TokenType.bitOrEquals",
        "~=", "TokenType.catEquals",
        ":", "TokenType.colon",
        ",", "TokenType.comma",
        "--", "TokenType.decrement",
        "$", "TokenType.dollar",
        "==", "TokenType.equals",
        "=>", "TokenType.goesTo",
        ">", "TokenType.greater",
        ">=", "TokenType.greaterEqual",
        "++", "TokenType.increment",
        "{", "TokenType.lBrace",
        "[", "TokenType.lBracket",
        "<", "TokenType.less",
        "<=", "TokenType.lessEqual",
        "<>=", "TokenType.lessEqualGreater",
        "<>", "TokenType.lessOrGreater",
        "&&", "TokenType.logicAnd",
        "||", "TokenType.logicOr",
        "(", "TokenType.lParen",
        "-", "TokenType.minus",
        "-=", "TokenType.minusEquals",
        "%", "TokenType.mod",
        "%=", "TokenType.modEquals",
        "*=", "TokenType.mulEquals",
        "!", "TokenType.not",
        "!=", "TokenType.notEquals",
        "!>", "TokenType.notGreater",
        "!>=", "TokenType.notGreaterEqual",
        "!<", "TokenType.notLess",
        "!<=", "TokenType.notLessEqual",
        "!<>", "TokenType.notLessEqualGreater",
        "+", "TokenType.plus",
        "+=", "TokenType.plusEquals",
        "^^", "TokenType.pow",
        "^^=", "TokenType.powEquals",
        "}", "TokenType.rBrace",
        "]", "TokenType.rBracket",
        ")", "TokenType.rParen",
        ";", "TokenType.semicolon",
        "<<", "TokenType.shiftLeft",
        "<<=", "TokenType.shiftLeftEqual",
        ">>", "TokenType.shiftRight",
        ">>=", "TokenType.shiftRightEqual",
        "*", "TokenType.star",
        "?", "TokenType.ternary",
        "~", "TokenType.tilde",
        "!<>=", "TokenType.unordered",
        ">>>", "TokenType.unsignedShiftRight",
        ">>>=", "TokenType.unsignedShiftRightEqual",
        "^", "TokenType.xor",
        "^=", "TokenType.xorEquals",
    ));
    case '/':
        // '/' may start a comment, "/=", or be plain division; look ahead.
        auto r = range.save();
        r.popFront();
        if (r.isEoF())
        {
            current.type = TokenType.div;
            current.value = "/";
            range.popFront();
            ++index;
            break;
        }
        switch (r.front)
        {
        case '/':
        case '*':
        case '+':
            lexComment();
            break outer;
        case '=':
            current.type = TokenType.divEquals;
            current.value = "/=";
            range.popFront();
            range.popFront();
            index += 2;
            break outer;
        default:
            current.type = TokenType.div;
            current.value = "/";
            ++index;
            range.popFront();
            break outer;
        }
    case '.':
        // '.' may start a float literal (".5"), a slice (".."), varargs
        // ("..."), or be a plain dot; look ahead to decide.
        auto r = range.save();
        r.popFront();
        if (r.isEoF())
        {
            current.type = TokenType.dot;
            current.value = getTokenValue(TokenType.dot);
            range.popFront();
            ++index;
            break outer;
        }
        else if (r.front >= '0' && r.front <= '9')
        {
            lexNumber();
            break outer;
        }
        else if (r.front == '.')
        {
            current.type = TokenType.slice;
            r.popFront();
            // Fixed: guard against ".." appearing at the very end of input
            // before dereferencing the lookahead range again.
            if (!r.isEoF() && r.front == '.')
            {
                current.type = TokenType.vararg;
                range.popFront();
                range.popFront();
                range.popFront();
                index += 3;
            }
            else
            {
                range.popFront();
                range.popFront();
                index += 2;
            }
            current.value = getTokenValue(current.type);
        }
        else
        {
            range.popFront();
            // Fixed: the index was not advanced in this branch, causing
            // startIndex values of all following tokens to drift by one.
            ++index;
            current.type = TokenType.dot;
            current.value = getTokenValue(TokenType.dot);
        }
        break;
    case '0': .. case '9':
        lexNumber();
        break;
    case '\'':
    case '"':
    case '`':
        lexString();
        break;
    case 'q':
        // q{...} token strings and q"..." delimited strings; otherwise an
        // identifier starting with 'q'.
        auto r = range.save;
        r.popFront();
        if (!r.isEoF() && r.front == '{')
        {
            lexTokenString();
            break;
        }
        else if (!r.isEoF() && r.front == '"')
        {
            lexDelimitedString();
            break;
        }
        else
            goto default;
    case 'r':
        // r"..." wysiwyg strings; otherwise an identifier starting with 'r'.
        auto r = range.save();
        r.popFront();
        if (!r.isEoF() && r.front == '"')
        {
            lexString();
            break;
        }
        else
            goto default;
    case 'x':
        // x"..." hex strings; otherwise an identifier starting with 'x'.
        auto r = range.save();
        r.popFront();
        if (!r.isEoF() && r.front == '"')
        {
            lexHexString();
            break;
        }
        else
            goto default;
    case '#':
        lexSpecialTokenSequence();
        break;
    default:
        // Identifier, keyword, or special token: consume up to the next
        // separating character and classify the result.
        while (!range.isEoF() && !isSeparating(range.front))
        {
            keepChar();
        }
        current.type = lookupTokenType(cast(char[]) buffer[0 .. bufferIndex]);
        current.value = getTokenValue(current.type);
        if (current.value is null)
            setTokenValue();
        if (!(config.iterStyle & IterationStyle.ignoreEOF) && current.type == TokenType.eof)
        {
            _empty = true;
            return;
        }
        // Fixed: this previously tested `config.iterStyle` against a
        // TokenStyle flag AND had the condition inverted, so with the
        // default configuration special tokens were never replaced even
        // though TokenStyle.default_ documents that they are. Skip the
        // replacement only when doNotReplaceSpecial is set.
        if (config.tokenStyle & TokenStyle.doNotReplaceSpecial)
            break;
        switch (current.type)
        {
        case TokenType.date:
            current.type = TokenType.stringLiteral;
            auto time = Clock.currTime();
            current.value = format("%s %02d %04d", time.month, time.day, time.year);
            break;
        case TokenType.time:
            auto time = Clock.currTime();
            current.type = TokenType.stringLiteral;
            current.value = (cast(TimeOfDay)(time)).toISOExtString();
            break;
        case TokenType.timestamp:
            auto time = Clock.currTime();
            auto dt = cast(DateTime) time;
            current.type = TokenType.stringLiteral;
            current.value = format("%s %s %02d %02d:%02d:%02d %04d",
                dt.dayOfWeek, dt.month, dt.day, dt.hour, dt.minute,
                dt.second, dt.year);
            break;
        case TokenType.vendor:
            current.type = TokenType.stringLiteral;
            current.value = config.vendorString;
            break;
        case TokenType.compilerVersion:
            current.type = TokenType.stringLiteral;
            current.value = format("%d", config.versionNumber);
            break;
        case TokenType.line:
            current.type = TokenType.intLiteral;
            current.value = format("%d", current.line);
            break;
        case TokenType.file:
            current.type = TokenType.stringLiteral;
            current.value = config.fileName;
            break;
        default:
            break;
        }
        break;
    }
}
/*
 * Consumes a run of whitespace characters as a single whitespace token.
 */
void lexWhitespace()
{
    current.type = TokenType.whitespace;
    while (!range.isEoF() && std.ascii.isWhite(range.front))
        keepChar();
    // The token value only matters when whitespace is actually iterated.
    if (config.iterStyle & IterationStyle.includeWhitespace)
        setTokenValue();
}
/*
 * Lexes a line comment (//), block comment (/&#42; ... &#42;/), or nesting
 * comment (/+ ... +/). Assumes the caller has verified that the character
 * after the leading '/' is one of '/', '*', or '+'.
 */
void lexComment()
in
{
    assert (range.front == '/');
}
body
{
    current.type = TokenType.comment;
    keepChar();
    switch (range.front)
    {
    case '/':
        // Line comment: run to end of line or end of input.
        while (!isEoF(range) && !isNewline(range))
        {
            keepChar();
        }
        break;
    case '*':
        // Block comment: run to the closing "*/".
        while (!isEoF(range))
        {
            if (range.front == '*')
            {
                keepChar();
                // Fixed: guard against the input ending directly after '*'
                // before dereferencing the front of the range.
                if (!isEoF(range) && range.front == '/')
                {
                    keepChar();
                    break;
                }
            }
            else
                keepChar();
        }
        break;
    case '+':
        // Nesting comment: track depth across nested /+ ... +/ pairs.
        int depth = 1;
        while (depth > 0 && !isEoF(range))
        {
            if (range.front == '+')
            {
                keepChar();
                // Fixed: EoF guard before reading the next character.
                if (!isEoF(range) && range.front == '/')
                {
                    keepChar();
                    --depth;
                }
            }
            else if (range.front == '/')
            {
                keepChar();
                // Fixed: EoF guard before reading the next character.
                if (!isEoF(range) && range.front == '+')
                {
                    keepChar();
                    ++depth;
                }
            }
            else
                keepChar();
        }
        break;
    default:
        assert(false);
    }
    if (config.iterStyle & IterationStyle.includeComments)
        setTokenValue();
}
/*
 * Lexes a hex string literal of the form x"0A 1B ...". In escaped mode the
 * hex digit pairs are decoded into characters; in notEscaped mode the raw
 * source text (optionally including quotes) is kept.
 */
void lexHexString()
in
{
    assert (range.front == 'x');
}
body
{
    current.type = TokenType.stringLiteral;
    keepChar(); // consume 'x'
    keepChar(); // consume '"'
    while (true)
    {
        if (range.isEoF())
        {
            errorMessage("Unterminated hex string literal");
            return;
        }
        else if (isHexDigit(range.front))
        {
            keepChar();
        }
        else if (std.ascii.isWhite(range.front) && (config.tokenStyle & TokenStyle.notEscaped))
        {
            // Whitespace between digit pairs is only preserved in raw mode.
            keepChar();
        }
        else if (range.front == '"')
        {
            keepChar();
            break;
        }
        else
        {
            errorMessage(format("Invalid character '%s' in hex string literal",
                cast(char) range.front));
            return;
        }
    }
    lexStringSuffix();
    if (config.tokenStyle & TokenStyle.notEscaped)
    {
        if (config.tokenStyle & TokenStyle.includeQuotes)
            setTokenValue();
        else
            setTokenValue(bufferIndex - 1, 2); // strip x" prefix and closing "
    }
    else
    {
        // Decode hex digit pairs into characters.
        // NOTE(review): this path assumes an even number of digits and no
        // interleaved whitespace in buffer[2 .. bufferIndex - 1] -- verify,
        // since whitespace is only consumed in the notEscaped branch above.
        auto a = appender!(ubyte[])();
        foreach (b; std.range.chunks(buffer[2 .. bufferIndex - 1], 2))
        {
            string s = to!string(cast(char[]) b);
            a.put(cast(ubyte[]) to!string(cast(dchar) parse!uint(s, 16)));
        }
        current.value = to!string(cast(char[]) a.data);
    }
}
/*
 * Lexes a numeric literal, dispatching to the hex, binary, or decimal
 * lexer based on the leading characters ("0x"/"0X", "0b"/"0B", or digits).
 */
void lexNumber()
in
{
    assert(isDigit(cast(char) range.front) || range.front == '.');
}
body
{
    // hex and binary can start with zero, anything else is decimal
    if (range.front != '0')
    {
        lexDecimal();
        return;
    }
    auto r = range.save();
    r.popFront();
    // Fixed: a lone "0" at the end of input previously dereferenced the
    // front of an empty lookahead range.
    if (r.isEoF())
    {
        lexDecimal();
        return;
    }
    switch (r.front)
    {
    case 'x':
    case 'X':
        keepChar(); // '0'
        keepChar(); // 'x'/'X'
        lexHex();
        break;
    case 'b':
    case 'B':
        keepChar(); // '0'
        keepChar(); // 'b'/'B'
        lexBinary();
        break;
    default:
        lexDecimal();
        break;
    }
}
/*
 * Consumes an optional floating point suffix ('L', 'f'/'F', optionally
 * followed by 'i' for imaginary literals) and sets the token type.
 */
void lexFloatSuffix()
{
    immutable c = range.front;
    if (c == 'L')
    {
        keepChar();
        current.type = TokenType.doubleLiteral;
    }
    else if (c == 'f' || c == 'F')
    {
        keepChar();
        current.type = TokenType.floatLiteral;
    }
    // 'i' marks an imaginary literal and may follow either suffix.
    if (!range.isEoF() && range.front == 'i')
    {
        keepChar();
        if (current.type == TokenType.floatLiteral)
            current.type = TokenType.ifloatLiteral;
        else
            current.type = TokenType.idoubleLiteral;
    }
}
/*
 * Consumes an optional integer suffix ('u'/'U' and/or 'L', each at most
 * once, in either order) and widens the token type accordingly.
 */
void lexIntSuffix()
{
    bool sawUnsigned;
    bool sawLong;
    while (!range.isEoF())
    {
        immutable c = range.front;
        if (c == 'u' || c == 'U')
        {
            if (sawUnsigned)
                return;
            if (current.type == TokenType.intLiteral)
                current.type = TokenType.uintLiteral;
            else if (current.type == TokenType.longLiteral)
                current.type = TokenType.ulongLiteral;
            else
                return;
            keepChar();
            sawUnsigned = true;
        }
        else if (c == 'L')
        {
            if (sawLong)
                return;
            if (current.type == TokenType.intLiteral)
                current.type = TokenType.longLiteral;
            else if (current.type == TokenType.uintLiteral)
                current.type = TokenType.ulongLiteral;
            else
                return;
            keepChar();
            sawLong = true;
        }
        else
            return;
    }
}
/*
 * Lexes the exponent part of a floating point literal: the 'e'/'E' (or
 * 'p'/'P' for hex floats) marker, an optional sign, digits/underscores,
 * and an optional float suffix.
 */
void lexExponent()
in
{
    assert (range.front == 'e' || range.front == 'E' || range.front == 'p'
        || range.front == 'P');
}
body
{
    keepChar();
    bool foundSign = false;
    while (!range.isEoF())
    {
        switch (range.front)
        {
        case '-':
        case '+':
            if (foundSign)
                return;
            foundSign = true;
            keepChar();
            // Fixed: this case previously fell through into the digit case,
            // unconditionally consuming one extra character after the sign
            // (e.g. swallowing a suffix or separator). Implicit fallthrough
            // here is also rejected by modern D compilers.
            break;
        case '0': .. case '9':
        case '_':
            keepChar();
            break;
        case 'L':
        case 'f':
        case 'F':
        case 'i':
            lexFloatSuffix();
            return;
        default:
            return;
        }
    }
}
/*
 * Lexes a decimal integer or floating point literal. Starts as an
 * intLiteral and upgrades to doubleLiteral when a decimal point is seen;
 * dispatches to the suffix/exponent lexers as needed.
 */
void lexDecimal()
in
{
    assert ((range.front >= '0' && range.front <= '9') || range.front == '.');
}
body
{
    bool foundDot = false;
    current.type = TokenType.intLiteral;
    scope(exit) setTokenValue();
    decimalLoop: while (!range.isEoF())
    {
        switch (range.front)
        {
        case '0': .. case '9':
        case '_':
            keepChar();
            break;
        case 'i':
        case 'L':
            // These end the literal: a float suffix if we already saw a
            // dot, an integer suffix otherwise.
            if (foundDot)
            {
                lexFloatSuffix();
                return;
            }
            else
            {
                lexIntSuffix();
                return;
            }
        case 'f':
        case 'F':
            lexFloatSuffix();
            return;
        case 'e':
        case 'E':
            lexExponent();
            return;
        case '.':
            // Look ahead one character to distinguish a decimal point from
            // a slice ("..") or varargs ("...") operator following the number.
            auto r = range.save();
            r.popFront();
            if (!r.isEoF() && r.front == '.')
                break decimalLoop; // possibly slice expression
            if (foundDot)
                break decimalLoop; // two dots with other characters between them
            keepChar();
            foundDot = true;
            current.type = TokenType.doubleLiteral;
            break;
        default:
            break decimalLoop;
        }
    }
}
/*
 * Lexes the digits of a binary integer literal (after the "0b"/"0B"
 * prefix has already been consumed), including any integer suffix.
 */
void lexBinary()
{
    current.type = TokenType.intLiteral;
    scope(exit) setTokenValue();
    while (!range.isEoF())
    {
        immutable c = range.front;
        if (c == '0' || c == '1' || c == '_')
        {
            keepChar();
        }
        else if (c == 'u' || c == 'U' || c == 'L')
        {
            lexIntSuffix();
            return;
        }
        else
            break;
    }
}
/*
 * Lexes the digits of a hexadecimal literal (after the "0x"/"0X" prefix
 * has already been consumed). Mirrors lexDecimal: upgrades to
 * doubleLiteral on a decimal point, dispatches to suffix/exponent lexers.
 */
void lexHex()
{
    current.type = TokenType.intLiteral;
    scope(exit) setTokenValue();
    bool foundDot;
    hexLoop: while (!range.isEoF())
    {
        switch (range.front)
        {
        case 'a': .. case 'f':
        case 'A': .. case 'F':
        case '0': .. case '9':
        case '_':
            keepChar();
            break;
        case 'i':
        case 'L':
            // Float suffix after a dot, integer suffix otherwise.
            if (foundDot)
            {
                lexFloatSuffix();
                return;
            }
            else
            {
                lexIntSuffix();
                return;
            }
        case 'p':
        case 'P':
            // Hex floats use 'p'/'P' as the exponent marker.
            lexExponent();
            return;
        case '.':
            // Distinguish a decimal point from a following slice operator.
            auto r = range.save();
            r.popFront();
            if (!r.isEoF() && r.front == '.')
                break hexLoop; // slice expression
            if (foundDot)
                break hexLoop; // two dots with other characters between them
            keepChar();
            foundDot = true;
            current.type = TokenType.doubleLiteral;
            break;
        default:
            break hexLoop;
        }
    }
}
/*
 * Consumes an optional string suffix character ('c', 'w', or 'd') and
 * sets the corresponding string literal token type.
 */
void lexStringSuffix()
{
    current.type = TokenType.stringLiteral;
    if (range.isEoF())
        return;
    immutable c = range.front;
    if (c == 'w')
    {
        current.type = TokenType.wstringLiteral;
        keepChar();
    }
    else if (c == 'd')
    {
        current.type = TokenType.dstringLiteral;
        keepChar();
    }
    else if (c == 'c')
    {
        keepChar();
    }
}
/*
 * Lexes a single-quoted, double-quoted, backtick (wysiwyg), or r"..."
 * (wysiwyg) string literal, including its optional suffix. Quote trimming
 * for the token value is handled in the scope(exit) block.
 */
void lexString()
in
{
    assert (range.front == '\'' || range.front == '"' || range.front == '`' || range.front == 'r');
}
body
{
    current.type = TokenType.stringLiteral;
    bool isWysiwyg = range.front == 'r' || range.front == '`';
    if (range.front == 'r')
        keepChar(); // consume the 'r' prefix; the quote char follows
    scope (exit)
    {
        if (config.tokenStyle & TokenStyle.includeQuotes)
            setTokenValue();
        else
        {
            // Trim the surrounding quotes (and the 'r' prefix if present)
            // from the token value.
            if (buffer[0] == 'r')
                setTokenValue(bufferIndex - 1, 2);
            else
                setTokenValue(bufferIndex - 1, 1);
        }
    }
    auto quote = range.front;
    keepChar();
    while (true)
    {
        if (range.isEoF())
        {
            errorMessage("Unterminated string literal");
            return;
        }
        else if (range.front == '\\' && !isWysiwyg)
        {
            if (config.tokenStyle & TokenStyle.notEscaped)
            {
                // Raw mode: keep the backslash and, for \" or \\, the
                // escaped character too so an escaped quote does not end
                // the string. (The !isWysiwyg checks below are redundant
                // with the outer guard but preserved as written.)
                auto r = range.save();
                r.popFront();
                if (r.front == quote && !isWysiwyg)
                {
                    keepChar();
                    keepChar();
                }
                else if (r.front == '\\' && !isWysiwyg)
                {
                    keepChar();
                    keepChar();
                }
                else
                    keepChar();
            }
            else
                // Escaped mode: decode the escape sequence into the buffer.
                interpretEscapeSequence(range, index, buffer, bufferIndex);
        }
        else if (range.front == quote)
        {
            keepChar();
            break;
        }
        else
            keepChar();
    }
    lexStringSuffix();
}
/*
 * Lexes a q"..." delimited string, dispatching to the bracket-delimited
 * lexer for [, {, (, < delimiters, and the heredoc lexer otherwise.
 */
void lexDelimitedString()
in
{
    assert(range.front == 'q');
}
body
{
    current.type = TokenType.stringLiteral;
    keepChar(); // consume 'q'
    keepChar(); // consume '"'
    ubyte openChar;
    ubyte closeChar;
    bool isHeredoc;
    immutable c = range.front;
    if (c == '[')      { openChar = '['; closeChar = ']'; }
    else if (c == '{') { openChar = '{'; closeChar = '}'; }
    else if (c == '(') { openChar = '('; closeChar = ')'; }
    else if (c == '<') { openChar = '<'; closeChar = '>'; }
    else               { isHeredoc = true; }
    if (isHeredoc)
        lexHeredocString();
    else
        lexNormalDelimitedString(openChar, closeChar);
}
/*
 * Lexes the body of a bracket-delimited q"..." string, tracking nesting
 * depth of the open/close delimiter pair and expecting a closing '"'
 * after the delimiters balance.
 */
void lexNormalDelimitedString(ubyte open, ubyte close)
in
{
    assert(buffer[0 .. bufferIndex] == "q\"");
}
body
{
    current.type = TokenType.stringLiteral;
    int depth = 1;
    keepChar(); // consume the opening delimiter
    scope (exit)
    {
        if (config.tokenStyle & TokenStyle.includeQuotes)
            setTokenValue();
        else
            setTokenValue(bufferIndex - 2, 3); // strip q"<delim> and <delim>"
    }
    while (true)
    {
        if (range.isEoF())
        {
            errorMessage("Unterminated string literal");
            // Fixed: previously fell through after reporting the error and
            // dereferenced the empty range / looped forever.
            return;
        }
        if (range.front == open)
        {
            keepChar();
            ++depth;
        }
        else if (range.front == close)
        {
            keepChar();
            --depth;
            if (depth <= 0)
            {
                // Fixed: guard against EoF before reading the character
                // that should be the closing quote.
                if (range.isEoF())
                {
                    errorMessage("Unterminated string literal");
                    return;
                }
                else if (range.front == '"')
                {
                    keepChar();
                    return;
                }
                else
                {
                    errorMessage("Expected \" after balanced "
                        ~ cast(char) close ~ " but found "
                        ~ cast(char) range.front ~ " instead.");
                    return;
                }
            }
        }
        else
            keepChar();
    }
}
/*
 * Lexes a heredoc-style q"IDENT ... IDENT" string: reads the identifier
 * up to the first newline, then consumes text until that identifier
 * reappears at the current position, followed by the closing quote.
 */
void lexHeredocString()
in
{
    assert (buffer[0 .. bufferIndex] == "q\"");
}
body
{
    // Read the delimiter identifier (everything up to the newline).
    auto i = bufferIndex;
    while (true)
    {
        if (range.isEoF())
        {
            errorMessage("Unterminated string literal");
            return;
        }
        else if (isNewline(range))
        {
            keepChar();
            break;
        }
        else if (isSeparating(range.front))
        {
            errorMessage("Unterminated string literal - Separating");
            return;
        }
        else
            keepChar();
    }
    // NOTE(review): `ident` is a slice into `buffer`; if keepChar() grows
    // (reallocates) the buffer below, this slice refers to the old storage.
    // Comparisons still work by value, but verify the aliasing is intended.
    auto ident = buffer[i .. bufferIndex - 1];
    scope(exit)
    {
        if (config.tokenStyle & TokenStyle.includeQuotes)
            setTokenValue();
        else
        {
            // Trim q", the identifier, the newline after it, and any
            // trailing string suffix character from the token value.
            size_t b = 2 + ident.length;
            if (buffer[b] == '\r') ++b;
            if (buffer[b] == '\n') ++b;
            size_t e = bufferIndex;
            if (buffer[e - 1] == 'c' || buffer[e - 1] == 'd' || buffer[e - 1] == 'w')
                --e;
            setTokenValue(e, b);
        }
    }
    // Consume the body until the delimiter identifier reappears.
    while (true)
    {
        if (range.isEoF())
        {
            errorMessage("Unterminated string literal -- a");
            return;
        }
        else if (buffer[bufferIndex - ident.length .. bufferIndex] == ident)
        {
            if (range.front == '"')
            {
                keepChar();
                lexStringSuffix();
                return;
            }
            else
            {
                errorMessage("Unterminated string literal -- b");
                return;
            }
        }
        else
            keepChar();
    }
}
/*
 * Lexes a q{...} token string by running a nested lexer over the
 * remaining input and copying each token's source text into the buffer
 * until the braces balance.
 */
void lexTokenString()
in
{
    assert (range.front == 'q');
}
body
{
    current.type = TokenType.stringLiteral;
    size_t i;
    scope (exit)
    {
        if (config.tokenStyle & TokenStyle.includeQuotes)
            setTokenValue();
        else
            setTokenValue(bufferIndex - 1, 2); // strip q{ and }
    }
    keepChar(); // consume 'q'
    keepChar(); // consume '{'
    // Run a nested lexer in raw source mode so every token (including
    // whitespace and comments) is reproduced verbatim.
    LexerConfig c;
    c.iterStyle = IterationStyle.everything;
    c.tokenStyle = TokenStyle.source;
    auto r = byToken(range, c);
    r.index = index;
    int depth = 1;
    while (!r.empty)
    {
        if (r.front.type == TokenType.lBrace)
        {
            ++depth;
        }
        else if (r.front.type == TokenType.rBrace)
        {
            --depth;
            if (depth <= 0)
            {
                // The closing brace belongs to the token string itself and
                // is only kept when quotes are included in the value.
                if (config.tokenStyle & TokenStyle.includeQuotes)
                {
                    if (bufferIndex >= buffer.length)
                        buffer.length += 1024;
                    buffer[bufferIndex++] = '}';
                }
                r.popFront();
                break;
            }
        }
        // Append the token's source text to the buffer, growing as needed.
        if (bufferIndex + r.front.value.length > buffer.length)
            buffer.length += 1024;
        buffer[bufferIndex .. bufferIndex + r.front.value.length] = cast(ubyte[]) r.front.value;
        bufferIndex += r.front.value.length;
        r.popFront();
    }
    lexStringSuffix();
}
void lexSpecialTokenSequence()
in
{
assert (range.front == '#');
}
body
{
keepChar();
auto r = range.save();
auto app = appender!(ubyte[])();
app.put('#');
while (true)
{
if (r.isEoF())
{
errorMessage("Found EOF when interpreting special token sequence");
return;
}
else if (isNewline(r))
break;
else
{
app.put(r.front);
r.popFront();
}
}
auto m = match((cast(char[]) app.data),
`#line\s+(?P