// Written in the D programming language

/**
 * This module contains a range-based _lexer for the D programming language.
 *
 * For performance reasons the _lexer contained in this module operates only on
 * ASCII or UTF-8 encoded source code. If the use of other encodings is
 * desired, the source code must be converted to UTF-8 before passing it to
 * this _lexer.
 *
 * To use the _lexer, create a LexerConfig struct:
 * ---
 * LexerConfig config;
 * config.iterStyle = IterationStyle.everything;
 * config.tokenStyle = TokenStyle.source;
 * config.versionNumber = 2064;
 * config.vendorString = "Lexer Example";
 * ---
 * Once you have configured the _lexer, call byToken$(LPAREN)$(RPAREN) on your
 * source code, passing in the configuration.
 * ---
 * auto source = "import std.stdio;"c;
 * auto tokens = byToken(source, config);
 * ---
 * The result of byToken$(LPAREN)$(RPAREN) is an input range of tokens that can
 * be used easily with the algorithms from std.algorithm or iterated over with
 * $(D_KEYWORD foreach).
 * ---
 * assert (tokens.front.type == TokenType.import_);
 * assert (tokens.front.value == "import");
 * assert (tokens.front.line == 1);
 * assert (tokens.front.startIndex == 0);
 * ---
 *
 * Examples:
 *
 * Generate HTML markup of D code.
 * ---
 * module highlighter;
 *
 * import std.stdio;
 * import std.array;
 * import stdx.d.lexer;
 *
 * void writeSpan(string cssClass, string value)
 * {
 *     stdout.write(`<span class="`, cssClass, `">`,
 *         value.replace("&", "&amp;").replace("<", "&lt;"), `</span>`);
 * }
 *
 * // Color scheme: http://ethanschoonover.com/solarized
 * void highlight(R)(R tokens)
 * {
 *     stdout.writeln(q"[<!DOCTYPE html>
 * <html>
 * <head>
 * <meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
 * </head>
 * <body>
 * <style type="text/css">
 * html  { background-color: #fdf6e3; color: #002b36; }
 * .kwrd { color: #b58900; font-weight: bold;  }
 * .com  { color: #93a1a1; font-style: italic; }
 * .num  { color: #dc322f; font-weight: bold;  }
 * .str  { color: #2aa198; font-style: italic; }
 * .op   { color: #586e75; font-weight: bold;  }
 * .type { color: #268bd2; font-weight: bold;  }
 * .cons { color: #859900; font-weight: bold;  }
 * </style>
 * <pre>]");
*
 *     foreach (Token t; tokens)
 *     {
 *         if (isBuiltType(t.type))
 *             writeSpan("type", t.value);
 *         else if (isKeyword(t.type))
 *             writeSpan("kwrd", t.value);
 *         else if (t.type == TokenType.comment)
 *             writeSpan("com", t.value);
 *         else if (isStringLiteral(t.type))
 *             writeSpan("str", t.value);
 *         else if (isNumberLiteral(t.type))
 *             writeSpan("num", t.value);
 *         else if (isOperator(t.type))
 *             writeSpan("op", t.value);
 *         else
 *             stdout.write(t.value.replace("<", "&lt;"));
 *     }
 *     stdout.writeln("\n</pre>\n</body></html>");
 * }
 *
 * void main(string[] args)
 * {
 *     LexerConfig config;
 *     config.tokenStyle = TokenStyle.source;
 *     config.iterStyle = IterationStyle.everything;
 *     config.fileName = args[1];
 *     auto f = File(args[1]);
 *     (cast(ubyte[]) f.byLine(KeepTerminator.yes).join())
 *         .byToken(config)
 *         .highlight();
 * }
* ---
*
* Copyright: Brian Schott 2013
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors: Brian Schott, Dmitry Olshansky
* Source: $(PHOBOSSRC std/d/_lexer.d)
*/
module stdx.d.lexer;
import std.algorithm;
import std.ascii;
import std.conv;
import std.datetime;
import stdx.d.entities;
import std.exception;
import std.range;
import std.regex;
import std.string;
import std.traits;
import std.utf;
version (unittest) import std.stdio;
public:
/**
* Represents a D token
*/
struct Token
{
/**
* The representation of the token in the original source code.
*/
string value;
/**
* The index of the start of the token in the original source.
* $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
*/
size_t startIndex;
/**
* The number of the line the token is on.
*/
uint line;
/**
* The column number of the start of the token in the original source.
* $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
*/
ushort column;
/**
* The token type.
*/
TokenType type;
/**
* Check to see if the token is of the same type and has the same string
* representation as the given token.
*/
bool opEquals(ref const(Token) other) const nothrow pure
{
return other.type == type && other.value == value;
}
/**
* Checks to see if the token's string representation is equal to the given
* string.
*/
bool opEquals(string value) const nothrow pure
{
return this.value == value;
}
/**
* Checks to see if the token is of the given type.
*/
bool opEquals(TokenType type) const nothrow pure
{
return this.type == type;
}
/**
* Comparison operator orders tokens by start index.
*/
int opCmp(ref const(Token) other) const nothrow pure
{
if (startIndex < other.startIndex) return -1;
if (startIndex > other.startIndex) return 1;
return 0;
}
int opCmp(size_t index) const nothrow pure
{
if (startIndex < index) return -1;
if (startIndex > index) return 1;
return 0;
}
}
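// A minimal usage sketch (not part of the original API docs): the overloads
// above allow comparing a token against another token, a string, or a
// TokenType, and ordering tokens by start index.
unittest
{
    Token a;
    a.type = TokenType.import_;
    a.value = "import";
    a.startIndex = 0;
    assert (a == "import");            // opEquals(string)
    assert (a == TokenType.import_);   // opEquals(TokenType)
    Token b;
    b.value = "std";
    b.startIndex = 7;
    assert (a < b); // opCmp orders by startIndex
}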
/**
* Configure the behavior of the byToken() function. These flags may be
* combined using a bitwise or.
*/
enum IterationStyle : ushort
{
/// Only include code, not whitespace or comments
codeOnly = 0,
/// Includes comments
includeComments = 0b0001,
/// Includes whitespace
includeWhitespace = 0b0010,
/// Include $(LINK2 http://dlang.org/lex.html#specialtokens, special tokens)
includeSpecialTokens = 0b0100,
/// Do not stop iteration on reaching the $(D_KEYWORD ___EOF__) token
ignoreEOF = 0b1000,
/// Include _everything
everything = includeComments | includeWhitespace | ignoreEOF
}
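// A minimal sketch of combining the flags above. Note that D widens the
// result of | on enum members to the base type, so a cast back to
// IterationStyle is needed when combining flags by hand.
unittest
{
    LexerConfig config;
    config.iterStyle = cast(IterationStyle)
        (IterationStyle.includeComments | IterationStyle.includeWhitespace);
}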
/**
* Configuration of the token lexing style. These flags may be combined with a
* bitwise or.
*/
enum TokenStyle : ushort
{
/**
* Escape sequences will be replaced with their equivalent characters,
* enclosing quote characters will not be included. Special tokens such as
* $(D_KEYWORD ___VENDOR__) will be replaced with their equivalent strings.
* Useful for creating a compiler or interpreter.
*/
default_ = 0b0000,
/**
* Escape sequences will not be processed. An escaped quote character will
* not terminate string lexing, but it will not be replaced with the quote
* character in the token.
*/
notEscaped = 0b0001,
/**
* Strings will include their opening and closing quote characters as well
* as any prefixes or suffixes $(LPAREN)e.g.: $(D_STRING "abcde"w) will
* include the $(D_STRING 'w') character as well as the opening and closing
* quotes$(RPAREN)
*/
includeQuotes = 0b0010,
/**
* Do not replace the value field of the special tokens such as
* $(D_KEYWORD ___DATE__) with their string equivalents.
*/
doNotReplaceSpecial = 0b0100,
/**
* Strings will be read exactly as they appeared in the source, including
* their opening and closing quote characters. Useful for syntax
* highlighting.
*/
source = notEscaped | includeQuotes | doNotReplaceSpecial
}
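// A hedged sketch of the source style, assuming the byToken overloads below:
// with TokenStyle.source a string token keeps its quotes and its unprocessed
// escape sequences, exactly as written.
unittest
{
    LexerConfig config;
    config.tokenStyle = TokenStyle.source;
    auto tokens = byToken(cast(const(ubyte)[]) `"a\n"`, config);
    assert (tokens.front.value == `"a\n"`); // quotes and escape kept verbatim
}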
/**
* Lexer configuration
*/
struct LexerConfig
{
/**
* Iteration style
*/
IterationStyle iterStyle = IterationStyle.codeOnly;
/**
* Token style
*/
TokenStyle tokenStyle = TokenStyle.default_;
/**
* Replacement for the $(D_KEYWORD ___VERSION__) token. Defaults to 100.
*/
uint versionNumber = 100;
/**
* Replacement for the $(D_KEYWORD ___VENDOR__) token. Defaults to $(D_STRING "std.d.lexer")
*/
string vendorString = "std.d.lexer";
/**
* Name used when creating error messages that are sent to errorFunc. This
* is needed because the lexer operates on any forward range of ASCII
* characters or UTF-8 code units and does not know what to call its input
* source. Defaults to the empty string.
*/
string fileName = "";
/**
* This function is called when an error is encountered during lexing.
* Parameters are file name, code unit index, line number, column,
* and error message.
*/
void delegate(string, size_t, uint, uint, string) errorFunc;
}
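// A sketch of wiring up the error callback; the parameter order matches the
// documentation above (file name, code unit index, line, column, message).
unittest
{
    LexerConfig config;
    config.fileName = "example.d";
    config.errorFunc = delegate(string file, size_t index, uint line,
        uint column, string message)
    {
        // report the error and let lexing continue
        stderr.writefln("%s(%d,%d): %s", file, line, column, message);
    };
}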
/**
* Iterate over the given range of characters by D tokens.
* Params:
* range = the range of characters
* config = the lexer configuration
* bufferSize = initial size of internal circular buffer
* Returns:
* an input range of tokens
*/
auto byToken(R)(R range, LexerConfig config, size_t bufferSize = 4*1024)
if (isForwardRange!(R) && !isRandomAccessRange!(R)
&& is(ElementType!R : const(ubyte)))
{
// 4K of circular buffer by default
auto r = TokenRange!(typeof(lexerSource(range)))
(lexerSource(range, bufferSize), config);
r.config = config;
r.lineNumber = 1;
r.popFront();
return r;
}
///ditto
auto byToken(R)(R range, LexerConfig config)
if (isRandomAccessRange!(R) && is(ElementType!R : const(ubyte)))
{
auto r = TokenRange!(typeof(lexerSource(range)))
(lexerSource(range), config);
r.config = config;
r.lineNumber = 1;
r.popFront();
return r;
}
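// An end-to-end sketch of the overloads above, mirroring the example input
// from the module documentation.
unittest
{
    LexerConfig config;
    auto tokens = byToken(cast(const(ubyte)[]) "import std.stdio;", config);
    assert (tokens.front.type == TokenType.import_);
    assert (tokens.front.value == "import");
    tokens.popFront(); // whitespace is skipped under the default codeOnly style
    assert (tokens.front.value == "std");
}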
/**
* Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate.
*/
struct TokenRange(LexSrc)
//if ( is(LexSrc : LexSource!(U...), U...)) //check for LexSource
{
/**
* Returns: true if the range is empty
*/
bool empty() const @property
{
return _empty;
}
/**
* Returns: the current token
*/
ref const(Token) front() const @property
{
assert(!empty, "trying to get front of an empty token range");
return current;
}
/**
* Returns the current token and then removes it from the range
*/
Token moveFront()
{
auto r = move(current);
popFront();
return r;
}
/**
* Removes the current token from the range
*/
void popFront()
{
advance();
}
private:
/*
* Advances the range to the next token
*/
void advance()
{
L_advance:
if (src.empty)
{
_empty = true;
return;
}
src.mark(); // mark a start of a lexing "frame"
current.line = lineNumber;
current.startIndex = src.index;
current.column = column;
current.value = null;
switch (src.front)
{
// handle sentinels for end of input
case 0:
case 0x1a:
// TODO: check config flags, it's cheap
// since this branch at most is taken once per file
_empty = true;
return;
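// generateCaseTrie expands the string/TokenType pairs below into nested
// switch cases, so the longest matching operator is always preferred.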
mixin(generateCaseTrie(
"=", "TokenType.assign",
"@", "TokenType.at",
"&", "TokenType.bitAnd",
"&=", "TokenType.bitAndEqual",
"|", "TokenType.bitOr",
"|=", "TokenType.bitOrEqual",
"~=", "TokenType.catEqual",
":", "TokenType.colon",
",", "TokenType.comma",
"--", "TokenType.decrement",
"$", "TokenType.dollar",
"==", "TokenType.equal",
"=>", "TokenType.goesTo",
">", "TokenType.greater",
">=", "TokenType.greaterEqual",
"++", "TokenType.increment",
"{", "TokenType.lBrace",
"[", "TokenType.lBracket",
"<", "TokenType.less",
"<=", "TokenType.lessEqual",
"<>=", "TokenType.lessEqualGreater",
"<>", "TokenType.lessOrGreater",
"&&", "TokenType.logicAnd",
"||", "TokenType.logicOr",
"(", "TokenType.lParen",
"-", "TokenType.minus",
"-=", "TokenType.minusEqual",
"%", "TokenType.mod",
"%=", "TokenType.modEqual",
"*=", "TokenType.mulEqual",
"!", "TokenType.not",
"!=", "TokenType.notEqual",
"!>", "TokenType.notGreater",
"!>=", "TokenType.notGreaterEqual",
"!<", "TokenType.notLess",
"!<=", "TokenType.notLessEqual",
"!<>", "TokenType.notLessEqualGreater",
"+", "TokenType.plus",
"+=", "TokenType.plusEqual",
"^^", "TokenType.pow",
"^^=", "TokenType.powEqual",
"}", "TokenType.rBrace",
"]", "TokenType.rBracket",
")", "TokenType.rParen",
";", "TokenType.semicolon",
"<<", "TokenType.shiftLeft",
"<<=", "TokenType.shiftLeftEqual",
">>", "TokenType.shiftRight",
">>=", "TokenType.shiftRightEqual",
"*", "TokenType.star",
"?", "TokenType.ternary",
"~", "TokenType.tilde",
"!<>=", "TokenType.unordered",
">>>", "TokenType.unsignedShiftRight",
">>>=", "TokenType.unsignedShiftRightEqual",
"^", "TokenType.xor",
"^=", "TokenType.xorEqual"
));
case '/':
nextCharNonLF();
if (isEoF())
{
current.type = TokenType.div;
current.value = "/";
return;
}
switch (src.front)
{
case '/':
case '*':
case '+':
if (config.iterStyle & IterationStyle.includeComments)
return lexComment!true();
lexComment!false();
goto L_advance; // tail-recursion
case '=':
current.type = TokenType.divEqual;
current.value = "/=";
src.popFront();
return;
default:
current.type = TokenType.div;
current.value = "/";
return;
}
case '.':
if (!src.canPeek())
{
current.type = TokenType.dot;
current.value = tokenValue!(TokenType.dot);
return;
}
switch (src.peek())
{
case '0': .. case '9':
lexNumber();
return;
case '.':
nextCharNonLF();
nextCharNonLF();
current.type = TokenType.slice;
if (src.front == '.')
{
current.type = TokenType.vararg;
nextCharNonLF();
current.value = tokenValue!(TokenType.vararg);
}
else
current.value = tokenValue!(TokenType.slice);
return;
default:
nextCharNonLF();
current.type = TokenType.dot;
current.value = tokenValue!(TokenType.dot);
return;
}
case '0': .. case '9':
lexNumber();
return;
case '\'':
lexCharacterLiteral();
return;
case '"':
case '`':
lexString();
return;
case 'q':
nextCharNonLF();
if (isEoF())
goto default;
switch (src.front)
{
case '{':
lexTokenString();
return;
case '"':
lexDelimitedString();
return;
default:
break;
}
goto default;
case 'r':
nextCharNonLF();
if (isEoF())
goto default;
else if (src.front == '"')
{
lexString();
return;
}
else
goto default;
case 'x':
nextCharNonLF();
if (isEoF())
goto default;
else if (src.front == '"')
{
lexHexString();
return;
}
else
goto default;
case '#':
lexSpecialTokenSequence();
if(config.iterStyle & IterationStyle.includeSpecialTokens)
return;
goto L_advance; // tail-recursion
// "short" ASCII whites
case 0x20:
case 0x09: .. case 0x0d:
if (config.iterStyle & IterationStyle.includeWhitespace)
return lexWhitespace!true();
lexWhitespace!false();
goto L_advance; // tail-recursion
default:
if ((src.front & 0x80) && isLongWhite())
{
if (config.iterStyle & IterationStyle.includeWhitespace)
return lexWhitespace!true();
lexWhitespace!false();
goto L_advance; // tail-recursion
}
for(;;)
{
if(isSeparating())
break;
nextCharNonLF();
if(isEoF())
break;
}
current.type = lookupTokenType(src.slice);
current.value = getTokenValue(current.type);
if (current.value is null)
setTokenValue();
if (!(config.iterStyle & IterationStyle.ignoreEOF) && current.type == TokenType.specialEof)
{
_empty = true;
return;
}
if (config.tokenStyle & TokenStyle.doNotReplaceSpecial)
return;
expandSpecialToken();
}
}
// TODO: LexSource could be improved for forward ranges
// to avoid buffering at all (by disabling it for a moment)
// so keep the 'keep' parameter here and elsewhere
void lexWhitespace(bool keep)()
{
current.type = TokenType.whitespace;
do
{
nextChar();
} while (!isEoF() && isWhite());
static if (keep) setTokenValue();
}
void lexComment(bool keep)()
in
{
assert (src.front == '/' || src.front == '*' || src.front == '+');
}
body
{
current.type = TokenType.comment;
switch(src.front)
{
case '/':
while (!isEoF() && !isNewline(src.front))
{
nextCharNonLF();
}
break;
case '*':
while (!isEoF())
{
if (src.front == '*')
{
static if (keep) nextCharNonLF();
else src.popFront();
if (src.front == '/')
{
nextCharNonLF();
break;
}
}
else
nextChar();
}
break;
case '+':
int depth = 1;
while (depth > 0 && !isEoF())
{
if (src.front == '+')
{
nextCharNonLF();
if (src.front == '/')
{
nextCharNonLF();
--depth;
}
}
else if (src.front == '/')
{
nextCharNonLF();
if (src.front == '+')
{
nextCharNonLF();
++depth;
}
}
else
nextChar();
}
break;
default:
assert(false);
}
static if (keep)
setTokenValue();
}
void lexHexString()
in
{
assert (src.front == '"');
}
body
{
current.type = TokenType.stringLiteral;
nextChar();
while (true)
{
if (isEoF())
{
errorMessage("Unterminated hex string literal");
return;
}
else if (isHexDigit(src.front))
{
nextCharNonLF();
}
else if (isWhite() && (config.tokenStyle & TokenStyle.notEscaped))
{
nextChar();
}
else if (src.front == '"')
{
nextCharNonLF();
break;
}
else
{
errorMessage(format("Invalid character '%s' in hex string literal",
cast(char) src.front));
return;
}
}
bool hasSuffix = lexStringSuffix();
if (config.tokenStyle & TokenStyle.notEscaped)
{
if (config.tokenStyle & TokenStyle.includeQuotes)
setTokenValue();
else
setTokenValue(2, hasSuffix ? -2 : -1);
}
else
{
// TODO: appender allocates too eagerly;
// replace it with something leaner later
auto a = appender!(char[])();
foreach (b; std.range.chunks(src.slice[2 .. $ - 1], 2))
{
auto s = cast(char[])b;
ubyte ch = cast(ubyte)parse!uint(s, 16);
a.put(ch);
}
// can safely assume ownership of data
current.value = cast(string)a.data;
}
}
void lexNumber()
in
{
assert(isDigit(src.front) || src.front == '.');
}
body
{
if (src.front != '0')
{
lexDecimal();
return;
}
else
{
switch (src.peek())
{
case 'x':
case 'X':
nextCharNonLF();
nextCharNonLF();
lexHex();
break;
case 'b':
case 'B':
nextCharNonLF();
nextCharNonLF();
lexBinary();
break;
default:
lexDecimal();
break;
}
}
}
void lexFloatSuffix()
{
switch (src.front)
{
case 'L':
nextCharNonLF();
current.type = TokenType.doubleLiteral;
break;
case 'f':
case 'F':
nextCharNonLF();
current.type = TokenType.floatLiteral;
break;
default:
break;
}
if (!isEoF() && src.front == 'i')
{
nextCharNonLF();
if (current.type == TokenType.floatLiteral)
current.type = TokenType.ifloatLiteral;
else
current.type = TokenType.idoubleLiteral;
}
}
void lexIntSuffix()
{
bool foundU;
bool foundL;
while (!isEoF())
{
switch (src.front)
{
case 'u':
case 'U':
if (foundU)
return;
switch (current.type)
{
case TokenType.intLiteral:
current.type = TokenType.uintLiteral;
nextCharNonLF();
break;
case TokenType.longLiteral:
current.type = TokenType.ulongLiteral;
nextCharNonLF();
break;
default:
assert (false);
}
foundU = true;
break;
case 'L':
if (foundL)
return;
switch (current.type)
{
case TokenType.intLiteral:
current.type = TokenType.longLiteral;
nextCharNonLF();
break;
case TokenType.uintLiteral:
current.type = TokenType.ulongLiteral;
nextCharNonLF();
break;
default:
assert (false);
}
foundL = true;
break;
default:
return;
}
}
}
void lexExponent()
in
{
assert (src.front == 'e' || src.front == 'E' || src.front == 'p'
|| src.front == 'P');
}
body
{
nextCharNonLF();
bool foundSign = false;
bool foundDigit = false;
while (!isEoF())
{
switch (src.front)
{
case '-':
case '+':
if (foundSign)
{
if (!foundDigit)
errorMessage("Expected an exponent");
return;
}
foundSign = true;
nextCharNonLF();
break;
case '0': .. case '9':
case '_':
foundDigit = true;
nextCharNonLF();
break;
case 'L':
case 'f':
case 'F':
case 'i':
lexFloatSuffix();
return;
default:
if (!foundDigit)
errorMessage("Expected an exponent");
return;
}
}
}
void lexDecimal()
in
{
assert (isDigit(src.front) || src.front == '.');
}
body
{
bool foundDot = src.front == '.';
if (foundDot)
nextCharNonLF();
current.type = TokenType.intLiteral;
decimalLoop: while (!isEoF())
{
switch (src.front)
{
case '0': .. case '9':
case '_':
nextCharNonLF();
break;
case 'u':
case 'U':
if (!foundDot)
lexIntSuffix();
break decimalLoop;
case 'i':
lexFloatSuffix();
break decimalLoop;
case 'L':
if (foundDot)
lexFloatSuffix();
else
lexIntSuffix();
break decimalLoop;
case 'f':
case 'F':
lexFloatSuffix();
break decimalLoop;
case 'e':
case 'E':
lexExponent();
break decimalLoop;
case '.':
if (foundDot)
break decimalLoop;
if (src.canPeek() && src.peek() == '.')
break decimalLoop;
else
{
// The following bit of silliness tries to tell the
// difference between "int dot identifier" and
// "double identifier".
if (src.canPeek())
{
switch (src.peek())
{
case 'u': case 'U': case 'i': case 'L': case 'f': case 'F':
case 'e': case 'E':
break decimalLoop;
default:
goto doubleLiteral;
}
}
else
{
doubleLiteral:
nextCharNonLF();
foundDot = true;
current.type = TokenType.doubleLiteral;
}
}
break;
default:
break decimalLoop;
}
}
setTokenValue();
}
void lexBinary()
{
current.type = TokenType.intLiteral;
binaryLoop: while (!isEoF())
{
switch (src.front)
{
case '0':
case '1':
case '_':
nextCharNonLF();
break;
case 'u':
case 'U':
case 'L':
lexIntSuffix();
break binaryLoop;
default:
break binaryLoop;
}
}
setTokenValue();
}
void lexHex()
{
current.type = TokenType.intLiteral;
bool foundDot;
hexLoop: while (!isEoF())
{
switch (src.front)
{
case 'a': .. case 'f':
case 'A': .. case 'F':
case '0': .. case '9':
case '_':
nextCharNonLF();
break;
case 'u':
case 'U':
lexIntSuffix();
break hexLoop;
case 'i':
if (foundDot)
lexFloatSuffix();
break hexLoop;
case 'L':
if (foundDot)
{
lexFloatSuffix();
break hexLoop;
}
else
{
lexIntSuffix();
break hexLoop;
}
case 'p':
case 'P':
lexExponent();
break hexLoop;
case '.':
if (foundDot)
break hexLoop;
if (src.canPeek() && src.peek() == '.')
break hexLoop;
nextCharNonLF();
foundDot = true;
current.type = TokenType.doubleLiteral;
break;
default:
break hexLoop;
}
}
setTokenValue();
}
bool lexStringSuffix()
{
current.type = TokenType.stringLiteral;
bool foundSuffix = false;
if (!isEoF())
{
switch (src.front)
{
case 'w':
current.type = TokenType.wstringLiteral;
goto case 'c';
case 'd':
current.type = TokenType.dstringLiteral;
goto case 'c';
case 'c':
foundSuffix = true;
nextCharNonLF();
break;
default:
break;
}
}
return foundSuffix;
}
void lexCharacterLiteral()
in
{
assert (src.front == '\'');
}
body
{
current.type = TokenType.characterLiteral;
nextChar();
if (isEoF())
{
errorMessage("Unterminated character literal");
return;
}
switch (src.front)
{
case '\'':
break;
case '\\':
if (config.tokenStyle & TokenStyle.notEscaped)
skipEscapeSequence();
else
{
// the only special path
// 40 bytes is enough for 2 quotes
// and the longest character entity
ubyte[40] utf8;
size_t len;
if (config.tokenStyle & TokenStyle.includeQuotes)
{
utf8[0] = '\'';
len = decodeEscapeSequence(utf8[1..$]);
utf8[len++] = '\'';
}
else
len = decodeEscapeSequence(utf8[]);
if (src.front != '\'')
{
errorMessage("Expected \"'\" to end character literal");
}
// skip over last "'"
nextChar();
setTokenValue(utf8[0..len]);
return;
}
break;
default:
if (src.front & 0x80)
{
while (src.front & 0x80)
nextChar();
break;
}
else
{
nextChar();
break;
}
}
if (src.front != '\'')
errorMessage("Expected \"'\" to end character literal");
nextChar();
if (config.tokenStyle & TokenStyle.includeQuotes)
setTokenValue();
else
setTokenValue(1, -1);
}
void lexString()
in
{
// no assert here: src.front may be '"' (plain or after r) or '`'
}
body
{
current.type = TokenType.stringLiteral;
bool longWysiwyg = src.slice.length > 0 && src.slice[0] == 'r'; // slice already holds the 2 chars: r"
bool isWysiwyg = src.front == '`';
// appender is used only if the string needs unescaping
Appender!(ubyte[]) unescaped;
auto quote = src.front;
nextChar();
while (true)
{
if (isEoF())
{
errorMessage("Unterminated string literal");
return;
}
else if (src.front == '\\')
{
if (isWysiwyg || longWysiwyg)
nextChar();
else if(config.tokenStyle & TokenStyle.notEscaped)
{
skipEscapeSequence();
}
else
{
if(unescaped == Appender!(ubyte[]).init)
unescaped = appender!(ubyte[])();
unescaped.put(src.slice());
decodeEscapeSequence(unescaped);
src.mark(); //start next slice after escape sequence
}
}
else if (src.front == quote)
{
nextCharNonLF();
break;
}
else
nextChar();
}
lexStringSuffix();
// helper to handle quotes
void setData(R)(R range)
{
if (config.tokenStyle & TokenStyle.includeQuotes)
setTokenValue(range);
else if (longWysiwyg)
setTokenValue(range[2..$-1]);
else
setTokenValue(range[1..$-1]);
}
if(unescaped != Appender!(ubyte[]).init)
{
// append the final slice and use the buffered data
unescaped.put(src.slice);
setData(unescaped.data);
}
else
{
setData(src.slice); //slice directly
}
}
void lexDelimitedString()
in
{
assert(src.front == '"');
}
body
{
current.type = TokenType.stringLiteral;
nextChar();
bool heredoc;
ubyte open;
ubyte close;
switch (src.front)
{
case '[': open = '['; close = ']'; break;
case '{': open = '{'; close = '}'; break;
case '(': open = '('; close = ')'; break;
case '<': open = '<'; close = '>'; break;
default: heredoc = true; break;
}
if (heredoc)
lexHeredocString();
else
lexNormalDelimitedString(open, close);
}
void lexNormalDelimitedString(ubyte open, ubyte close)
in
{
assert(src.slice[0 .. 2] == `q"`);
}
body
{
current.type = TokenType.stringLiteral;
int depth = 1;
nextChar();
while (true)
{
if (isEoF())
{
errorMessage("Unterminated string literal");
break;
}
if (src.front == open)
{
nextChar();
++depth;
}
else if (src.front == close)
{
nextChar();
--depth;
if (depth <= 0)
{
auto r = src.save(); //TODO: allocates for Fwd range
if (r.front == '"')
{
nextChar();
break;
}
else
{
errorMessage("Expected \" after balanced "
~ cast(char) close ~ " but found "
~ cast(char) r.front ~ " instead.");
break;
}
}
}
else
nextChar();
}
if (config.tokenStyle & TokenStyle.includeQuotes)
setTokenValue();
else
setTokenValue(3, -2);
}
void lexHeredocString()
in
{
assert (src.slice.equal("q\""));
}
body
{
typeof(src.slice) ident;
uint newlineBytes;
while (true)
{
if (isEoF())
{
errorMessage("Unterminated string literal");
return;
}
else if (isNewline(src.front))
{
ident = src.slice[2..$];
nextChar();
newlineBytes = cast(uint) (src.slice.length - 2 - ident.length);
break;
}
else if (isSeparating())
{
nextChar();
ident = src.slice[2..$];
nextChar();
newlineBytes = 0;
break;
}
else
{
nextChar();
}
}
while (true)
{
if (isEoF())
{
errorMessage("Unterminated string literal");
break;
}
else if (src.slice.length > ident.length
&& src.slice[$-ident.length .. $].equal(ident))
{
if (src.front == '"')
{
nextChar();
// the suffix is lexed once, after this loop, so hasSuffix stays accurate
break;
}
else
{
errorMessage("Unterminated string literal: " ~ cast(string) src.slice);
break;
}
}
else
nextChar();
}
bool hasSuffix = lexStringSuffix();
if (config.tokenStyle & TokenStyle.includeQuotes)
setTokenValue();
else
{
setTokenValue(cast(int) (2 + newlineBytes + ident.length),
cast(int) (-(ident.length + (hasSuffix ? 2 : 1))));
}
}
void lexTokenString()
in
{
assert (src.front == '{');
}
body
{
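// Token strings (q{...}) are lexed by re-running this lexer over the braced
// contents with IterationStyle.everything and TokenStyle.source, then
// concatenating the raw values of the tokens found.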
current.type = TokenType.stringLiteral;
nextChar();
auto app = appender!(ubyte[])();
if (config.tokenStyle & TokenStyle.includeQuotes)
{
app.put('q');
app.put('{');
}
LexerConfig c = config;
scope (exit) config = c;
config.iterStyle = IterationStyle.everything;
config.tokenStyle = TokenStyle.source;
int depth = 1;
while (!isEoF())
{
advance();
if (current.type == TokenType.lBrace)
++depth;
else if (current.type == TokenType.rBrace)
{
--depth;
if (depth <= 0)
break;
}
app.put(representation(current.value));
}
config = c;
if (config.tokenStyle & TokenStyle.includeQuotes)
{
app.put('}');
}
if (src.empty)
current.type = TokenType.stringLiteral;
else
{
switch (src.front)
{
case 'd':
if (config.tokenStyle & TokenStyle.includeQuotes)
app.put('d');
current.type = TokenType.dstringLiteral;
src.popFront();
break;
case 'w':
if (config.tokenStyle & TokenStyle.includeQuotes)
app.put('w');
current.type = TokenType.wstringLiteral;
src.popFront();
break;
case 'c':
if (config.tokenStyle & TokenStyle.includeQuotes)
app.put('c');
src.popFront();
goto default;
default:
current.type = TokenType.stringLiteral;
break;
}
}
current.value = cast(string) app.data;
}
void lexSpecialTokenSequence()
in
{
assert (src.front == '#');
}
body
{
nextChar();
auto r = src.save();
auto app = appender!(ubyte[])();
app.put('#');
while (true)
{
if (r.isRangeEoF())
{
errorMessage("Found EOF when interpreting special token sequence");
return;
}
else if (isNewline(r.front))
break;
else
{
app.put(r.front);
r.popFront();
}
}
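// Parse the collected "#line" special token sequence with a regex to pull
// out the line number and the optional file name.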
auto m = match((cast(char[]) app.data),
`#line\s+(?P