diff --git a/std/d/lexer.d b/std/d/lexer.d index df9e4e9..5ecef6d 100644 --- a/std/d/lexer.d +++ b/std/d/lexer.d @@ -1,110 +1,110 @@ // Written in the D programming language /** -* This module contains a range-based _lexer for the D programming language. -* -* For performance reasons the _lexer contained in this module operates only on -* ASCII and UTF-8 encoded source code. If the use of other encodings is -* desired, the source code must be converted to UTF-8 before passing it to this -* _lexer. -* -* To use the _lexer, create a LexerConfig struct -* --- -* LexerConfig config; -* config.iterStyle = IterationStyle.everything; -* config.tokenStyle = IterationStyle.source; -* config.versionNumber = 2061; -* config.vendorString = "Lexer Example"; -* --- -* Once you have configured the _lexer, call byToken$(LPAREN)$(RPAREN) on your -* source code, passing in the configuration. -* --- -* auto source = "import std.stdio;"c; -* auto tokens = byToken(source, config); -* --- -* The result of byToken$(LPAREN)$(RPAREN) is a forward range of tokens that can -* be used easily with the algorithms from std.algorithm or iterated over with -* $(D_KEYWORD foreach) -* --- -* assert (tokens.front.type == TokenType.import_); -* assert (tokens.front.value == "import"); -* assert (tokens.front.line == 1); -* assert (tokens.front.startIndex == 0); -* --- -* -* Examples: -* -* Generate HTML markup of D code. -* --- -* module highlighter; -* -* import std.stdio; -* import std.array; -* import std.d.lexer; -* -* void writeSpan(string cssClass, string value) -* { -* stdout.write(``, value.replace("&", "&").replace("<", "<"), ``); -* } -* -* -* // http://ethanschoonover.com/solarized -* void highlight(R)(R tokens) -* { -* stdout.writeln(q"[ -* -*
-* -* -* -* -*]");
-*
-* foreach (Token t; tokens)
-* {
-* if (isType(t.type))
-* writeSpan("type", t.value);
-* else if (isKeyword(t.type))
-* writeSpan("kwrd", t.value);
-* else if (t.type == TokenType.comment)
-* writeSpan("com", t.value);
-* else if (isStringLiteral(t.type))
-* writeSpan("str", t.value);
-* else if (isNumberLiteral(t.type))
-* writeSpan("num", t.value);
-* else if (isOperator(t.type))
-* writeSpan("op", t.value);
-* else
-* stdout.write(t.value.replace("<", "<"));
-* }
-* stdout.writeln("\n");
-* }
-*
-* void main(string[] args)
-* {
-* LexerConfig config;
-* config.tokenStyle = TokenStyle.source;
-* config.iterStyle = IterationStyle.everything;
-* config.fileName = args[1];
-* auto f = File(args[1]);
-* (cast(ubyte[]) f.byLine(KeepTerminator.yes).join()).byToken(config).highlight();
-* }
-* ---
-*
-* Copyright: Brian Schott 2013
-* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0)
-* Authors: Brian Schott
-* Source: $(PHOBOSSRC std/d/_lexer.d)
-*/
+ * This module contains a range-based _lexer for the D programming language.
+ *
+ * For performance reasons the _lexer contained in this module operates only on
+ * ASCII and UTF-8 encoded source code. If the use of other encodings is
+ * desired, the source code must be converted to UTF-8 before passing it to this
+ * _lexer.
+ *
+ * To use the _lexer, create a LexerConfig struct
+ * ---
+ * LexerConfig config;
+ * config.iterStyle = IterationStyle.everything;
+ * config.tokenStyle = TokenStyle.source;
+ * config.versionNumber = 2061;
+ * config.vendorString = "Lexer Example";
+ * ---
+ * Once you have configured the _lexer, call byToken$(LPAREN)$(RPAREN) on your
+ * source code, passing in the configuration.
+ * ---
+ * auto source = "import std.stdio;"c;
+ * auto tokens = byToken(source, config);
+ * ---
+ * The result of byToken$(LPAREN)$(RPAREN) is a forward range of tokens that can
+ * be used easily with the algorithms from std.algorithm or iterated over with
+ * $(D_KEYWORD foreach)
+ * ---
+ * assert (tokens.front.type == TokenType.import_);
+ * assert (tokens.front.value == "import");
+ * assert (tokens.front.line == 1);
+ * assert (tokens.front.startIndex == 0);
+ * ---
+ *
+ * Examples:
+ *
+ * Generate HTML markup of D code.
+ * ---
+ * module highlighter;
+ *
+ * import std.stdio;
+ * import std.array;
+ * import std.d.lexer;
+ *
+ * void writeSpan(string cssClass, string value)
+ * {
+ * stdout.write(`<span class="`, cssClass, `">`, value.replace("&", "&amp;").replace("<", "&lt;"), `</span>`);
+ * }
+ *
+ *
+ * // http://ethanschoonover.com/solarized
+ * void highlight(R)(R tokens)
+ * {
+ * stdout.writeln(q"[
+ *
+ *
+ *
+ *
+ *
+ *
+ * ]");
+ *
+ * foreach (Token t; tokens)
+ * {
+ * if (isType(t.type))
+ * writeSpan("type", t.value);
+ * else if (isKeyword(t.type))
+ * writeSpan("kwrd", t.value);
+ * else if (t.type == TokenType.comment)
+ * writeSpan("com", t.value);
+ * else if (isStringLiteral(t.type))
+ * writeSpan("str", t.value);
+ * else if (isNumberLiteral(t.type))
+ * writeSpan("num", t.value);
+ * else if (isOperator(t.type))
+ * writeSpan("op", t.value);
+ * else
+ * stdout.write(t.value.replace("<", "&lt;"));
+ * }
+ * stdout.writeln("\n");
+ * }
+ *
+ * void main(string[] args)
+ * {
+ * LexerConfig config;
+ * config.tokenStyle = TokenStyle.source;
+ * config.iterStyle = IterationStyle.everything;
+ * config.fileName = args[1];
+ * auto f = File(args[1]);
+ * (cast(ubyte[]) f.byLine(KeepTerminator.yes).join()).byToken(config).highlight();
+ * }
+ * ---
+ *
+ * Copyright: Brian Schott 2013
+ * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0)
+ * Authors: Brian Schott, Dmitry Olshansky
+ * Source: $(PHOBOSSRC std/d/_lexer.d)
+ */
module std.d.lexer;
@@ -125,574 +125,287 @@ version (unittest) import std.stdio;
public:
/**
-* Represents a D token
-*/
+ * Represents a D token
+ */
struct Token
{
- /**
- * The token type.
- */
- TokenType type;
+ /**
+ * The token type.
+ */
+ TokenType type;
- /**
- * The representation of the token in the original source code.
- */
- string value;
+ /**
+ * The representation of the token in the original source code.
+ */
+ string value;
- /**
- * The number of the line the token is on.
- */
- uint line;
+ /**
+ * The number of the line the token is on.
+ */
+ uint line;
- /**
- * The column number of the start of the token in the original source.
- * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
- */
- uint column;
+ /**
+ * The column number of the start of the token in the original source.
+ * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
+ */
+ uint column;
- /**
- * The index of the start of the token in the original source.
- * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
- */
- size_t startIndex;
+ /**
+ * The index of the start of the token in the original source.
+ * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
+ */
+ size_t startIndex;
- /**
- * Check to see if the token is of the same type and has the same string
- * representation as the given token.
- */
- bool opEquals(ref const(Token) other) const
- {
- return other.type == type && other.value == value;
- }
+ /**
+ * Check to see if the token is of the same type and has the same string
+ * representation as the given token.
+ */
+ bool opEquals(ref const(Token) other) const
+ {
+ return other.type == type && other.value == value;
+ }
- /**
- * Checks to see if the token's string representation is equal to the given
- * string.
- */
- bool opEquals(string value) const { return this.value == value; }
+ /**
+ * Checks to see if the token's string representation is equal to the given
+ * string.
+ */
+ bool opEquals(string value) const { return this.value == value; }
- /**
- * Checks to see if the token is of the given type.
- */
- bool opEquals(TokenType type) const { return type == type; }
+    /**
+     * Checks to see if the token is of the given type.
+     */
+    bool opEquals(TokenType type) const { return this.type == type; }
- /**
- * Comparison operator orders tokens by start index.
- */
- int opCmp(ref const(Token) other) const
- {
- if (startIndex < other.startIndex) return -1;
- if (startIndex > other.startIndex) return 1;
- return 0;
- }
+ /**
+ * Comparison operator orders tokens by start index.
+ */
+ int opCmp(ref const(Token) other) const
+ {
+ if (startIndex < other.startIndex) return -1;
+ if (startIndex > other.startIndex) return 1;
+ return 0;
+ }
}
/**
-* Configure the behavior of the byToken() function. These flags may be
-* combined using a bitwise or.
-*/
+ * Configure the behavior of the byToken() function. These flags may be
+ * combined using a bitwise or.
+ */
enum IterationStyle
{
- /// Only include code, not whitespace or comments
- codeOnly = 0,
- /// Includes comments
- includeComments = 0b0001,
- /// Includes whitespace
- includeWhitespace = 0b0010,
- /// Include $(LINK2 http://dlang.org/lex.html#specialtokens, special tokens)
- includeSpecialTokens = 0b0100,
- /// Do not stop iteration on reaching the ___EOF__ token
- ignoreEOF = 0b1000,
- /// Include everything
- everything = includeComments | includeWhitespace | ignoreEOF
+ /// Only include code, not whitespace or comments
+ codeOnly = 0,
+ /// Includes comments
+ includeComments = 0b0001,
+ /// Includes whitespace
+ includeWhitespace = 0b0010,
+ /// Include $(LINK2 http://dlang.org/lex.html#specialtokens, special tokens)
+ includeSpecialTokens = 0b0100,
+ /// Do not stop iteration on reaching the ___EOF__ token
+ ignoreEOF = 0b1000,
+ /// Include _everything
+ everything = includeComments | includeWhitespace | ignoreEOF
}
/**
-* Configuration of the token lexing style. These flags may be combined with a
-* bitwise or.
-*/
+ * Configuration of the token lexing style. These flags may be combined with a
+ * bitwise or.
+ */
enum TokenStyle : uint
{
- /**
- * Escape sequences will be replaced with their equivalent characters,
- * enclosing quote characters will not be included. Special tokens such as
- * __VENDOR__ will be replaced with their equivalent strings. Useful for
- * creating a compiler or interpreter.
- */
- default_ = 0b0000,
+ /**
+ * Escape sequences will be replaced with their equivalent characters,
+ * enclosing quote characters will not be included. Special tokens such as
+ * __VENDOR__ will be replaced with their equivalent strings. Useful for
+ * creating a compiler or interpreter.
+ */
+ default_ = 0b0000,
- /**
- * Escape sequences will not be processed. An escaped quote character will
- * not terminate string lexing, but it will not be replaced with the quote
- * character in the token.
- */
- notEscaped = 0b0001,
+ /**
+ * Escape sequences will not be processed. An escaped quote character will
+ * not terminate string lexing, but it will not be replaced with the quote
+ * character in the token.
+ */
+ notEscaped = 0b0001,
- /**
- * Strings will include their opening and closing quote characters as well
- * as any prefixes or suffixes $(LPAREN)e.g.: $(D_STRING "abcde"w) will
- * include the $(D_STRING 'w') character as well as the opening and closing
- * quotes$(RPAREN)
- */
- includeQuotes = 0b0010,
+ /**
+ * Strings will include their opening and closing quote characters as well
+ * as any prefixes or suffixes $(LPAREN)e.g.: $(D_STRING "abcde"w) will
+ * include the $(D_STRING 'w') character as well as the opening and closing
+ * quotes$(RPAREN)
+ */
+ includeQuotes = 0b0010,
- /**
- * Do not replace the value field of the special tokens such as ___DATE__
- * with their string equivalents.
- */
- doNotReplaceSpecial = 0b0100,
+ /**
+ * Do not replace the value field of the special tokens such as ___DATE__
+ * with their string equivalents.
+ */
+ doNotReplaceSpecial = 0b0100,
- /**
- * Strings will be read exactly as they appeared in the source, including
- * their opening and closing quote characters. Useful for syntax
- * highlighting.
- */
- source = notEscaped | includeQuotes | doNotReplaceSpecial
+ /**
+ * Strings will be read exactly as they appeared in the source, including
+ * their opening and closing quote characters. Useful for syntax
+ * highlighting.
+ */
+ source = notEscaped | includeQuotes | doNotReplaceSpecial
}
/**
-* Lexer configuration
-*/
+ * Lexer configuration
+ */
struct LexerConfig
{
- /**
- * Iteration style
- */
- IterationStyle iterStyle = IterationStyle.codeOnly;
+ /**
+ * Iteration style
+ */
+ IterationStyle iterStyle = IterationStyle.codeOnly;
- /**
- * Token style
- */
- TokenStyle tokenStyle = tokenStyle.default_;
+ /**
+ * Token style
+ */
+ TokenStyle tokenStyle = tokenStyle.default_;
- /**
- * Replacement for the ___VERSION__ token. Defaults to 1.
- */
- uint versionNumber = 100;
+ /**
+ * Replacement for the ___VERSION__ token. Defaults to 100.
+ */
+ uint versionNumber = 100;
- /**
- * Replacement for the ___VENDOR__ token. Defaults to $(D_STRING "std.d.lexer")
- */
- string vendorString = "std.d.lexer";
+ /**
+ * Replacement for the ___VENDOR__ token. Defaults to $(D_STRING "std.d.lexer")
+ */
+ string vendorString = "std.d.lexer";
- /**
- * Name used when creating error messages that are sent to errorFunc. This
- * is needed because the lexer operates on any forwarad range of ASCII
- * characters or UTF-8 code units and does not know what to call its input
- * source. Defaults to the empty string.
- */
- string fileName = "";
+ /**
+ * Name used when creating error messages that are sent to errorFunc. This
+ * is needed because the lexer operates on any forward range of ASCII
+ * characters or UTF-8 code units and does not know what to call its input
+ * source. Defaults to the empty string.
+ */
+ string fileName = "";
- /**
- * This function is called when an error is encountered during lexing.
- * Parameters are file name, code uint index, line number, column,
- * and error messsage.
- */
- void delegate(string, size_t, uint, uint, string) errorFunc;
-
- /**
- * Initial size of the lexer's internal token buffer in bytes. The lexer
- * will grow this buffer if necessary.
- */
- size_t bufferSize = 1024 * 4;
+ /**
+ * This function is called when an error is encountered during lexing.
+ * Parameters are file name, code uint index, line number, column,
+ * and error messsage.
+ */
+ void delegate(string, size_t, uint, uint, string) errorFunc;
}
/**
-* Iterate over the given range of characters by D tokens.
-* Params:
-* range = the range of characters
-* config = the lexer configuration
-* bufferSize = initial size of internal circular buffer
-* Returns:
-* an input range of tokens
-*/
+ * Iterate over the given range of characters by D tokens.
+ * Params:
+ * range = the range of characters
+ * config = the lexer configuration
+ * bufferSize = initial size of internal circular buffer
+ * Returns:
+ * an input range of tokens
+ */
auto byToken(R)(R range, LexerConfig config, size_t bufferSize = 4*1024)
- if (isForwardRange!(R) && !isRandomAccessRange!(R)
- && is(ElementType!R : const(ubyte)))
+ if (isForwardRange!(R) && !isRandomAccessRange!(R)
+ && is(ElementType!R : const(ubyte)))
{
- // 4K of circular buffer by default
- auto r = TokenRange!(typeof(lexerSource(range)))
- (lexerSource(range, bufferSize), config);
- r.config = config;
- r.lineNumber = 1;
- r.popFront();
- return r;
+ // 4K of circular buffer by default
+ auto r = TokenRange!(typeof(lexerSource(range)))
+ (lexerSource(range, bufferSize), config);
+ r.config = config;
+ r.lineNumber = 1;
+ r.popFront();
+ return r;
}
///ditto
auto byToken(R)(R range, LexerConfig config)
- if (isRandomAccessRange!(R) && is(ElementType!R : const(ubyte)))
+ if (isRandomAccessRange!(R) && is(ElementType!R : const(ubyte)))
{
- auto r = TokenRange!(typeof(lexerSource(range)))
- (lexerSource(range), config);
- r.config = config;
- r.lineNumber = 1;
- r.popFront();
- return r;
+ auto r = TokenRange!(typeof(lexerSource(range)))
+ (lexerSource(range), config);
+ r.config = config;
+ r.lineNumber = 1;
+ r.popFront();
+ return r;
}
-// For now a private helper that is tailored to the way lexer works
-// hides away forwardness of range by buffering
-// RA-version is strightforward thin wrapping
-// ATM it is byte-oriented
-private struct LexSource(R)
- if(isForwardRange!R && !isRandomAccessRange!R)
- {
- bool empty() const { return _empty; }
-
- auto ref front() const
- {
- return accum[accumIdx];
- }
-
- auto ref peek() const
- in
- {
- assert (accumIdx + 1 < accum.length);
- }
- body
- {
- return accum[accumIdx + 1];
- }
-
- void popFront()
- {
- ++_index;
- range.popFront();
- // if that was last byte
- // just advance so that open-righted slice just works
- accumIdx = (accumIdx+1) & mask;
- if(range.empty)
- {
- _empty = true;
- return;
- }
- if(accumIdx == savedAccumIdx)
- {
- // and move stuff around
- auto oldLen = accum.length;
- auto toCopy = oldLen - accumIdx;
- accum.length *= 2; // keep pow of 2
- // copy starting with last item
- copy(retro(accum[accumIdx..oldLen]),
- retro(accum[$-toCopy..$]));
- savedAccumIdx = accum.length - toCopy;
- }
- accum[accumIdx] = range.front;
- }
-
- auto save()
- {
- typeof(this) copy = this;
- copy.range = range.save;
- // sadly need to dup circular buffer, as it overwrites items
- copy.accum = copy.accum.dup;
- return copy;
- }
-
- // mark a position to slice from later on
- size_t mark()
- {
- savedAccumIdx = accumIdx;
- return accumIdx;
- }
-
- // slice to current position from previously marked position
- auto slice() @property
- {
- // it's an open right range as usual
- return CircularRange(accum, savedAccumIdx, accumIdx);
- }
-
- size_t index() const @property
- {
- return _index;
- }
-
-private:
- this(R src, size_t bufferSize)
- {
- range = src;
- assert(bufferSize > 0);
- assert((bufferSize & (bufferSize-1)) == 0); //is power of 2
- accum = new ubyte[bufferSize];
- if(range.empty)
- _empty = true;
- else
- accum[accumIdx] = range.front; // load front
- }
-
- // a true RA-range of ubyte
- struct CircularRange
- {
- this(ubyte[] buf, size_t s, size_t e)
- {
- assert((buffer.length & (buffer.length-1)) == 0);
- buffer = buf;
- start = s;
- end = e;
- }
- //Forward range primitives
- @property bool empty() const { return start == end; }
- @property auto ref front() const { return buffer[start]; }
- void popFront() { start = (start + 1) & mask; }
- @property auto save() { return this; }
-
- //Backwards is a bit slower, but should be rarely used (if at all)
- @property ref back(){ return buffer[(end-1) & mask]; }
- void popBack() { end = (end - 1) & mask; }
-
- // RA range primitives
- ref opIndex(size_t idx){ return buffer[(start+idx) & mask]; }
- @property size_t length()
- {
- return end < start ? end + buffer.length -start : end - start;
- }
- alias length opDollar;
-
- auto opSlice(size_t newStart, size_t newEnd)
- {
- size_t maskedStart = (start+newStart) & mask;
- size_t maskedEnd = (start+newEnd) & mask;
- return typeof(this)(buffer, maskedStart, maskedEnd);
- }
- // @@@bug fwd-ref in ldc0.10 (if placed above previous one)
- auto opSlice(){ return opSlice(0, length); }
- private:
- @property auto mask(){ return buffer.length-1; }
- size_t start, end;
- ubyte[] buffer;
- }
-
- @property auto mask(){ return accum.length-1; }
-
- R range;
- bool _empty;
- ubyte[] accum; // accumulator buffer for non-RA ranges
- size_t savedAccumIdx;
- size_t accumIdx; // current index in accumulator
- size_t _index; // index of current element in original range
-}
-
-// TODO: make sure it's RandomAccess later
-/*static assert(isRandomAccessRange!(
- LexSource!(typeof(filter!"true"(cast(ubyte[])null)))
- .CircularRange)
-);*/
-
-//trivial pass-through for RA ranges
-private struct LexSource(R)
- if(isRandomAccessRange!R)
-{
- bool empty() const @property { return cur >= range.length; }
- bool canPeek() const { return cur + 1 < range.length; }
- auto ref front() const @property { return range[cur]; }
- void popFront(){ cur++; }
-
- auto ref peek() const
- in
- {
- assert (canPeek());
- }
- body
- {
- return range[cur + 1];
- }
-
- auto save()
- {
- typeof(this) copy = this;
- copy.range = range.save;
- return copy;
- }
-
- auto mark()
- {
- saved = cur;
- }
-
- // use the underliying range slicing capability
- auto slice() @property
- {
- return range[saved..cur];
- }
-
- size_t index() const @property
- {
- return cur;
- }
-
-private:
- this(R src)
- {
- range = src;
- }
- size_t cur, saved;
- R range;
-}
-
-auto lexerSource(Range)(Range range, size_t bufSize=8)
- if(isForwardRange!Range && !isRandomAccessRange!Range
- && is(ElementType!Range : const(ubyte)))
-{
- return LexSource!(Range)(range, bufSize);
-}
-
-auto lexerSource(Range)(Range range)
- if(isRandomAccessRange!Range
- && is(ElementType!Range : const(ubyte)))
-{
- return LexSource!(Range)(range);
-}
-
-unittest
-{
- // test the basic functionality of a "mark-slice" range
- import std.string, std.stdio;
-
- static void test_hello(T)(T lexs)
- {
- assert(lexs.front == 'H');
- lexs.popFront();
- assert(lexs.front == 'e');
- foreach(i; 0..2)
- {
- auto saved = lexs.save;
- lexs.mark();
- assert(lexs.slice.equal(""));
- lexs.popFront();
- assert(lexs.slice.equal("e"), text(cast(char)lexs.front));
- lexs.popFrontN(4);
- auto bytes = lexs.slice.map!"cast(char)a".array();
- assert(bytes.equal("ello,"), bytes.to!string);
- lexs.mark();
- assert(lexs.slice.equal(""));
- assert(lexs.front == 'w');
- lexs.popFrontN(6);
- assert(lexs.empty);
- auto s = lexs.slice();
- auto msg = s.save.map!"cast(char)a".array;
- assert(s[].equal("world!"), msg);
- assert(s[2..$-1].equal("rld"), msg);
- assert(s[0] == 'w' && s[$-1] == '!');
- s.popFront();
- assert(s.front == 'o' && s.back == '!');
- s.popBack();
- assert(s.front == 'o' && s.back == 'd');
- //restore and repeat again
- lexs = saved;
- }
- }
-
- static void test_empty(T)(T lexs)
- {
- assert(lexs.empty);
- lexs.mark();
- assert(lexs.slice().equal(""));
- }
-
- auto fwdLex = lexerSource(
- "Hello, world!"
- .representation
- .filter!"a != ' '", 16 // and the one that is more then enough
- );
- test_hello(fwdLex);
- fwdLex = lexerSource(
- "Hello, world!"
- .representation
- .filter!"a != ' '", 1 // try the smallest initial buffer
- );
- test_hello(fwdLex);
- fwdLex = lexerSource("".representation.filter!"a != ' '");
- auto raLex = lexerSource("".representation);
- test_empty(raLex);
- test_empty(fwdLex);
- raLex = lexerSource("Hello,world!".representation);
- test_hello(raLex);
-}
-
-
/**
-* Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate.
-*/
+ * Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate.
+ */
struct TokenRange(LexSrc)
- //if ( is(LexSrc : LexSource!(U...), U...)) //check for LexSource
+ //if ( is(LexSrc : LexSource!(U...), U...)) //check for LexSource
{
- /**
- * Returns: true if the range is empty
- */
- bool empty() const @property
- {
- return _empty;
- }
+ /**
+ * Returns: true if the range is empty
+ */
+ bool empty() const @property
+ {
+ return _empty;
+ }
- /**
- * Returns: the current token
- */
- ref const(Token) front() const @property
- {
- assert(!empty, "trying to get front of an empty token range");
- return current;
- }
+ /**
+ * Returns: the current token
+ */
+ ref const(Token) front() const @property
+ {
+ assert(!empty, "trying to get front of an empty token range");
+ return current;
+ }
- /**
- * Returns the current token and then removes it from the range
- */
- Token moveFront()
- {
- auto r = move(current);
- popFront();
- return r;
- }
+ /**
+ * Returns the current token and then removes it from the range
+ */
+ Token moveFront()
+ {
+ auto r = move(current);
+ popFront();
+ return r;
+ }
- /**
- * Foreach operation
- */
- int opApply(int delegate(Token) dg)
- {
- int result = 0;
- while (!empty)
- {
- result = dg(front);
- if (result)
- break;
- popFront();
- }
- return result;
- }
+ /**
+ * Foreach operation
+ */
+ int opApply(int delegate(Token) dg)
+ {
+ int result = 0;
+ while (!empty)
+ {
+ result = dg(front);
+ if (result)
+ break;
+ popFront();
+ }
+ return result;
+ }
- /**
- * Foreach operation
- */
- int opApply(int delegate(size_t, Token) dg)
- {
- int result = 0;
- int i = 0;
- while (!empty)
- {
- result = dg(i, front);
- if (result)
- break;
- popFront();
- }
- return result;
- }
+ /**
+ * Foreach operation
+ */
+ int opApply(int delegate(size_t, Token) dg)
+ {
+ int result = 0;
+ int i = 0;
+ while (!empty)
+ {
+ result = dg(i, front);
+ if (result)
+ break;
+ popFront();
+ }
+ return result;
+ }
- /**
- * Removes the current token from the range
- */
- void popFront()
- {
+ /**
+ * Removes the current token from the range
+ */
+ void popFront()
+ {
advance();
- }
+ }
private:
- /*
- * Advances the range to the next token
- */
- void advance()
- {
+ /*
+ * Advances the range to the next token
+ */
+ void advance()
+ {
L_advance:
if (src.empty)
{
@@ -700,185 +413,185 @@ L_advance:
return;
}
src.mark(); // mark a start of a lexing "frame"
- current.line = lineNumber;
- current.startIndex = src.index;
- current.column = column;
- current.value = null;
- switch (src.front)
- {
+ current.line = lineNumber;
+ current.startIndex = src.index;
+ current.column = column;
+ current.value = null;
+ switch (src.front)
+ {
// handle sentenels for end of input
- case 0:
+ case 0:
case 0x1a:
- // TODO: check config flags, it's cheap
- // since this branch at most is taken once per file
+ // TODO: check config flags, it's cheap
+ // since this branch at most is taken once per file
_empty = true;
- return;
+ return;
// pragma(msg, generateCaseTrie(
- mixin(generateCaseTrie(
- "=", "TokenType.assign",
- "@", "TokenType.at",
- "&", "TokenType.bitAnd",
- "&=", "TokenType.bitAndEquals",
- "|", "TokenType.bitOr",
- "|=", "TokenType.bitOrEquals",
- "~=", "TokenType.catEquals",
- ":", "TokenType.colon",
- ",", "TokenType.comma",
- "--", "TokenType.decrement",
- "$", "TokenType.dollar",
- "==", "TokenType.equals",
- "=>", "TokenType.goesTo",
- ">", "TokenType.greater",
- ">=", "TokenType.greaterEqual",
- "++", "TokenType.increment",
- "{", "TokenType.lBrace",
- "[", "TokenType.lBracket",
- "<", "TokenType.less",
- "<=", "TokenType.lessEqual",
- "<>=", "TokenType.lessEqualGreater",
- "<>", "TokenType.lessOrGreater",
- "&&", "TokenType.logicAnd",
- "||", "TokenType.logicOr",
- "(", "TokenType.lParen",
- "-", "TokenType.minus",
- "-=", "TokenType.minusEquals",
- "%", "TokenType.mod",
- "%=", "TokenType.modEquals",
- "*=", "TokenType.mulEquals",
- "!", "TokenType.not",
- "!=", "TokenType.notEquals",
- "!>", "TokenType.notGreater",
- "!>=", "TokenType.notGreaterEqual",
- "!<", "TokenType.notLess",
- "!<=", "TokenType.notLessEqual",
- "!<>", "TokenType.notLessEqualGreater",
- "+", "TokenType.plus",
- "+=", "TokenType.plusEquals",
- "^^", "TokenType.pow",
- "^^=", "TokenType.powEquals",
- "}", "TokenType.rBrace",
- "]", "TokenType.rBracket",
- ")", "TokenType.rParen",
- ";", "TokenType.semicolon",
- "<<", "TokenType.shiftLeft",
- "<<=", "TokenType.shiftLeftEqual",
- ">>", "TokenType.shiftRight",
- ">>=", "TokenType.shiftRightEqual",
- "*", "TokenType.star",
- "?", "TokenType.ternary",
- "~", "TokenType.tilde",
- "!<>=", "TokenType.unordered",
- ">>>", "TokenType.unsignedShiftRight",
- ">>>=", "TokenType.unsignedShiftRightEqual",
- "^", "TokenType.xor",
- "^=", "TokenType.xorEquals",
- ));
- case '/':
- nextCharNonLF();
- if (isEoF())
- {
- current.type = TokenType.div;
- current.value = "/";
- return;
- }
- switch (src.front)
- {
- case '/':
- case '*':
- case '+':
- if (config.iterStyle & IterationStyle.includeComments)
- return lexComment!true();
+ mixin(generateCaseTrie(
+ "=", "TokenType.assign",
+ "@", "TokenType.at",
+ "&", "TokenType.bitAnd",
+ "&=", "TokenType.bitAndEquals",
+ "|", "TokenType.bitOr",
+ "|=", "TokenType.bitOrEquals",
+ "~=", "TokenType.catEquals",
+ ":", "TokenType.colon",
+ ",", "TokenType.comma",
+ "--", "TokenType.decrement",
+ "$", "TokenType.dollar",
+ "==", "TokenType.equals",
+ "=>", "TokenType.goesTo",
+ ">", "TokenType.greater",
+ ">=", "TokenType.greaterEqual",
+ "++", "TokenType.increment",
+ "{", "TokenType.lBrace",
+ "[", "TokenType.lBracket",
+ "<", "TokenType.less",
+ "<=", "TokenType.lessEqual",
+ "<>=", "TokenType.lessEqualGreater",
+ "<>", "TokenType.lessOrGreater",
+ "&&", "TokenType.logicAnd",
+ "||", "TokenType.logicOr",
+ "(", "TokenType.lParen",
+ "-", "TokenType.minus",
+ "-=", "TokenType.minusEquals",
+ "%", "TokenType.mod",
+ "%=", "TokenType.modEquals",
+ "*=", "TokenType.mulEquals",
+ "!", "TokenType.not",
+ "!=", "TokenType.notEquals",
+ "!>", "TokenType.notGreater",
+ "!>=", "TokenType.notGreaterEqual",
+ "!<", "TokenType.notLess",
+ "!<=", "TokenType.notLessEqual",
+ "!<>", "TokenType.notLessEqualGreater",
+ "+", "TokenType.plus",
+ "+=", "TokenType.plusEquals",
+ "^^", "TokenType.pow",
+ "^^=", "TokenType.powEquals",
+ "}", "TokenType.rBrace",
+ "]", "TokenType.rBracket",
+ ")", "TokenType.rParen",
+ ";", "TokenType.semicolon",
+ "<<", "TokenType.shiftLeft",
+ "<<=", "TokenType.shiftLeftEqual",
+ ">>", "TokenType.shiftRight",
+ ">>=", "TokenType.shiftRightEqual",
+ "*", "TokenType.star",
+ "?", "TokenType.ternary",
+ "~", "TokenType.tilde",
+ "!<>=", "TokenType.unordered",
+ ">>>", "TokenType.unsignedShiftRight",
+ ">>>=", "TokenType.unsignedShiftRightEqual",
+ "^", "TokenType.xor",
+ "^=", "TokenType.xorEquals",
+ ));
+ case '/':
+ nextCharNonLF();
+ if (isEoF())
+ {
+ current.type = TokenType.div;
+ current.value = "/";
+ return;
+ }
+ switch (src.front)
+ {
+ case '/':
+ case '*':
+ case '+':
+ if (config.iterStyle & IterationStyle.includeComments)
+ return lexComment!true();
lexComment!false();
goto L_advance; // tail-recursion
-
- case '=':
- current.type = TokenType.divEquals;
- current.value = "/=";
- src.popFront();
- return;
- default:
- current.type = TokenType.div;
- current.value = "/";
- return;
- }
- case '.':
- if (!src.canPeek())
- {
- current.type = TokenType.dot;
- current.value = getTokenValue(TokenType.dot);
- return;
- }
- switch (src.peek())
- {
- case '0': .. case '9':
- lexNumber();
- return;
- case '.':
- nextCharNonLF();
- nextCharNonLF();
- current.type = TokenType.slice;
- if (src.front == '.')
- {
- current.type = TokenType.vararg;
- nextCharNonLF();
- }
- current.value = getTokenValue(current.type);
- return;
- default:
- nextCharNonLF();
- current.type = TokenType.dot;
- current.value = getTokenValue(TokenType.dot);
- return;
- }
- case '0': .. case '9':
- lexNumber();
- return;
- case '\'':
- lexCharacterLiteral();
- return;
- case '"':
- case '`':
- lexString();
- return;
- case 'q':
- nextCharNonLF();
- if (isEoF())
- goto default;
- switch (src.front)
- {
- case '{':
- lexTokenString();
- return;
- case '"':
- lexDelimitedString();
- return;
- default:
- break;
- }
- goto default;
- case 'r':
- nextCharNonLF();
- if (isEoF())
- goto default;
- else if (src.front == '"')
- {
- lexString();
- return;
- }
- else
- goto default;
- case 'x':
- nextCharNonLF();
- if (isEoF())
- goto default;
- else if (src.front == '"')
- {
- lexHexString();
- return;
- }
- else
- goto default;
- case '#':
+
+ case '=':
+ current.type = TokenType.divEquals;
+ current.value = "/=";
+ src.popFront();
+ return;
+ default:
+ current.type = TokenType.div;
+ current.value = "/";
+ return;
+ }
+ case '.':
+ if (!src.canPeek())
+ {
+ current.type = TokenType.dot;
+ current.value = getTokenValue(TokenType.dot);
+ return;
+ }
+ switch (src.peek())
+ {
+ case '0': .. case '9':
+ lexNumber();
+ return;
+ case '.':
+ nextCharNonLF();
+ nextCharNonLF();
+ current.type = TokenType.slice;
+ if (src.front == '.')
+ {
+ current.type = TokenType.vararg;
+ nextCharNonLF();
+ }
+ current.value = getTokenValue(current.type);
+ return;
+ default:
+ nextCharNonLF();
+ current.type = TokenType.dot;
+ current.value = getTokenValue(TokenType.dot);
+ return;
+ }
+ case '0': .. case '9':
+ lexNumber();
+ return;
+ case '\'':
+ lexCharacterLiteral();
+ return;
+ case '"':
+ case '`':
+ lexString();
+ return;
+ case 'q':
+ nextCharNonLF();
+ if (isEoF())
+ goto default;
+ switch (src.front)
+ {
+ case '{':
+ lexTokenString();
+ return;
+ case '"':
+ lexDelimitedString();
+ return;
+ default:
+ break;
+ }
+ goto default;
+ case 'r':
+ nextCharNonLF();
+ if (isEoF())
+ goto default;
+ else if (src.front == '"')
+ {
+ lexString();
+ return;
+ }
+ else
+ goto default;
+ case 'x':
+ nextCharNonLF();
+ if (isEoF())
+ goto default;
+ else if (src.front == '"')
+ {
+ lexHexString();
+ return;
+ }
+ else
+ goto default;
+ case '#':
lexSpecialTokenSequence();
if(config.iterStyle & IterationStyle.includeSpecialTokens)
return;
@@ -889,1189 +602,1189 @@ L_advance:
if (config.iterStyle & IterationStyle.includeWhitespace)
return lexWhitespace!true();
lexWhitespace!false();
- goto L_advance; // tail-recursion
- default:
+ goto L_advance; // tail-recursion
+ default:
if ((src.front & 0x80) && isLongWhite())
- {
+ {
if (config.iterStyle & IterationStyle.includeWhitespace)
return lexWhitespace!true();
lexWhitespace!false();
goto L_advance; // tail-recursion
}
- for(;;)
- {
+ for(;;)
+ {
if(isSeparating())
break;
- nextCharNonLF();
+ nextCharNonLF();
if(isEoF())
break;
- }
+ }
- current.type = lookupTokenType(src.slice);
- current.value = getTokenValue(current.type);
- if (current.value is null)
- setTokenValue();
+ current.type = lookupTokenType(src.slice);
+ current.value = getTokenValue(current.type);
+ if (current.value is null)
+ setTokenValue();
- if (!(config.iterStyle & IterationStyle.ignoreEOF) && current.type == TokenType.eof)
- {
- _empty = true;
- return;
- }
+ if (!(config.iterStyle & IterationStyle.ignoreEOF) && current.type == TokenType.eof)
+ {
+ _empty = true;
+ return;
+ }
- if (config.iterStyle & TokenStyle.doNotReplaceSpecial)
- return;
+ if (config.iterStyle & TokenStyle.doNotReplaceSpecial)
+ return;
expandSpecialToken();
- }
- }
+ }
+ }
- // TODO: LexSource could be improved for forward ranges
- // to avoid buffering at all (by disabling it for a moment)
- // so keep the 'keep' parameter here and elsewhere
- void lexWhitespace(bool keep)()
- {
- current.type = TokenType.whitespace;
- do
- {
- nextChar();
- }while (!isEoF() && isWhite());
- static if (keep) setTokenValue();
- }
+ // TODO: LexSource could be improved for forward ranges
+ // to avoid buffering at all (by disabling it for a moment)
+ // so keep the 'keep' parameter here and elsewhere
+ void lexWhitespace(bool keep)()
+ {
+ current.type = TokenType.whitespace;
+ do
+ {
+ nextChar();
+ } while (!isEoF() && isWhite());
+ static if (keep) setTokenValue();
+ }
- void lexComment(bool keep)()
- in
- {
- assert (src.front == '/' || src.front == '*' || src.front == '+');
- }
- body
- {
- current.type = TokenType.comment;
- switch(src.front)
- {
- case '/':
- while (!isEoF() && !isNewline(src.front))
- {
- nextCharNonLF();
- }
- break;
- case '*':
- while (!isEoF())
- {
- if (src.front == '*')
- {
- static if (keep) nextCharNonLF();
- else src.popFront();
- if (src.front == '/')
- {
- nextCharNonLF();
- break;
- }
- }
- else
- nextChar();
- }
- break;
- case '+':
- int depth = 1;
- while (depth > 0 && !isEoF())
- {
- if (src.front == '+')
- {
- nextCharNonLF();
- if (src.front == '/')
- {
- nextCharNonLF();
- --depth;
- }
- }
- else if (src.front == '/')
- {
- nextCharNonLF();
- if (src.front == '+')
- {
- nextCharNonLF();
- ++depth;
- }
- }
- else
- nextChar();
- }
- break;
- default:
- assert(false);
- }
- static if (keep)
- setTokenValue();
- }
+ void lexComment(bool keep)()
+ in
+ {
+ assert (src.front == '/' || src.front == '*' || src.front == '+');
+ }
+ body
+ {
+ current.type = TokenType.comment;
+ switch(src.front)
+ {
+ case '/':
+ while (!isEoF() && !isNewline(src.front))
+ {
+ nextCharNonLF();
+ }
+ break;
+ case '*':
+ while (!isEoF())
+ {
+ if (src.front == '*')
+ {
+ static if (keep) nextCharNonLF();
+ else src.popFront();
+ if (src.front == '/')
+ {
+ nextCharNonLF();
+ break;
+ }
+ }
+ else
+ nextChar();
+ }
+ break;
+ case '+':
+ int depth = 1;
+ while (depth > 0 && !isEoF())
+ {
+ if (src.front == '+')
+ {
+ nextCharNonLF();
+ if (src.front == '/')
+ {
+ nextCharNonLF();
+ --depth;
+ }
+ }
+ else if (src.front == '/')
+ {
+ nextCharNonLF();
+ if (src.front == '+')
+ {
+ nextCharNonLF();
+ ++depth;
+ }
+ }
+ else
+ nextChar();
+ }
+ break;
+ default:
+ assert(false);
+ }
+ static if (keep)
+ setTokenValue();
+ }
- void lexHexString()
- in
- {
- assert (src.front == '"');
- }
- body
- {
- current.type = TokenType.stringLiteral;
- nextChar();
- while (true)
- {
- if (isEoF())
- {
- errorMessage("Unterminated hex string literal");
- return;
- }
- else if (isHexDigit(src.front))
- {
- nextCharNonLF();
- }
- else if (isWhite() && (config.tokenStyle & TokenStyle.notEscaped))
- {
- nextChar();
- }
- else if (src.front == '"')
- {
- nextCharNonLF();
- break;
- }
- else
- {
- errorMessage(format("Invalid character '%s' in hex string literal",
- cast(char) src.front));
- return;
- }
- }
- bool hasSuffix = lexStringSuffix();
- if (config.tokenStyle & TokenStyle.notEscaped)
- {
- if (config.tokenStyle & TokenStyle.includeQuotes)
- setTokenValue();
- else
- setTokenValue(2, hasSuffix ? -2 : -1);
- }
- else
- {
- // TODO: appender is an allocation happy fat pig
- // remove it later
- auto a = appender!(char[])();
- foreach (b; std.range.chunks(src.slice[2 .. $ - 1], 2))
- {
- auto s = cast(char[])b;
- ubyte ch = cast(ubyte)parse!uint(s, 16);
- a.put(ch);
- }
- // can safely assume ownership of data
- current.value = cast(string)a.data;
- }
- }
+ void lexHexString()
+ in
+ {
+ assert (src.front == '"');
+ }
+ body
+ {
+ current.type = TokenType.stringLiteral;
+ nextChar();
+ while (true)
+ {
+ if (isEoF())
+ {
+ errorMessage("Unterminated hex string literal");
+ return;
+ }
+ else if (isHexDigit(src.front))
+ {
+ nextCharNonLF();
+ }
+ else if (isWhite() && (config.tokenStyle & TokenStyle.notEscaped))
+ {
+ nextChar();
+ }
+ else if (src.front == '"')
+ {
+ nextCharNonLF();
+ break;
+ }
+ else
+ {
+ errorMessage(format("Invalid character '%s' in hex string literal",
+ cast(char) src.front));
+ return;
+ }
+ }
+ bool hasSuffix = lexStringSuffix();
+ if (config.tokenStyle & TokenStyle.notEscaped)
+ {
+ if (config.tokenStyle & TokenStyle.includeQuotes)
+ setTokenValue();
+ else
+ setTokenValue(2, hasSuffix ? -2 : -1);
+ }
+ else
+ {
+ // TODO: appender is an allocation happy fat pig
+ // remove it later
+ auto a = appender!(char[])();
+ foreach (b; std.range.chunks(src.slice[2 .. $ - 1], 2))
+ {
+ auto s = cast(char[])b;
+ ubyte ch = cast(ubyte)parse!uint(s, 16);
+ a.put(ch);
+ }
+ // can safely assume ownership of data
+ current.value = cast(string)a.data;
+ }
+ }
- void lexNumber()
- in
- {
- assert(isDigit(src.front) || src.front == '.');
- }
- body
- {
- if (src.front != '0')
- {
- lexDecimal();
- return;
- }
- else
- {
- switch (src.peek())
- {
- case 'x':
- case 'X':
- nextCharNonLF();
- nextCharNonLF();
- lexHex();
- break;
- case 'b':
- case 'B':
- nextCharNonLF();
- nextCharNonLF();
- lexBinary();
- break;
- default:
- lexDecimal();
- break;
- }
- }
- }
+ void lexNumber()
+ in
+ {
+ assert(isDigit(src.front) || src.front == '.');
+ }
+ body
+ {
+ if (src.front != '0')
+ {
+ lexDecimal();
+ return;
+ }
+ else
+ {
+ switch (src.peek())
+ {
+ case 'x':
+ case 'X':
+ nextCharNonLF();
+ nextCharNonLF();
+ lexHex();
+ break;
+ case 'b':
+ case 'B':
+ nextCharNonLF();
+ nextCharNonLF();
+ lexBinary();
+ break;
+ default:
+ lexDecimal();
+ break;
+ }
+ }
+ }
- void lexFloatSuffix()
- {
- switch (src.front)
- {
- case 'L':
- nextCharNonLF();
- current.type = TokenType.doubleLiteral;
- break;
- case 'f':
- case 'F':
- nextCharNonLF();
- current.type = TokenType.floatLiteral;
- break;
- default:
- break;
- }
- if (!isEoF() && src.front == 'i')
- {
- nextCharNonLF();
- if (current.type == TokenType.floatLiteral)
- current.type = TokenType.ifloatLiteral;
- else
- current.type = TokenType.idoubleLiteral;
- }
- }
+ void lexFloatSuffix()
+ {
+ switch (src.front)
+ {
+ case 'L':
+ nextCharNonLF();
+ current.type = TokenType.doubleLiteral;
+ break;
+ case 'f':
+ case 'F':
+ nextCharNonLF();
+ current.type = TokenType.floatLiteral;
+ break;
+ default:
+ break;
+ }
+ if (!isEoF() && src.front == 'i')
+ {
+ nextCharNonLF();
+ if (current.type == TokenType.floatLiteral)
+ current.type = TokenType.ifloatLiteral;
+ else
+ current.type = TokenType.idoubleLiteral;
+ }
+ }
- void lexIntSuffix()
- {
- bool foundU;
- bool foundL;
- while (!isEoF())
- {
- switch (src.front)
- {
- case 'u':
- case 'U':
- if (foundU)
- return;
- switch (current.type)
- {
- case TokenType.intLiteral:
- current.type = TokenType.uintLiteral;
- nextCharNonLF();
- break;
- case TokenType.longLiteral:
- current.type = TokenType.ulongLiteral;
- nextCharNonLF();
- break;
- default:
- assert (false);
- }
- foundU = true;
- break;
- case 'L':
- if (foundL)
- return;
- switch (current.type)
- {
- case TokenType.intLiteral:
- current.type = TokenType.longLiteral;
- nextCharNonLF();
- break;
- case TokenType.uintLiteral:
- current.type = TokenType.ulongLiteral;
- nextCharNonLF();
- break;
- default:
- assert (false);
- }
- foundL = true;
- break;
- default:
- return;
- }
- }
- }
+ void lexIntSuffix()
+ {
+ bool foundU;
+ bool foundL;
+ while (!isEoF())
+ {
+ switch (src.front)
+ {
+ case 'u':
+ case 'U':
+ if (foundU)
+ return;
+ switch (current.type)
+ {
+ case TokenType.intLiteral:
+ current.type = TokenType.uintLiteral;
+ nextCharNonLF();
+ break;
+ case TokenType.longLiteral:
+ current.type = TokenType.ulongLiteral;
+ nextCharNonLF();
+ break;
+ default:
+ assert (false);
+ }
+ foundU = true;
+ break;
+ case 'L':
+ if (foundL)
+ return;
+ switch (current.type)
+ {
+ case TokenType.intLiteral:
+ current.type = TokenType.longLiteral;
+ nextCharNonLF();
+ break;
+ case TokenType.uintLiteral:
+ current.type = TokenType.ulongLiteral;
+ nextCharNonLF();
+ break;
+ default:
+ assert (false);
+ }
+ foundL = true;
+ break;
+ default:
+ return;
+ }
+ }
+ }
- void lexExponent()
- in
- {
- assert (src.front == 'e' || src.front == 'E' || src.front == 'p'
- || src.front == 'P');
- }
- body
- {
- nextCharNonLF();
- bool foundSign = false;
- bool foundDigit = false;
- while (!isEoF())
- {
- switch (src.front)
- {
- case '-':
- case '+':
- if (foundSign)
- {
- if (!foundDigit)
- errorMessage("Expected an exponent");
- return;
- }
- foundSign = true;
- nextCharNonLF();
- break;
- case '0': .. case '9':
- case '_':
- foundDigit = true;
- nextCharNonLF();
- break;
- case 'L':
- case 'f':
- case 'F':
- case 'i':
- lexFloatSuffix();
- return;
- default:
- if (!foundDigit)
- errorMessage("Expected an exponent");
- return;
- }
- }
- }
+ void lexExponent()
+ in
+ {
+ assert (src.front == 'e' || src.front == 'E' || src.front == 'p'
+ || src.front == 'P');
+ }
+ body
+ {
+ nextCharNonLF();
+ bool foundSign = false;
+ bool foundDigit = false;
+ while (!isEoF())
+ {
+ switch (src.front)
+ {
+ case '-':
+ case '+':
+ if (foundSign)
+ {
+ if (!foundDigit)
+ errorMessage("Expected an exponent");
+ return;
+ }
+ foundSign = true;
+ nextCharNonLF();
+ break;
+ case '0': .. case '9':
+ case '_':
+ foundDigit = true;
+ nextCharNonLF();
+ break;
+ case 'L':
+ case 'f':
+ case 'F':
+ case 'i':
+ lexFloatSuffix();
+ return;
+ default:
+ if (!foundDigit)
+ errorMessage("Expected an exponent");
+ return;
+ }
+ }
+ }
- void lexDecimal()
- in
- {
- assert (isDigit(src.front) || src.front == '.');
- }
- body
- {
- bool foundDot = src.front == '.';
- if (foundDot)
- nextCharNonLF();
- current.type = TokenType.intLiteral;
- decimalLoop: while (!isEoF())
- {
- switch (src.front)
- {
- case '0': .. case '9':
- case '_':
- nextCharNonLF();
- break;
- case 'u':
- case 'U':
- if (!foundDot)
- lexIntSuffix();
- break decimalLoop;
- case 'i':
- lexFloatSuffix();
- break decimalLoop;
- case 'L':
- if (foundDot)
- lexFloatSuffix();
- else
- lexIntSuffix();
- break decimalLoop;
- case 'f':
- case 'F':
- lexFloatSuffix();
- break decimalLoop;
- case 'e':
- case 'E':
- lexExponent();
- break decimalLoop;
- case '.':
- if (foundDot)
- break decimalLoop;
- if (src.canPeek() && src.peek() == '.')
- break decimalLoop;
- nextCharNonLF();
- foundDot = true;
- current.type = TokenType.doubleLiteral;
- break;
- default:
- break decimalLoop;
- }
- }
- setTokenValue();
- }
+ void lexDecimal()
+ in
+ {
+ assert (isDigit(src.front) || src.front == '.');
+ }
+ body
+ {
+ bool foundDot = src.front == '.';
+ if (foundDot)
+ nextCharNonLF();
+ current.type = TokenType.intLiteral;
+ decimalLoop: while (!isEoF())
+ {
+ switch (src.front)
+ {
+ case '0': .. case '9':
+ case '_':
+ nextCharNonLF();
+ break;
+ case 'u':
+ case 'U':
+ if (!foundDot)
+ lexIntSuffix();
+ break decimalLoop;
+ case 'i':
+ lexFloatSuffix();
+ break decimalLoop;
+ case 'L':
+ if (foundDot)
+ lexFloatSuffix();
+ else
+ lexIntSuffix();
+ break decimalLoop;
+ case 'f':
+ case 'F':
+ lexFloatSuffix();
+ break decimalLoop;
+ case 'e':
+ case 'E':
+ lexExponent();
+ break decimalLoop;
+ case '.':
+ if (foundDot)
+ break decimalLoop;
+ if (src.canPeek() && src.peek() == '.')
+ break decimalLoop;
+ nextCharNonLF();
+ foundDot = true;
+ current.type = TokenType.doubleLiteral;
+ break;
+ default:
+ break decimalLoop;
+ }
+ }
+ setTokenValue();
+ }
- void lexBinary()
- {
- current.type = TokenType.intLiteral;
- binaryLoop: while (!isEoF())
- {
- switch (src.front)
- {
- case '0':
- case '1':
- case '_':
- nextCharNonLF();
- break;
- case 'u':
- case 'U':
- case 'L':
- lexIntSuffix();
- break binaryLoop;
- default:
- break binaryLoop;
- }
- }
- setTokenValue();
- }
+ void lexBinary()
+ {
+ current.type = TokenType.intLiteral;
+ binaryLoop: while (!isEoF())
+ {
+ switch (src.front)
+ {
+ case '0':
+ case '1':
+ case '_':
+ nextCharNonLF();
+ break;
+ case 'u':
+ case 'U':
+ case 'L':
+ lexIntSuffix();
+ break binaryLoop;
+ default:
+ break binaryLoop;
+ }
+ }
+ setTokenValue();
+ }
- void lexHex()
- {
- current.type = TokenType.intLiteral;
- bool foundDot;
- hexLoop: while (!isEoF())
- {
- switch (src.front)
- {
- case 'a': .. case 'f':
- case 'A': .. case 'F':
- case '0': .. case '9':
- case '_':
- nextCharNonLF();
- break;
- case 'u':
- case 'U':
- lexIntSuffix();
- break hexLoop;
- case 'i':
- if (foundDot)
- lexFloatSuffix();
- break hexLoop;
- case 'L':
- if (foundDot)
- {
- lexFloatSuffix();
- break hexLoop;
- }
- else
- {
- lexIntSuffix();
- break hexLoop;
- }
- case 'p':
- case 'P':
- lexExponent();
- break hexLoop;
- case '.':
- if (foundDot)
- break hexLoop;
- if (src.canPeek() && src.peek() == '.')
- break hexLoop;
- nextCharNonLF();
- foundDot = true;
- current.type = TokenType.doubleLiteral;
- break;
- default:
- break hexLoop;
- }
- }
- setTokenValue();
- }
+ void lexHex()
+ {
+ current.type = TokenType.intLiteral;
+ bool foundDot;
+ hexLoop: while (!isEoF())
+ {
+ switch (src.front)
+ {
+ case 'a': .. case 'f':
+ case 'A': .. case 'F':
+ case '0': .. case '9':
+ case '_':
+ nextCharNonLF();
+ break;
+ case 'u':
+ case 'U':
+ lexIntSuffix();
+ break hexLoop;
+ case 'i':
+ if (foundDot)
+ lexFloatSuffix();
+ break hexLoop;
+ case 'L':
+ if (foundDot)
+ {
+ lexFloatSuffix();
+ break hexLoop;
+ }
+ else
+ {
+ lexIntSuffix();
+ break hexLoop;
+ }
+ case 'p':
+ case 'P':
+ lexExponent();
+ break hexLoop;
+ case '.':
+ if (foundDot)
+ break hexLoop;
+ if (src.canPeek() && src.peek() == '.')
+ break hexLoop;
+ nextCharNonLF();
+ foundDot = true;
+ current.type = TokenType.doubleLiteral;
+ break;
+ default:
+ break hexLoop;
+ }
+ }
+ setTokenValue();
+ }
- bool lexStringSuffix()
- {
- current.type = TokenType.stringLiteral;
- bool foundSuffix = false;
- if (!isEoF())
- {
- switch (src.front)
- {
- case 'w':
- current.type = TokenType.wstringLiteral;
- goto case 'c';
- case 'd':
- current.type = TokenType.dstringLiteral;
- goto case 'c';
- case 'c':
- foundSuffix = true;
- nextCharNonLF();
- break;
- default:
- break;
- }
- }
- return foundSuffix;
- }
+ bool lexStringSuffix()
+ {
+ current.type = TokenType.stringLiteral;
+ bool foundSuffix = false;
+ if (!isEoF())
+ {
+ switch (src.front)
+ {
+ case 'w':
+ current.type = TokenType.wstringLiteral;
+ goto case 'c';
+ case 'd':
+ current.type = TokenType.dstringLiteral;
+ goto case 'c';
+ case 'c':
+ foundSuffix = true;
+ nextCharNonLF();
+ break;
+ default:
+ break;
+ }
+ }
+ return foundSuffix;
+ }
- void lexCharacterLiteral()
- in
- {
- assert (src.front == '\'');
- }
- body
- {
- current.type = TokenType.characterLiteral;
- nextChar();
- if (isEoF())
- {
- errorMessage("Unterminated character literal");
- return;
- }
- switch (src.front)
- {
- case '\'':
- break;
- case '\\':
- if (config.tokenStyle & TokenStyle.notEscaped)
- skipEscapeSequence();
- else
- {
- // the only special path
- // 40 bytes is enough for 2 quotes
- // and the longest character entity
- ubyte[40] utf8;
- size_t len;
- if (config.tokenStyle & TokenStyle.includeQuotes)
- {
- utf8[0] = '\'';
- len = decodeEscapeSequence(utf8[1..$]);
- utf8[len++] = '\'';
- }
- else
- len = decodeEscapeSequence(utf8[]);
- if (src.front != '\'')
- {
- errorMessage("Expected \"'\" to end character literal");
- }
- // skip over last "'"
- nextChar();
- setTokenValue(utf8[0..len]);
- return;
- }
- break;
- default:
- if (src.front & 0x80)
- {
- while (src.front & 0x80)
- nextChar();
- break;
- }
- else
- {
- nextChar();
- break;
- }
- }
- if (src.front != '\'')
- errorMessage("Expected \"'\" to end character literal");
- nextChar();
- if (config.tokenStyle & TokenStyle.includeQuotes)
- setTokenValue();
- else
- setTokenValue(1, -1);
- }
+ void lexCharacterLiteral()
+ in
+ {
+ assert (src.front == '\'');
+ }
+ body
+ {
+ current.type = TokenType.characterLiteral;
+ nextChar();
+ if (isEoF())
+ {
+ errorMessage("Unterminated character literal");
+ return;
+ }
+ switch (src.front)
+ {
+ case '\'':
+ break;
+ case '\\':
+ if (config.tokenStyle & TokenStyle.notEscaped)
+ skipEscapeSequence();
+ else
+ {
+ // the only special path
+ // 40 bytes is enough for 2 quotes
+ // and the longest character entity
+ ubyte[40] utf8;
+ size_t len;
+ if (config.tokenStyle & TokenStyle.includeQuotes)
+ {
+ utf8[0] = '\'';
+ len = decodeEscapeSequence(utf8[1..$]);
+ utf8[len++] = '\'';
+ }
+ else
+ len = decodeEscapeSequence(utf8[]);
+ if (src.front != '\'')
+ {
+ errorMessage("Expected \"'\" to end character literal");
+ }
+ // skip over last "'"
+ nextChar();
+ setTokenValue(utf8[0..len]);
+ return;
+ }
+ break;
+ default:
+ if (src.front & 0x80)
+ {
+ while (src.front & 0x80)
+ nextChar();
+ break;
+ }
+ else
+ {
+ nextChar();
+ break;
+ }
+ }
+ if (src.front != '\'')
+ errorMessage("Expected \"'\" to end character literal");
+ nextChar();
+ if (config.tokenStyle & TokenStyle.includeQuotes)
+ setTokenValue();
+ else
+ setTokenValue(1, -1);
+ }
- void lexString()
- in
- {
- assert (src.front == '"');
- }
- body
- {
- current.type = TokenType.stringLiteral;
- bool longWysiwg = src.slice.length > 0 && src.slice[0] == 'r'; // 2 chars : r"
- bool isWysiwyg = src.front == '`';
- // in case we need to unescape string
- Appender!(ubyte[]) unescaped;
- auto quote = src.front;
- nextChar();
- while (true)
- {
- if (isEoF())
- {
- errorMessage("Unterminated string literal");
- return;
- }
- else if (src.front == '\\')
- {
- if (isWysiwyg || longWysiwg)
- nextChar();
- else if(config.tokenStyle & TokenStyle.notEscaped)
- {
- skipEscapeSequence();
- }
- else
- {
- if(unescaped == Appender!(ubyte[]).init)
- unescaped = appender!(ubyte[])();
- unescaped.put(src.slice());
- decodeEscapeSequence(unescaped);
- src.mark(); //start next slice after escape sequence
- }
- }
- else if (src.front == quote)
- {
- nextCharNonLF();
- break;
- }
- else
- nextChar();
- }
- lexStringSuffix();
- // helper to handle quotes
- void setData(R)(R range)
- {
- if (config.tokenStyle & TokenStyle.includeQuotes)
- setTokenValue(range);
- else if (longWysiwg)
- setTokenValue(range[2..$-1]);
- else
- setTokenValue(range[1..$-1]);
- }
- import std.stdio;
- if(unescaped != Appender!(ubyte[]).init)
- {
- //stuff in the last slice and use buffered data
- unescaped.put(src.slice);
- setData(unescaped.data);
- }
- else
- {
- setData(src.slice); //slice directly
- }
- }
+ void lexString()
+ in
+ {
+ assert (src.front == '"');
+ }
+ body
+ {
+ current.type = TokenType.stringLiteral;
+ bool longWysiwg = src.slice.length > 0 && src.slice[0] == 'r'; // 2 chars : r"
+ bool isWysiwyg = src.front == '`';
+ // in case we need to unescape string
+ Appender!(ubyte[]) unescaped;
+ auto quote = src.front;
+ nextChar();
+ while (true)
+ {
+ if (isEoF())
+ {
+ errorMessage("Unterminated string literal");
+ return;
+ }
+ else if (src.front == '\\')
+ {
+ if (isWysiwyg || longWysiwg)
+ nextChar();
+ else if(config.tokenStyle & TokenStyle.notEscaped)
+ {
+ skipEscapeSequence();
+ }
+ else
+ {
+ if(unescaped == Appender!(ubyte[]).init)
+ unescaped = appender!(ubyte[])();
+ unescaped.put(src.slice());
+ decodeEscapeSequence(unescaped);
+ src.mark(); //start next slice after escape sequence
+ }
+ }
+ else if (src.front == quote)
+ {
+ nextCharNonLF();
+ break;
+ }
+ else
+ nextChar();
+ }
+ lexStringSuffix();
+ // helper to handle quotes
+ void setData(R)(R range)
+ {
+ if (config.tokenStyle & TokenStyle.includeQuotes)
+ setTokenValue(range);
+ else if (longWysiwg)
+ setTokenValue(range[2..$-1]);
+ else
+ setTokenValue(range[1..$-1]);
+ }
+ import std.stdio;
+ if(unescaped != Appender!(ubyte[]).init)
+ {
+ //stuff in the last slice and use buffered data
+ unescaped.put(src.slice);
+ setData(unescaped.data);
+ }
+ else
+ {
+ setData(src.slice); //slice directly
+ }
+ }
- void lexDelimitedString()
- in
- {
- assert(src.front == '"');
- }
- body
- {
- current.type = TokenType.stringLiteral;
+ void lexDelimitedString()
+ in
+ {
+ assert(src.front == '"');
+ }
+ body
+ {
+ current.type = TokenType.stringLiteral;
- nextChar();
+ nextChar();
- bool heredoc;
- ubyte open;
- ubyte close;
+ bool heredoc;
+ ubyte open;
+ ubyte close;
- switch (src.front)
- {
- case '[': open = '['; close = ']'; break;
- case '{': open = '{'; close = '}'; break;
- case '(': open = '('; close = ')'; break;
- case '<': open = '<'; close = '>'; break;
- default: heredoc = true; break;
- }
- if (heredoc)
- lexHeredocString();
- else
- lexNormalDelimitedString(open, close);
- }
+ switch (src.front)
+ {
+ case '[': open = '['; close = ']'; break;
+ case '{': open = '{'; close = '}'; break;
+ case '(': open = '('; close = ')'; break;
+ case '<': open = '<'; close = '>'; break;
+ default: heredoc = true; break;
+ }
+ if (heredoc)
+ lexHeredocString();
+ else
+ lexNormalDelimitedString(open, close);
+ }
- void lexNormalDelimitedString(ubyte open, ubyte close)
- in
- {
- assert(src.slice[0 .. 2] == `q"`);
- }
- body
- {
- current.type = TokenType.stringLiteral;
- int depth = 1;
- nextChar();
- while (true)
- {
- if (isEoF())
- {
- errorMessage("Unterminated string literal");
- break;
- }
- if (src.front == open)
- {
- nextChar();
- ++depth;
- }
- else if (src.front == close)
- {
- nextChar();
- --depth;
- if (depth <= 0)
- {
- auto r = src.save(); //TODO: allocates for Fwd range
- if (r.front == '"')
- {
- nextChar();
- break;
- }
- else
- {
- errorMessage("Expected \" after balanced "
- ~ cast(char) close ~ " but found "
- ~ cast(char) r.front ~ " instead.");
- break;
- }
- }
- }
- else
- nextChar();
- }
- if (config.tokenStyle & TokenStyle.includeQuotes)
- setTokenValue();
- else
- setTokenValue(3, -2);
- }
+ void lexNormalDelimitedString(ubyte open, ubyte close)
+ in
+ {
+ assert(src.slice[0 .. 2] == `q"`);
+ }
+ body
+ {
+ current.type = TokenType.stringLiteral;
+ int depth = 1;
+ nextChar();
+ while (true)
+ {
+ if (isEoF())
+ {
+ errorMessage("Unterminated string literal");
+ break;
+ }
+ if (src.front == open)
+ {
+ nextChar();
+ ++depth;
+ }
+ else if (src.front == close)
+ {
+ nextChar();
+ --depth;
+ if (depth <= 0)
+ {
+ auto r = src.save(); //TODO: allocates for Fwd range
+ if (r.front == '"')
+ {
+ nextChar();
+ break;
+ }
+ else
+ {
+ errorMessage("Expected \" after balanced "
+ ~ cast(char) close ~ " but found "
+ ~ cast(char) r.front ~ " instead.");
+ break;
+ }
+ }
+ }
+ else
+ nextChar();
+ }
+ if (config.tokenStyle & TokenStyle.includeQuotes)
+ setTokenValue();
+ else
+ setTokenValue(3, -2);
+ }
- void lexHeredocString()
- in
- {
- assert (src.slice.equal("q\""));
- }
- body
- {
- typeof(src.slice) ident;
- uint newlineBytes;
- while (true)
- {
- if (isEoF())
- {
- errorMessage("Unterminated string literal");
- return;
- }
- else if (isNewline(src.front))
- {
- ident = src.slice[2..$];
- nextChar();
- newlineBytes = cast(uint) (src.slice.length - 2 - ident.length);
- break;
- }
- else if (isSeparating())
- {
- nextChar();
- ident = src.slice[2..$];
- nextChar();
- newlineBytes = 0;
- break;
- }
- else
- {
- nextChar();
- }
- }
- while (true)
- {
- if (isEoF())
- {
- errorMessage("Unterminated string literal");
- break;
- }
- else if (src.slice.length > ident.length
- && src.slice[$-ident.length .. $].equal(ident))
- {
- if (src.front == '"')
- {
- nextChar();
- lexStringSuffix();
- break;
- }
- else
- {
- errorMessage("Unterminated string literal: " ~ cast(string) src.slice);
- break;
- }
- }
- else
- nextChar();
- }
+ void lexHeredocString()
+ in
+ {
+ assert (src.slice.equal("q\""));
+ }
+ body
+ {
+ typeof(src.slice) ident;
+ uint newlineBytes;
+ while (true)
+ {
+ if (isEoF())
+ {
+ errorMessage("Unterminated string literal");
+ return;
+ }
+ else if (isNewline(src.front))
+ {
+ ident = src.slice[2..$];
+ nextChar();
+ newlineBytes = cast(uint) (src.slice.length - 2 - ident.length);
+ break;
+ }
+ else if (isSeparating())
+ {
+ nextChar();
+ ident = src.slice[2..$];
+ nextChar();
+ newlineBytes = 0;
+ break;
+ }
+ else
+ {
+ nextChar();
+ }
+ }
+ while (true)
+ {
+ if (isEoF())
+ {
+ errorMessage("Unterminated string literal");
+ break;
+ }
+ else if (src.slice.length > ident.length
+ && src.slice[$-ident.length .. $].equal(ident))
+ {
+ if (src.front == '"')
+ {
+ nextChar();
+ lexStringSuffix();
+ break;
+ }
+ else
+ {
+ errorMessage("Unterminated string literal: " ~ cast(string) src.slice);
+ break;
+ }
+ }
+ else
+ nextChar();
+ }
- bool hasSuffix = lexStringSuffix();
+ bool hasSuffix = lexStringSuffix();
- if (config.tokenStyle & TokenStyle.includeQuotes)
- setTokenValue();
- else
- {
- setTokenValue(cast(int) (2 + newlineBytes + ident.length),
- cast(int) (-(ident.length + (hasSuffix ? 2 : 1))));
- }
- }
+ if (config.tokenStyle & TokenStyle.includeQuotes)
+ setTokenValue();
+ else
+ {
+ setTokenValue(cast(int) (2 + newlineBytes + ident.length),
+ cast(int) (-(ident.length + (hasSuffix ? 2 : 1))));
+ }
+ }
- void lexTokenString()
- in
- {
- assert (src.front == '{');
- }
- body
- {
- current.type = TokenType.stringLiteral;
- nextChar();
- auto app = appender!(ubyte[])();
- if (config.tokenStyle & TokenStyle.includeQuotes)
- {
- app.put('q');
- app.put('{');
- }
- LexerConfig c = config;
- scope (exit) config = c;
- config.iterStyle = IterationStyle.everything;
- config.tokenStyle = TokenStyle.source;
- int depth = 1;
+ void lexTokenString()
+ in
+ {
+ assert (src.front == '{');
+ }
+ body
+ {
+ current.type = TokenType.stringLiteral;
+ nextChar();
+ auto app = appender!(ubyte[])();
+ if (config.tokenStyle & TokenStyle.includeQuotes)
+ {
+ app.put('q');
+ app.put('{');
+ }
+ LexerConfig c = config;
+ scope (exit) config = c;
+ config.iterStyle = IterationStyle.everything;
+ config.tokenStyle = TokenStyle.source;
+ int depth = 1;
- while (!isEoF())
- {
- advance();
- if (current.type == TokenType.lBrace)
- ++depth;
- else if (current.type == TokenType.rBrace)
- {
- --depth;
- if (depth <= 0)
- break;
- }
- app.put(representation(current.value));
- }
- config = c;
- if (config.tokenStyle & TokenStyle.includeQuotes)
- {
- app.put('}');
- }
- if (src.empty)
- current.type = TokenType.stringLiteral;
- else
- {
- switch (src.front)
- {
- case 'd':
- if (config.tokenStyle & TokenStyle.includeQuotes)
- app.put('d');
- current.type = TokenType.dstringLiteral;
- src.popFront();
- break;
- case 'w':
- if (config.tokenStyle & TokenStyle.includeQuotes)
- app.put('w');
- current.type = TokenType.wstringLiteral;
- src.popFront();
- break;
- case 'c':
- if (config.tokenStyle & TokenStyle.includeQuotes)
- app.put('c');
- src.popFront();
- goto default;
- default:
- current.type = TokenType.stringLiteral;
- break;
- }
- }
- current.value = cast(string) app.data;
- }
+ while (!isEoF())
+ {
+ advance();
+ if (current.type == TokenType.lBrace)
+ ++depth;
+ else if (current.type == TokenType.rBrace)
+ {
+ --depth;
+ if (depth <= 0)
+ break;
+ }
+ app.put(representation(current.value));
+ }
+ config = c;
+ if (config.tokenStyle & TokenStyle.includeQuotes)
+ {
+ app.put('}');
+ }
+ if (src.empty)
+ current.type = TokenType.stringLiteral;
+ else
+ {
+ switch (src.front)
+ {
+ case 'd':
+ if (config.tokenStyle & TokenStyle.includeQuotes)
+ app.put('d');
+ current.type = TokenType.dstringLiteral;
+ src.popFront();
+ break;
+ case 'w':
+ if (config.tokenStyle & TokenStyle.includeQuotes)
+ app.put('w');
+ current.type = TokenType.wstringLiteral;
+ src.popFront();
+ break;
+ case 'c':
+ if (config.tokenStyle & TokenStyle.includeQuotes)
+ app.put('c');
+ src.popFront();
+ goto default;
+ default:
+ current.type = TokenType.stringLiteral;
+ break;
+ }
+ }
+ current.value = cast(string) app.data;
+ }
- void lexSpecialTokenSequence()
- in
- {
- assert (src.front == '#');
- }
- body
- {
- nextChar();
- auto r = src.save();
- auto app = appender!(ubyte[])();
- app.put('#');
- while (true)
- {
- if (r.isRangeEoF())
- {
- errorMessage("Found EOF when interpreting special token sequence");
- return;
- }
- else if (isNewline(r.front))
- break;
- else
- {
- app.put(r.front);
- r.popFront();
- }
- }
- auto m = match((cast(char[]) app.data),
- `#line\s+(?P