// Written in the D programming language

/**
 * This module contains a range-based _lexer for the D programming language.
 *
 * For performance reasons the _lexer contained in this module operates only on
 * ASCII or UTF-8 encoded source code. If the use of other encodings is
 * desired, the source code must be converted to UTF-8 before passing it to
 * this _lexer.
 *
 * To use the _lexer, create a LexerConfig struct:
 * ---
 * LexerConfig config;
 * config.iterStyle = IterationStyle.everything;
 * config.tokenStyle = TokenStyle.source;
 * config.versionNumber = 2064;
 * config.vendorString = "Lexer Example";
 * ---
 * Once you have configured the _lexer, call byToken$(LPAREN)$(RPAREN) on your
 * source code, passing in the configuration.
 * ---
 * auto source = "import std.stdio;"c;
 * auto tokens = byToken(source, config);
 * ---
 * The result of byToken$(LPAREN)$(RPAREN) is an input range of tokens that can
 * be used easily with the algorithms from std.algorithm or iterated over with
 * $(D_KEYWORD foreach).
 * ---
 * assert (tokens.front.type == TokenType.import_);
 * assert (tokens.front.value == "import");
 * assert (tokens.front.line == 1);
 * assert (tokens.front.startIndex == 0);
 * ---
 *
 * Examples:
 *
 * Generate HTML markup of D code.
 * ---
 * module highlighter;
 *
 * import std.stdio;
 * import std.array;
 * import stdx.d.lexer;
 *
 * void writeSpan(string cssClass, string value)
 * {
 *     stdout.write(`<span class="`, cssClass, `">`,
 *         value.replace("&", "&amp;").replace("<", "&lt;"), `</span>`);
 * }
 *
 * // Color scheme: http://ethanschoonover.com/solarized
 * void highlight(R)(R tokens)
 * {
 *     stdout.writeln(q"[<!DOCTYPE html>
 * <html>
 * <head>
 * <meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
 * </head>
 * <body>
 * <style type="text/css">
 * html  { background-color: #fdf6e3; color: #002b36; }
 * .kwrd { color: #b58900; font-weight: bold;  }
 * .com  { color: #93a1a1; font-style: italic; }
 * .num  { color: #dc322f; font-weight: bold;  }
 * .str  { color: #2aa198; font-style: italic; }
 * .op   { color: #586e75; font-weight: bold;  }
 * .type { color: #268bd2; font-weight: bold;  }
 * .cons { color: #859900; font-weight: bold;  }
 * </style>
 * <pre>]");
*
 *     foreach (Token t; tokens)
 *     {
 *         if (isBuiltType(t.type))
 *             writeSpan("type", t.value);
 *         else if (isKeyword(t.type))
 *             writeSpan("kwrd", t.value);
 *         else if (t.type == TokenType.comment)
 *             writeSpan("com", t.value);
 *         else if (isStringLiteral(t.type))
 *             writeSpan("str", t.value);
 *         else if (isNumberLiteral(t.type))
 *             writeSpan("num", t.value);
 *         else if (isOperator(t.type))
 *             writeSpan("op", t.value);
 *         else
 *             stdout.write(t.value.replace("<", "&lt;"));
 *     }
 *     stdout.writeln("\n</pre>\n</body></html>");
 * }
 *
 * void main(string[] args)
 * {
 *     LexerConfig config;
 *     config.tokenStyle = TokenStyle.source;
 *     config.iterStyle = IterationStyle.everything;
 *     config.fileName = args[1];
 *     auto f = File(args[1]);
 *     (cast(ubyte[]) f.byLine(KeepTerminator.yes).join())
 *         .byToken(config)
 *         .highlight();
 * }
* ---
*
* Copyright: Brian Schott 2013
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors: Brian Schott, Dmitry Olshansky
* Source: $(PHOBOSSRC std/d/_lexer.d)
*/
module stdx.d.lexer;
import std.algorithm;
import std.ascii;
import std.conv;
import std.datetime;
import stdx.d.entities;
import std.exception;
import std.range;
import std.regex;
import std.string;
import std.traits;
import std.utf;
version (unittest) import std.stdio;
public:
/**
* Represents a D token
*/
struct Token
{
/**
* The representation of the token in the original source code.
*/
string value;
/**
* The index of the start of the token in the original source.
* $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
*/
size_t startIndex;
/**
* The number of the line the token is on.
*/
uint line;
/**
* The column number of the start of the token in the original source.
* $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
*/
ushort column;
/**
* The token type.
*/
TokenType type;
/**
* Check to see if the token is of the same type and has the same string
* representation as the given token.
*/
bool opEquals(ref const(Token) other) const nothrow pure
{
return other.type == type && other.value == value;
}
/**
* Checks to see if the token's string representation is equal to the given
* string.
*/
bool opEquals(string value) const nothrow pure
{
return this.value == value;
}
/**
* Checks to see if the token is of the given type.
*/
bool opEquals(TokenType type) const nothrow pure
{
return this.type == type;
}
/**
* Comparison operator orders tokens by start index.
*/
int opCmp(ref const(Token) other) const nothrow pure
{
if (startIndex < other.startIndex) return -1;
if (startIndex > other.startIndex) return 1;
return 0;
}
int opCmp(size_t index) const nothrow pure
{
if (startIndex < index) return -1;
if (startIndex > index) return 1;
return 0;
}
}
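// A minimal usage sketch (not part of the original API docs): the overloads
// above allow comparing a token against another token, a string, or a
// TokenType, and ordering tokens by start index.
unittest
{
    Token a;
    a.type = TokenType.import_;
    a.value = "import";
    a.startIndex = 0;
    assert (a == "import");            // opEquals(string)
    assert (a == TokenType.import_);   // opEquals(TokenType)
    Token b;
    b.value = "std";
    b.startIndex = 7;
    assert (a < b); // opCmp orders by startIndex
}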
/**
* Configure the behavior of the byToken() function. These flags may be
* combined using a bitwise or.
*/
enum IterationStyle : ushort
{
/// Only include code, not whitespace or comments
codeOnly = 0,
/// Includes comments
includeComments = 0b0001,
/// Includes whitespace
includeWhitespace = 0b0010,
/// Include $(LINK2 http://dlang.org/lex.html#specialtokens, special tokens)
includeSpecialTokens = 0b0100,
/// Do not stop iteration on reaching the $(D_KEYWORD ___EOF__) token
ignoreEOF = 0b1000,
/// Include _everything
everything = includeComments | includeWhitespace | ignoreEOF
}
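// A minimal sketch of combining the flags above. Note that D widens the
// result of | on enum members to the base type, so a cast back to
// IterationStyle is needed when combining flags by hand.
unittest
{
    LexerConfig config;
    config.iterStyle = cast(IterationStyle)
        (IterationStyle.includeComments | IterationStyle.includeWhitespace);
}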
/**
* Configuration of the token lexing style. These flags may be combined with a
* bitwise or.
*/
enum TokenStyle : ushort
{
/**
* Escape sequences will be replaced with their equivalent characters,
* enclosing quote characters will not be included. Special tokens such as
* $(D_KEYWORD ___VENDOR__) will be replaced with their equivalent strings.
* Useful for creating a compiler or interpreter.
*/
default_ = 0b0000,
/**
* Escape sequences will not be processed. An escaped quote character will
* not terminate string lexing, but it will not be replaced with the quote
* character in the token.
*/
notEscaped = 0b0001,
/**
* Strings will include their opening and closing quote characters as well
* as any prefixes or suffixes $(LPAREN)e.g.: $(D_STRING "abcde"w) will
* include the $(D_STRING 'w') character as well as the opening and closing
* quotes$(RPAREN)
*/
includeQuotes = 0b0010,
/**
* Do not replace the value field of the special tokens such as
* $(D_KEYWORD ___DATE__) with their string equivalents.
*/
doNotReplaceSpecial = 0b0100,
/**
* Strings will be read exactly as they appeared in the source, including
* their opening and closing quote characters. Useful for syntax
* highlighting.
*/
source = notEscaped | includeQuotes | doNotReplaceSpecial
}
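// A hedged sketch of the source style, assuming the byToken overloads below:
// with TokenStyle.source a string token keeps its quotes and its unprocessed
// escape sequences, exactly as written.
unittest
{
    LexerConfig config;
    config.tokenStyle = TokenStyle.source;
    auto tokens = byToken(cast(const(ubyte)[]) `"a\n"`, config);
    assert (tokens.front.value == `"a\n"`); // quotes and escape kept verbatim
}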
/**
* Lexer configuration
*/
struct LexerConfig
{
/**
* Iteration style
*/
IterationStyle iterStyle = IterationStyle.codeOnly;
/**
* Token style
*/
TokenStyle tokenStyle = TokenStyle.default_;
/**
* Replacement for the $(D_KEYWORD ___VERSION__) token. Defaults to 100.
*/
uint versionNumber = 100;
/**
* Replacement for the $(D_KEYWORD ___VENDOR__) token. Defaults to $(D_STRING "std.d.lexer")
*/
string vendorString = "std.d.lexer";
/**
* Name used when creating error messages that are sent to errorFunc. This
* is needed because the lexer operates on any forward range of ASCII
* characters or UTF-8 code units and does not know what to call its input
* source. Defaults to the empty string.
*/
string fileName = "";
/**
* This function is called when an error is encountered during lexing.
* Parameters are file name, code unit index, line number, column,
* and error message.
*/
void delegate(string, size_t, uint, uint, string) errorFunc;
}
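// A sketch of wiring up the error callback; the parameter order matches the
// documentation above (file name, code unit index, line, column, message).
unittest
{
    LexerConfig config;
    config.fileName = "example.d";
    config.errorFunc = delegate(string file, size_t index, uint line,
        uint column, string message)
    {
        // report the error and let lexing continue
        stderr.writefln("%s(%d,%d): %s", file, line, column, message);
    };
}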
/**
* Iterate over the given range of characters by D tokens.
* Params:
* range = the range of characters
* config = the lexer configuration
* bufferSize = initial size of internal circular buffer
* Returns:
* an input range of tokens
*/
auto byToken(R)(R range, LexerConfig config, size_t bufferSize = 4*1024)
if (isForwardRange!(R) && !isRandomAccessRange!(R)
&& is(ElementType!R : const(ubyte)))
{
// 4K of circular buffer by default
auto r = TokenRange!(typeof(lexerSource(range)))
(lexerSource(range, bufferSize), config);
r.config = config;
r.lineNumber = 1;
r.popFront();
return r;
}
///ditto
auto byToken(R)(R range, LexerConfig config)
if (isRandomAccessRange!(R) && is(ElementType!R : const(ubyte)))
{
auto r = TokenRange!(typeof(lexerSource(range)))
(lexerSource(range), config);
r.config = config;
r.lineNumber = 1;
r.popFront();
return r;
}
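// An end-to-end sketch of the overloads above, mirroring the example input
// from the module documentation.
unittest
{
    LexerConfig config;
    auto tokens = byToken(cast(const(ubyte)[]) "import std.stdio;", config);
    assert (tokens.front.type == TokenType.import_);
    assert (tokens.front.value == "import");
    tokens.popFront(); // whitespace is skipped under the default codeOnly style
    assert (tokens.front.value == "std");
}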
/**
* Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate.
*/
struct TokenRange(LexSrc)
//if ( is(LexSrc : LexSource!(U...), U...)) //check for LexSource
{
/**
* Returns: true if the range is empty
*/
bool empty() const @property
{
return _empty;
}
/**
* Returns: the current token
*/
ref const(Token) front() const @property
{
assert(!empty, "trying to get front of an empty token range");
return current;
}
/**
* Returns the current token and then removes it from the range
*/
Token moveFront()
{
auto r = move(current);
popFront();
return r;
}
/**
* Removes the current token from the range
*/
void popFront()
{
advance();
}
private:
/*
* Advances the range to the next token
*/
void advance()
{
L_advance:
if (src.empty)
{
_empty = true;
return;
}
src.mark(); // mark a start of a lexing "frame"
current.line = lineNumber;
current.startIndex = src.index;
current.column = column;
current.value = null;
switch (src.front)
{
// handle sentinels for end of input
case 0:
case 0x1a:
// TODO: check config flags, it's cheap
// since this branch at most is taken once per file
_empty = true;
return;
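// generateCaseTrie expands the string/TokenType pairs below into nested
// switch cases, so the longest matching operator is always preferred.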
mixin(generateCaseTrie(
"=", "TokenType.assign",
"@", "TokenType.at",
"&", "TokenType.bitAnd",
"&=", "TokenType.bitAndEqual",
"|", "TokenType.bitOr",
"|=", "TokenType.bitOrEqual",
"~=", "TokenType.catEqual",
":", "TokenType.colon",
",", "TokenType.comma",
"--", "TokenType.decrement",
"$", "TokenType.dollar",
"==", "TokenType.equal",
"=>", "TokenType.goesTo",
">", "TokenType.greater",
">=", "TokenType.greaterEqual",
"++", "TokenType.increment",
"{", "TokenType.lBrace",
"[", "TokenType.lBracket",
"<", "TokenType.less",
"<=", "TokenType.lessEqual",
"<>=", "TokenType.lessEqualGreater",
"<>", "TokenType.lessOrGreater",
"&&", "TokenType.logicAnd",
"||", "TokenType.logicOr",
"(", "TokenType.lParen",
"-", "TokenType.minus",
"-=", "TokenType.minusEqual",
"%", "TokenType.mod",
"%=", "TokenType.modEqual",
"*=", "TokenType.mulEqual",
"!", "TokenType.not",
"!=", "TokenType.notEqual",
"!>", "TokenType.notGreater",
"!>=", "TokenType.notGreaterEqual",
"!<", "TokenType.notLess",
"!<=", "TokenType.notLessEqual",
"!<>", "TokenType.notLessEqualGreater",
"+", "TokenType.plus",
"+=", "TokenType.plusEqual",
"^^", "TokenType.pow",
"^^=", "TokenType.powEqual",
"}", "TokenType.rBrace",
"]", "TokenType.rBracket",
")", "TokenType.rParen",
";", "TokenType.semicolon",
"<<", "TokenType.shiftLeft",
"<<=", "TokenType.shiftLeftEqual",
">>", "TokenType.shiftRight",
">>=", "TokenType.shiftRightEqual",
"*", "TokenType.star",
"?", "TokenType.ternary",
"~", "TokenType.tilde",
"!<>=", "TokenType.unordered",
">>>", "TokenType.unsignedShiftRight",
">>>=", "TokenType.unsignedShiftRightEqual",
"^", "TokenType.xor",
"^=", "TokenType.xorEqual"
));
case '/':
nextCharNonLF();
if (isEoF())
{
current.type = TokenType.div;
current.value = "/";
return;
}
switch (src.front)
{
case '/':
case '*':
case '+':
if (config.iterStyle & IterationStyle.includeComments)
return lexComment!true();
lexComment!false();
goto L_advance; // tail-recursion
case '=':
current.type = TokenType.divEqual;
current.value = "/=";
src.popFront();
return;
default:
current.type = TokenType.div;
current.value = "/";
return;
}
case '.':
if (!src.canPeek())
{
current.type = TokenType.dot;
current.value = tokenValue!(TokenType.dot);
return;
}
switch (src.peek())
{
case '0': .. case '9':
lexNumber();
return;
case '.':
nextCharNonLF();
nextCharNonLF();
current.type = TokenType.slice;
if (src.front == '.')
{
current.type = TokenType.vararg;
nextCharNonLF();
current.value = tokenValue!(TokenType.vararg);
}
else
current.value = tokenValue!(TokenType.slice);
return;
default:
nextCharNonLF();
current.type = TokenType.dot;
current.value = tokenValue!(TokenType.dot);
return;
}
case '0': .. case '9':
lexNumber();
return;
case '\'':
lexCharacterLiteral();
return;
case '"':
case '`':
lexString();
return;
case 'q':
nextCharNonLF();
if (isEoF())
goto default;
switch (src.front)
{
case '{':
lexTokenString();
return;
case '"':
lexDelimitedString();
return;
default:
break;
}
goto default;
case 'r':
nextCharNonLF();
if (isEoF())
goto default;
else if (src.front == '"')
{
lexString();
return;
}
else
goto default;
case 'x':
nextCharNonLF();
if (isEoF())
goto default;
else if (src.front == '"')
{
lexHexString();
return;
}
else
goto default;
case '#':
lexSpecialTokenSequence();
if(config.iterStyle & IterationStyle.includeSpecialTokens)
return;
goto L_advance; // tail-recursion
// "short" ASCII whites
case 0x20:
case 0x09: .. case 0x0d:
if (config.iterStyle & IterationStyle.includeWhitespace)
return lexWhitespace!true();
lexWhitespace!false();
goto L_advance; // tail-recursion
default:
if ((src.front & 0x80) && isLongWhite())
{
if (config.iterStyle & IterationStyle.includeWhitespace)
return lexWhitespace!true();
lexWhitespace!false();
goto L_advance; // tail-recursion
}
for(;;)
{
if(isSeparating())
break;
nextCharNonLF();
if(isEoF())
break;
}
current.type = lookupTokenType(src.slice);
current.value = getTokenValue(current.type);
if (current.value is null)
setTokenValue();
if (!(config.iterStyle & IterationStyle.ignoreEOF) && current.type == TokenType.specialEof)
{
_empty = true;
return;
}
if (config.tokenStyle & TokenStyle.doNotReplaceSpecial)
return;
expandSpecialToken();
}
}
// TODO: LexSource could be improved for forward ranges
// to avoid buffering at all (by disabling it for a moment)
// so keep the 'keep' parameter here and elsewhere
void lexWhitespace(bool keep)()
{
current.type = TokenType.whitespace;
do
{
nextChar();
} while (!isEoF() && isWhite());
static if (keep) setTokenValue();
}
void lexComment(bool keep)()
in
{
assert (src.front == '/' || src.front == '*' || src.front == '+');
}
body
{
current.type = TokenType.comment;
switch(src.front)
{
case '/':
while (!isEoF() && !isNewline(src.front))
{
nextCharNonLF();
}
break;
case '*':
while (!isEoF())
{
if (src.front == '*')
{
static if (keep) nextCharNonLF();
else src.popFront();
if (src.front == '/')
{
nextCharNonLF();
break;
}
}
else
nextChar();
}
break;
case '+':
int depth = 1;
while (depth > 0 && !isEoF())
{
if (src.front == '+')
{
nextCharNonLF();
if (src.front == '/')
{
nextCharNonLF();
--depth;
}
}
else if (src.front == '/')
{
nextCharNonLF();
if (src.front == '+')
{
nextCharNonLF();
++depth;
}
}
else
nextChar();
}
break;
default:
assert(false);
}
static if (keep)
setTokenValue();
}
void lexHexString()
in
{
assert (src.front == '"');
}
body
{
current.type = TokenType.stringLiteral;
nextChar();
while (true)
{
if (isEoF())
{
errorMessage("Unterminated hex string literal");
return;
}
else if (isHexDigit(src.front))
{
nextCharNonLF();
}
else if (isWhite() && (config.tokenStyle & TokenStyle.notEscaped))
{
nextChar();
}
else if (src.front == '"')
{
nextCharNonLF();
break;
}
else
{
errorMessage(format("Invalid character '%s' in hex string literal",
cast(char) src.front));
return;
}
}
bool hasSuffix = lexStringSuffix();
if (config.tokenStyle & TokenStyle.notEscaped)
{
if (config.tokenStyle & TokenStyle.includeQuotes)
setTokenValue();
else
setTokenValue(2, hasSuffix ? -2 : -1);
}
else
{
// TODO: appender allocates too eagerly;
// replace it with something leaner later
auto a = appender!(char[])();
foreach (b; std.range.chunks(src.slice[2 .. $ - 1], 2))
{
auto s = cast(char[])b;
ubyte ch = cast(ubyte)parse!uint(s, 16);
a.put(ch);
}
// can safely assume ownership of data
current.value = cast(string)a.data;
}
}
void lexNumber()
in
{
assert(isDigit(src.front) || src.front == '.');
}
body
{
if (src.front != '0')
{
lexDecimal();
return;
}
else
{
switch (src.peek())
{
case 'x':
case 'X':
nextCharNonLF();
nextCharNonLF();
lexHex();
break;
case 'b':
case 'B':
nextCharNonLF();
nextCharNonLF();
lexBinary();
break;
default:
lexDecimal();
break;
}
}
}
void lexFloatSuffix()
{
switch (src.front)
{
case 'L':
nextCharNonLF();
current.type = TokenType.doubleLiteral;
break;
case 'f':
case 'F':
nextCharNonLF();
current.type = TokenType.floatLiteral;
break;
default:
break;
}
if (!isEoF() && src.front == 'i')
{
nextCharNonLF();
if (current.type == TokenType.floatLiteral)
current.type = TokenType.ifloatLiteral;
else
current.type = TokenType.idoubleLiteral;
}
}
void lexIntSuffix()
{
bool foundU;
bool foundL;
while (!isEoF())
{
switch (src.front)
{
case 'u':
case 'U':
if (foundU)
return;
switch (current.type)
{
case TokenType.intLiteral:
current.type = TokenType.uintLiteral;
nextCharNonLF();
break;
case TokenType.longLiteral:
current.type = TokenType.ulongLiteral;
nextCharNonLF();
break;
default:
assert (false);
}
foundU = true;
break;
case 'L':
if (foundL)
return;
switch (current.type)
{
case TokenType.intLiteral:
current.type = TokenType.longLiteral;
nextCharNonLF();
break;
case TokenType.uintLiteral:
current.type = TokenType.ulongLiteral;
nextCharNonLF();
break;
default:
assert (false);
}
foundL = true;
break;
default:
return;
}
}
}
void lexExponent()
in
{
assert (src.front == 'e' || src.front == 'E' || src.front == 'p'
|| src.front == 'P');
}
body
{
nextCharNonLF();
bool foundSign = false;
bool foundDigit = false;
while (!isEoF())
{
switch (src.front)
{
case '-':
case '+':
if (foundSign)
{
if (!foundDigit)
errorMessage("Expected an exponent");
return;
}
foundSign = true;
nextCharNonLF();
break;
case '0': .. case '9':
case '_':
foundDigit = true;
nextCharNonLF();
break;
case 'L':
case 'f':
case 'F':
case 'i':
lexFloatSuffix();
return;
default:
if (!foundDigit)
errorMessage("Expected an exponent");
return;
}
}
}
void lexDecimal()
in
{
assert (isDigit(src.front) || src.front == '.');
}
body
{
bool foundDot = src.front == '.';
if (foundDot)
nextCharNonLF();
current.type = TokenType.intLiteral;
decimalLoop: while (!isEoF())
{
switch (src.front)
{
case '0': .. case '9':
case '_':
nextCharNonLF();
break;
case 'u':
case 'U':
if (!foundDot)
lexIntSuffix();
break decimalLoop;
case 'i':
lexFloatSuffix();
break decimalLoop;
case 'L':
if (foundDot)
lexFloatSuffix();
else
lexIntSuffix();
break decimalLoop;
case 'f':
case 'F':
lexFloatSuffix();
break decimalLoop;
case 'e':
case 'E':
lexExponent();
break decimalLoop;
case '.':
if (foundDot)
break decimalLoop;
if (src.canPeek() && src.peek() == '.')
break decimalLoop;
else
{
// The following bit of silliness tries to tell the
// difference between "int dot identifier" and
// "double identifier".
if (src.canPeek())
{
switch (src.peek())
{
case 'u': case 'U': case 'i': case 'L': case 'f': case 'F':
case 'e': case 'E':
break decimalLoop;
default:
goto doubleLiteral;
}
}
else
{
doubleLiteral:
nextCharNonLF();
foundDot = true;
current.type = TokenType.doubleLiteral;
}
}
break;
default:
break decimalLoop;
}
}
setTokenValue();
}
void lexBinary()
{
current.type = TokenType.intLiteral;
binaryLoop: while (!isEoF())
{
switch (src.front)
{
case '0':
case '1':
case '_':
nextCharNonLF();
break;
case 'u':
case 'U':
case 'L':
lexIntSuffix();
break binaryLoop;
default:
break binaryLoop;
}
}
setTokenValue();
}
void lexHex()
{
current.type = TokenType.intLiteral;
bool foundDot;
hexLoop: while (!isEoF())
{
switch (src.front)
{
case 'a': .. case 'f':
case 'A': .. case 'F':
case '0': .. case '9':
case '_':
nextCharNonLF();
break;
case 'u':
case 'U':
lexIntSuffix();
break hexLoop;
case 'i':
if (foundDot)
lexFloatSuffix();
break hexLoop;
case 'L':
if (foundDot)
{
lexFloatSuffix();
break hexLoop;
}
else
{
lexIntSuffix();
break hexLoop;
}
case 'p':
case 'P':
lexExponent();
break hexLoop;
case '.':
if (foundDot)
break hexLoop;
if (src.canPeek() && src.peek() == '.')
break hexLoop;
nextCharNonLF();
foundDot = true;
current.type = TokenType.doubleLiteral;
break;
default:
break hexLoop;
}
}
setTokenValue();
}
bool lexStringSuffix()
{
current.type = TokenType.stringLiteral;
bool foundSuffix = false;
if (!isEoF())
{
switch (src.front)
{
case 'w':
current.type = TokenType.wstringLiteral;
goto case 'c';
case 'd':
current.type = TokenType.dstringLiteral;
goto case 'c';
case 'c':
foundSuffix = true;
nextCharNonLF();
break;
default:
break;
}
}
return foundSuffix;
}
void lexCharacterLiteral()
in
{
assert (src.front == '\'');
}
body
{
current.type = TokenType.characterLiteral;
nextChar();
if (isEoF())
{
errorMessage("Unterminated character literal");
return;
}
switch (src.front)
{
case '\'':
break;
case '\\':
if (config.tokenStyle & TokenStyle.notEscaped)
skipEscapeSequence();
else
{
// the only special path
// 40 bytes is enough for 2 quotes
// and the longest character entity
ubyte[40] utf8;
size_t len;
if (config.tokenStyle & TokenStyle.includeQuotes)
{
utf8[0] = '\'';
len = decodeEscapeSequence(utf8[1..$]);
utf8[len++] = '\'';
}
else
len = decodeEscapeSequence(utf8[]);
if (src.front != '\'')
{
errorMessage("Expected \"'\" to end character literal");
}
// skip over last "'"
nextChar();
setTokenValue(utf8[0..len]);
return;
}
break;
default:
if (src.front & 0x80)
{
while (src.front & 0x80)
nextChar();
break;
}
else
{
nextChar();
break;
}
}
if (src.front != '\'')
errorMessage("Expected \"'\" to end character literal");
nextChar();
if (config.tokenStyle & TokenStyle.includeQuotes)
setTokenValue();
else
setTokenValue(1, -1);
}
void lexString()
in
{
// no assert here: src.front may be '"' (plain or after r) or '`'
}
body
{
current.type = TokenType.stringLiteral;
bool longWysiwyg = src.slice.length > 0 && src.slice[0] == 'r'; // slice already holds the 2 chars: r"
bool isWysiwyg = src.front == '`';
// appender is used only if the string needs unescaping
Appender!(ubyte[]) unescaped;
auto quote = src.front;
nextChar();
while (true)
{
if (isEoF())
{
errorMessage("Unterminated string literal");
return;
}
else if (src.front == '\\')
{
if (isWysiwyg || longWysiwyg)
nextChar();
else if(config.tokenStyle & TokenStyle.notEscaped)
{
skipEscapeSequence();
}
else
{
if(unescaped == Appender!(ubyte[]).init)
unescaped = appender!(ubyte[])();
unescaped.put(src.slice());
decodeEscapeSequence(unescaped);
src.mark(); //start next slice after escape sequence
}
}
else if (src.front == quote)
{
nextCharNonLF();
break;
}
else
nextChar();
}
lexStringSuffix();
// helper to handle quotes
void setData(R)(R range)
{
if (config.tokenStyle & TokenStyle.includeQuotes)
setTokenValue(range);
else if (longWysiwyg)
setTokenValue(range[2..$-1]);
else
setTokenValue(range[1..$-1]);
}
if(unescaped != Appender!(ubyte[]).init)
{
// append the final slice and use the buffered data
unescaped.put(src.slice);
setData(unescaped.data);
}
else
{
setData(src.slice); //slice directly
}
}
void lexDelimitedString()
in
{
assert(src.front == '"');
}
body
{
current.type = TokenType.stringLiteral;
nextChar();
bool heredoc;
ubyte open;
ubyte close;
switch (src.front)
{
case '[': open = '['; close = ']'; break;
case '{': open = '{'; close = '}'; break;
case '(': open = '('; close = ')'; break;
case '<': open = '<'; close = '>'; break;
default: heredoc = true; break;
}
if (heredoc)
lexHeredocString();
else
lexNormalDelimitedString(open, close);
}
void lexNormalDelimitedString(ubyte open, ubyte close)
in
{
assert(src.slice[0 .. 2] == `q"`);
}
body
{
current.type = TokenType.stringLiteral;
int depth = 1;
nextChar();
while (true)
{
if (isEoF())
{
errorMessage("Unterminated string literal");
break;
}
if (src.front == open)
{
nextChar();
++depth;
}
else if (src.front == close)
{
nextChar();
--depth;
if (depth <= 0)
{
auto r = src.save(); //TODO: allocates for Fwd range
if (r.front == '"')
{
nextChar();
break;
}
else
{
errorMessage("Expected \" after balanced "
~ cast(char) close ~ " but found "
~ cast(char) r.front ~ " instead.");
break;
}
}
}
else
nextChar();
}
if (config.tokenStyle & TokenStyle.includeQuotes)
setTokenValue();
else
setTokenValue(3, -2);
}
void lexHeredocString()
in
{
assert (src.slice.equal("q\""));
}
body
{
typeof(src.slice) ident;
uint newlineBytes;
while (true)
{
if (isEoF())
{
errorMessage("Unterminated string literal");
return;
}
else if (isNewline(src.front))
{
ident = src.slice[2..$];
nextChar();
newlineBytes = cast(uint) (src.slice.length - 2 - ident.length);
break;
}
else if (isSeparating())
{
nextChar();
ident = src.slice[2..$];
nextChar();
newlineBytes = 0;
break;
}
else
{
nextChar();
}
}
while (true)
{
if (isEoF())
{
errorMessage("Unterminated string literal");
break;
}
else if (src.slice.length > ident.length
&& src.slice[$-ident.length .. $].equal(ident))
{
if (src.front == '"')
{
nextChar();
// the suffix is lexed once, after this loop, so hasSuffix stays accurate
break;
}
else
{
errorMessage("Unterminated string literal: " ~ cast(string) src.slice);
break;
}
}
else
nextChar();
}
bool hasSuffix = lexStringSuffix();
if (config.tokenStyle & TokenStyle.includeQuotes)
setTokenValue();
else
{
setTokenValue(cast(int) (2 + newlineBytes + ident.length),
cast(int) (-(ident.length + (hasSuffix ? 2 : 1))));
}
}
void lexTokenString()
in
{
assert (src.front == '{');
}
body
{
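// Token strings (q{...}) are lexed by re-running this lexer over the braced
// contents with IterationStyle.everything and TokenStyle.source, then
// concatenating the raw values of the tokens found.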
current.type = TokenType.stringLiteral;
nextChar();
auto app = appender!(ubyte[])();
if (config.tokenStyle & TokenStyle.includeQuotes)
{
app.put('q');
app.put('{');
}
LexerConfig c = config;
scope (exit) config = c;
config.iterStyle = IterationStyle.everything;
config.tokenStyle = TokenStyle.source;
int depth = 1;
while (!isEoF())
{
advance();
if (current.type == TokenType.lBrace)
++depth;
else if (current.type == TokenType.rBrace)
{
--depth;
if (depth <= 0)
break;
}
app.put(representation(current.value));
}
config = c;
if (config.tokenStyle & TokenStyle.includeQuotes)
{
app.put('}');
}
if (src.empty)
current.type = TokenType.stringLiteral;
else
{
switch (src.front)
{
case 'd':
if (config.tokenStyle & TokenStyle.includeQuotes)
app.put('d');
current.type = TokenType.dstringLiteral;
src.popFront();
break;
case 'w':
if (config.tokenStyle & TokenStyle.includeQuotes)
app.put('w');
current.type = TokenType.wstringLiteral;
src.popFront();
break;
case 'c':
if (config.tokenStyle & TokenStyle.includeQuotes)
app.put('c');
src.popFront();
goto default;
default:
current.type = TokenType.stringLiteral;
break;
}
}
current.value = cast(string) app.data;
}
void lexSpecialTokenSequence()
in
{
assert (src.front == '#');
}
body
{
nextChar();
auto r = src.save();
auto app = appender!(ubyte[])();
app.put('#');
while (true)
{
if (r.isRangeEoF())
{
errorMessage("Found EOF when interpreting special token sequence");
return;
}
else if (isNewline(r.front))
break;
else
{
app.put(r.front);
r.popFront();
}
}
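// Parse the collected "#line" special token sequence with a regex to pull
// out the line number and the optional file name.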
auto m = match((cast(char[]) app.data),
`#line\s+(?P