whitespace

This commit is contained in:
Hackerpilot 2013-03-10 00:41:54 -08:00
parent c1fcef1873
commit dc81410008
1 changed files with 163 additions and 163 deletions

View File

@ -4,7 +4,7 @@
* This module contains a range-based _lexer for the D programming language. * This module contains a range-based _lexer for the D programming language.
* *
* For performance reasons the _lexer contained in this module operates only on * For performance reasons the _lexer contained in this module operates only on
* ASCII and UTF-8 encoded source code. If the use of other encodings is * ASCII or UTF-8 encoded source code. If the use of other encodings is
* desired, the source code must be converted to UTF-8 before passing it to this * desired, the source code must be converted to UTF-8 before passing it to this
* _lexer. * _lexer.
* *
@ -125,60 +125,60 @@ version (unittest) import std.stdio;
public: public:
/** /**
* Represents a D token * Represents a D token
*/ */
struct Token struct Token
{ {
/** /**
* The token type. * The token type.
*/ */
TokenType type; TokenType type;
/** /**
* The representation of the token in the original source code. * The representation of the token in the original source code.
*/ */
string value; string value;
/** /**
* The number of the line the token is on. * The number of the line the token is on.
*/ */
uint line; uint line;
/** /**
* The column number of the start of the token in the original source. * The column number of the start of the token in the original source.
* $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN) * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
*/ */
uint column; uint column;
/** /**
* The index of the start of the token in the original source. * The index of the start of the token in the original source.
* $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN) * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
*/ */
size_t startIndex; size_t startIndex;
/** /**
* Check to see if the token is of the same type and has the same string * Check to see if the token is of the same type and has the same string
* representation as the given token. * representation as the given token.
*/ */
bool opEquals(ref const(Token) other) const bool opEquals(ref const(Token) other) const
{ {
return other.type == type && other.value == value; return other.type == type && other.value == value;
} }
/** /**
* Checks to see if the token's string representation is equal to the given * Checks to see if the token's string representation is equal to the given
* string. * string.
*/ */
bool opEquals(string value) const { return this.value == value; } bool opEquals(string value) const { return this.value == value; }
/** /**
* Checks to see if the token is of the given type. * Checks to see if the token is of the given type.
*/ */
bool opEquals(TokenType type) const { return type == type; } bool opEquals(TokenType type) const { return type == type; }
/** /**
* Comparison operator orders tokens by start index. * Comparison operator orders tokens by start index.
*/ */
int opCmp(ref const(Token) other) const int opCmp(ref const(Token) other) const
{ {
if (startIndex < other.startIndex) return -1; if (startIndex < other.startIndex) return -1;
@ -188,9 +188,9 @@ struct Token
} }
/** /**
* Configure the behavior of the byToken() function. These flags may be * Configure the behavior of the byToken() function. These flags may be
* combined using a bitwise or. * combined using a bitwise or.
*/ */
enum IterationStyle enum IterationStyle
{ {
/// Only include code, not whitespace or comments /// Only include code, not whitespace or comments
@ -208,98 +208,98 @@ enum IterationStyle
} }
/** /**
* Configuration of the token lexing style. These flags may be combined with a * Configuration of the token lexing style. These flags may be combined with a
* bitwise or. * bitwise or.
*/ */
enum TokenStyle : uint enum TokenStyle : uint
{ {
/** /**
* Escape sequences will be replaced with their equivalent characters, * Escape sequences will be replaced with their equivalent characters,
* enclosing quote characters will not be included. Special tokens such as * enclosing quote characters will not be included. Special tokens such as
* __VENDOR__ will be replaced with their equivalent strings. Useful for * __VENDOR__ will be replaced with their equivalent strings. Useful for
* creating a compiler or interpreter. * creating a compiler or interpreter.
*/ */
default_ = 0b0000, default_ = 0b0000,
/** /**
* Escape sequences will not be processed. An escaped quote character will * Escape sequences will not be processed. An escaped quote character will
* not terminate string lexing, but it will not be replaced with the quote * not terminate string lexing, but it will not be replaced with the quote
* character in the token. * character in the token.
*/ */
notEscaped = 0b0001, notEscaped = 0b0001,
/** /**
* Strings will include their opening and closing quote characters as well * Strings will include their opening and closing quote characters as well
* as any prefixes or suffixes $(LPAREN)e.g.: $(D_STRING "abcde"w) will * as any prefixes or suffixes $(LPAREN)e.g.: $(D_STRING "abcde"w) will
* include the $(D_STRING 'w') character as well as the opening and closing * include the $(D_STRING 'w') character as well as the opening and closing
* quotes$(RPAREN) * quotes$(RPAREN)
*/ */
includeQuotes = 0b0010, includeQuotes = 0b0010,
/** /**
* Do not replace the value field of the special tokens such as ___DATE__ * Do not replace the value field of the special tokens such as ___DATE__
* with their string equivalents. * with their string equivalents.
*/ */
doNotReplaceSpecial = 0b0100, doNotReplaceSpecial = 0b0100,
/** /**
* Strings will be read exactly as they appeared in the source, including * Strings will be read exactly as they appeared in the source, including
* their opening and closing quote characters. Useful for syntax * their opening and closing quote characters. Useful for syntax
* highlighting. * highlighting.
*/ */
source = notEscaped | includeQuotes | doNotReplaceSpecial source = notEscaped | includeQuotes | doNotReplaceSpecial
} }
/** /**
* Lexer configuration * Lexer configuration
*/ */
struct LexerConfig struct LexerConfig
{ {
/** /**
* Iteration style * Iteration style
*/ */
IterationStyle iterStyle = IterationStyle.codeOnly; IterationStyle iterStyle = IterationStyle.codeOnly;
/** /**
* Token style * Token style
*/ */
TokenStyle tokenStyle = tokenStyle.default_; TokenStyle tokenStyle = tokenStyle.default_;
/** /**
* Replacement for the ___VERSION__ token. Defaults to 100. * Replacement for the ___VERSION__ token. Defaults to 100.
*/ */
uint versionNumber = 100; uint versionNumber = 100;
/** /**
* Replacement for the ___VENDOR__ token. Defaults to $(D_STRING "std.d.lexer") * Replacement for the ___VENDOR__ token. Defaults to $(D_STRING "std.d.lexer")
*/ */
string vendorString = "std.d.lexer"; string vendorString = "std.d.lexer";
/** /**
* Name used when creating error messages that are sent to errorFunc. This * Name used when creating error messages that are sent to errorFunc. This
* is needed because the lexer operates on any forward range of ASCII * is needed because the lexer operates on any forward range of ASCII
* characters or UTF-8 code units and does not know what to call its input * characters or UTF-8 code units and does not know what to call its input
* source. Defaults to the empty string. * source. Defaults to the empty string.
*/ */
string fileName = ""; string fileName = "";
/** /**
* This function is called when an error is encountered during lexing. * This function is called when an error is encountered during lexing.
* Parameters are file name, code unit index, line number, column, * Parameters are file name, code unit index, line number, column,
* and error message. * and error message.
*/ */
void delegate(string, size_t, uint, uint, string) errorFunc; void delegate(string, size_t, uint, uint, string) errorFunc;
} }
/** /**
* Iterate over the given range of characters by D tokens. * Iterate over the given range of characters by D tokens.
* Params: * Params:
* range = the range of characters * range = the range of characters
* config = the lexer configuration * config = the lexer configuration
* bufferSize = initial size of internal circular buffer * bufferSize = initial size of internal circular buffer
* Returns: * Returns:
* an input range of tokens * an input range of tokens
*/ */
auto byToken(R)(R range, LexerConfig config, size_t bufferSize = 4*1024) auto byToken(R)(R range, LexerConfig config, size_t bufferSize = 4*1024)
if (isForwardRange!(R) && !isRandomAccessRange!(R) if (isForwardRange!(R) && !isRandomAccessRange!(R)
&& is(ElementType!R : const(ubyte))) && is(ElementType!R : const(ubyte)))
@ -326,22 +326,22 @@ auto byToken(R)(R range, LexerConfig config)
} }
/** /**
* Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate. * Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate.
*/ */
struct TokenRange(LexSrc) struct TokenRange(LexSrc)
//if ( is(LexSrc : LexSource!(U...), U...)) //check for LexSource //if ( is(LexSrc : LexSource!(U...), U...)) //check for LexSource
{ {
/** /**
* Returns: true if the range is empty * Returns: true if the range is empty
*/ */
bool empty() const @property bool empty() const @property
{ {
return _empty; return _empty;
} }
/** /**
* Returns: the current token * Returns: the current token
*/ */
ref const(Token) front() const @property ref const(Token) front() const @property
{ {
assert(!empty, "trying to get front of an empty token range"); assert(!empty, "trying to get front of an empty token range");
@ -349,8 +349,8 @@ struct TokenRange(LexSrc)
} }
/** /**
* Returns the current token and then removes it from the range * Returns the current token and then removes it from the range
*/ */
Token moveFront() Token moveFront()
{ {
auto r = move(current); auto r = move(current);
@ -359,8 +359,8 @@ struct TokenRange(LexSrc)
} }
/** /**
* Foreach operation * Foreach operation
*/ */
int opApply(int delegate(Token) dg) int opApply(int delegate(Token) dg)
{ {
int result = 0; int result = 0;
@ -375,8 +375,8 @@ struct TokenRange(LexSrc)
} }
/** /**
* Foreach operation * Foreach operation
*/ */
int opApply(int delegate(size_t, Token) dg) int opApply(int delegate(size_t, Token) dg)
{ {
int result = 0; int result = 0;
@ -392,8 +392,8 @@ struct TokenRange(LexSrc)
} }
/** /**
* Removes the current token from the range * Removes the current token from the range
*/ */
void popFront() void popFront()
{ {
advance(); advance();
@ -402,8 +402,8 @@ struct TokenRange(LexSrc)
private: private:
/* /*
* Advances the range to the next token * Advances the range to the next token
*/ */
void advance() void advance()
{ {
L_advance: L_advance:
@ -431,15 +431,15 @@ L_advance:
"=", "TokenType.assign", "=", "TokenType.assign",
"@", "TokenType.at", "@", "TokenType.at",
"&", "TokenType.bitAnd", "&", "TokenType.bitAnd",
"&=", "TokenType.bitAndEquals", "&=", "TokenType.bitAndEqual",
"|", "TokenType.bitOr", "|", "TokenType.bitOr",
"|=", "TokenType.bitOrEquals", "|=", "TokenType.bitOrEqual",
"~=", "TokenType.catEquals", "~=", "TokenType.catEqual",
":", "TokenType.colon", ":", "TokenType.colon",
",", "TokenType.comma", ",", "TokenType.comma",
"--", "TokenType.decrement", "--", "TokenType.decrement",
"$", "TokenType.dollar", "$", "TokenType.dollar",
"==", "TokenType.equals", "==", "TokenType.equal",
"=>", "TokenType.goesTo", "=>", "TokenType.goesTo",
">", "TokenType.greater", ">", "TokenType.greater",
">=", "TokenType.greaterEqual", ">=", "TokenType.greaterEqual",
@ -454,21 +454,21 @@ L_advance:
"||", "TokenType.logicOr", "||", "TokenType.logicOr",
"(", "TokenType.lParen", "(", "TokenType.lParen",
"-", "TokenType.minus", "-", "TokenType.minus",
"-=", "TokenType.minusEquals", "-=", "TokenType.minusEqual",
"%", "TokenType.mod", "%", "TokenType.mod",
"%=", "TokenType.modEquals", "%=", "TokenType.modEqual",
"*=", "TokenType.mulEquals", "*=", "TokenType.mulEqual",
"!", "TokenType.not", "!", "TokenType.not",
"!=", "TokenType.notEquals", "!=", "TokenType.notEqual",
"!>", "TokenType.notGreater", "!>", "TokenType.notGreater",
"!>=", "TokenType.notGreaterEqual", "!>=", "TokenType.notGreaterEqual",
"!<", "TokenType.notLess", "!<", "TokenType.notLess",
"!<=", "TokenType.notLessEqual", "!<=", "TokenType.notLessEqual",
"!<>", "TokenType.notLessEqualGreater", "!<>", "TokenType.notLessEqualGreater",
"+", "TokenType.plus", "+", "TokenType.plus",
"+=", "TokenType.plusEquals", "+=", "TokenType.plusEqual",
"^^", "TokenType.pow", "^^", "TokenType.pow",
"^^=", "TokenType.powEquals", "^^=", "TokenType.powEqual",
"}", "TokenType.rBrace", "}", "TokenType.rBrace",
"]", "TokenType.rBracket", "]", "TokenType.rBracket",
")", "TokenType.rParen", ")", "TokenType.rParen",
@ -484,7 +484,7 @@ L_advance:
">>>", "TokenType.unsignedShiftRight", ">>>", "TokenType.unsignedShiftRight",
">>>=", "TokenType.unsignedShiftRightEqual", ">>>=", "TokenType.unsignedShiftRightEqual",
"^", "TokenType.xor", "^", "TokenType.xor",
"^=", "TokenType.xorEquals", "^=", "TokenType.xorEqual",
)); ));
case '/': case '/':
nextCharNonLF(); nextCharNonLF();
@ -505,7 +505,7 @@ L_advance:
goto L_advance; // tail-recursion goto L_advance; // tail-recursion
case '=': case '=':
current.type = TokenType.divEquals; current.type = TokenType.divEqual;
current.value = "/="; current.value = "/=";
src.popFront(); src.popFront();
return; return;
@ -1201,7 +1201,7 @@ L_advance:
else if (src.front == quote) else if (src.front == quote)
{ {
nextCharNonLF(); nextCharNonLF();
break; break;
} }
else else
nextChar(); nextChar();
@ -1877,186 +1877,186 @@ L_advance:
} }
/** /**
* Returns: true if the token is an operator * Returns: true if the token is an operator
*/ */
pure nothrow bool isOperator(const TokenType t) pure nothrow bool isOperator(const TokenType t)
{ {
return t >= TokenType.assign && t <= TokenType.xorEquals; return t >= TokenType.assign && t <= TokenType.xorEqual;
} }
/** /**
* ditto * ditto
*/ */
pure nothrow bool isOperator(ref const Token t) pure nothrow bool isOperator(ref const Token t)
{ {
return isOperator(t.type); return isOperator(t.type);
} }
/** /**
* Returns: true if the token is a keyword * Returns: true if the token is a keyword
*/ */
pure nothrow bool isKeyword(const TokenType t) pure nothrow bool isKeyword(const TokenType t)
{ {
return t >= TokenType.bool_ && t <= TokenType.with_; return t >= TokenType.bool_ && t <= TokenType.with_;
} }
/** /**
* ditto * ditto
*/ */
pure nothrow bool isKeyword(ref const Token t) pure nothrow bool isKeyword(ref const Token t)
{ {
return isKeyword(t.type); return isKeyword(t.type);
} }
/** /**
* Returns: true if the token is a built-in type * Returns: true if the token is a built-in type
*/ */
pure nothrow bool isType(const TokenType t) pure nothrow bool isType(const TokenType t)
{ {
return t >= TokenType.bool_ && t <= TokenType.wchar_; return t >= TokenType.bool_ && t <= TokenType.wchar_;
} }
/** /**
* ditto * ditto
*/ */
pure nothrow bool isType(ref const Token t) pure nothrow bool isType(ref const Token t)
{ {
return isType(t.type); return isType(t.type);
} }
/** /**
* Returns: true if the token is an attribute * Returns: true if the token is an attribute
*/ */
pure nothrow bool isAttribute(const TokenType t) pure nothrow bool isAttribute(const TokenType t)
{ {
return t >= TokenType.align_ && t <= TokenType.static_; return t >= TokenType.align_ && t <= TokenType.static_;
} }
/** /**
* ditto * ditto
*/ */
pure nothrow bool isAttribute(ref const Token t) pure nothrow bool isAttribute(ref const Token t)
{ {
return isAttribute(t.type); return isAttribute(t.type);
} }
/** /**
* Returns: true if the token is a protection attribute * Returns: true if the token is a protection attribute
*/ */
pure nothrow bool isProtection(const TokenType t) pure nothrow bool isProtection(const TokenType t)
{ {
return t >= TokenType.export_ && t <= TokenType.public_; return t >= TokenType.export_ && t <= TokenType.public_;
} }
/** /**
* ditto * ditto
*/ */
pure nothrow bool isProtection(ref const Token t) pure nothrow bool isProtection(ref const Token t)
{ {
return isProtection(t.type); return isProtection(t.type);
} }
/** /**
* Returns: true if the token is a compile-time constant such as ___DATE__ * Returns: true if the token is a compile-time constant such as ___DATE__
*/ */
pure nothrow bool isConstant(const TokenType t) pure nothrow bool isConstant(const TokenType t)
{ {
return t >= TokenType.date && t <= TokenType.traits; return t >= TokenType.date && t <= TokenType.traits;
} }
/** /**
* ditto * ditto
*/ */
pure nothrow bool isConstant(ref const Token t) pure nothrow bool isConstant(ref const Token t)
{ {
return isConstant(t.type); return isConstant(t.type);
} }
/** /**
* Returns: true if the token is a string or number literal * Returns: true if the token is a string or number literal
*/ */
pure nothrow bool isLiteral(const TokenType t) pure nothrow bool isLiteral(const TokenType t)
{ {
return t >= TokenType.doubleLiteral && t <= TokenType.wstringLiteral; return t >= TokenType.doubleLiteral && t <= TokenType.wstringLiteral;
} }
/** /**
* ditto * ditto
*/ */
pure nothrow bool isLiteral(ref const Token t) pure nothrow bool isLiteral(ref const Token t)
{ {
return isLiteral(t.type); return isLiteral(t.type);
} }
/** /**
* Returns: true if the token is a number literal * Returns: true if the token is a number literal
*/ */
pure nothrow bool isNumberLiteral(const TokenType t) pure nothrow bool isNumberLiteral(const TokenType t)
{ {
return t >= TokenType.doubleLiteral && t <= TokenType.ulongLiteral; return t >= TokenType.doubleLiteral && t <= TokenType.ulongLiteral;
} }
/** /**
* ditto * ditto
*/ */
pure nothrow bool isNumberLiteral(ref const Token t) pure nothrow bool isNumberLiteral(ref const Token t)
{ {
return isNumberLiteral(t.type); return isNumberLiteral(t.type);
} }
/** /**
* Returns: true if the token is a string literal * Returns: true if the token is a string literal
*/ */
pure nothrow bool isStringLiteral(const TokenType t) pure nothrow bool isStringLiteral(const TokenType t)
{ {
return t >= TokenType.dstringLiteral && t <= TokenType.wstringLiteral; return t >= TokenType.dstringLiteral && t <= TokenType.wstringLiteral;
} }
/** /**
* ditto * ditto
*/ */
pure nothrow bool isStringLiteral(ref const Token t) pure nothrow bool isStringLiteral(ref const Token t)
{ {
return isStringLiteral(t.type); return isStringLiteral(t.type);
} }
/** /**
* Returns: true if the token is whitespace, a comment, a special token * Returns: true if the token is whitespace, a comment, a special token
* sequence, or an identifier * sequence, or an identifier
*/ */
pure nothrow bool isMisc(const TokenType t) pure nothrow bool isMisc(const TokenType t)
{ {
return t >= TokenType.comment && t <= TokenType.specialTokenSequence; return t >= TokenType.comment && t <= TokenType.specialTokenSequence;
} }
/** /**
* ditto * ditto
*/ */
pure nothrow bool isMisc(ref const Token t) pure nothrow bool isMisc(ref const Token t)
{ {
return isMisc(t.type); return isMisc(t.type);
} }
/** /**
* Listing of all the tokens in the D language. * Listing of all the tokens in the D language.
*/ */
enum TokenType: ushort enum TokenType: ushort
{ {
assign, /// = assign, /// =
at, /// @ at, /// @
bitAnd, /// & bitAnd, /// &
bitAndEquals, /// &= bitAndEqual, /// &=
bitOr, /// | bitOr, /// |
bitOrEquals, /// |= bitOrEqual, /// |=
catEquals, /// ~= catEqual, /// ~=
colon, /// : colon, /// :
comma, /// , comma, /// ,
decrement, /// -- decrement, /// --
div, /// / div, /// /
divEquals, /// /= divEqual, /// /=
dollar, /// $ dollar, /// $
dot, /// . dot, /// .
equals, /// == equal, /// ==
goesTo, /// => goesTo, /// =>
greater, /// > greater, /// >
greaterEqual, /// >= greaterEqual, /// >=
@ -2072,21 +2072,21 @@ enum TokenType: ushort
logicOr, /// || logicOr, /// ||
lParen, /// $(LPAREN) lParen, /// $(LPAREN)
minus, /// - minus, /// -
minusEquals, /// -= minusEqual, /// -=
mod, /// % mod, /// %
modEquals, /// %= modEqual, /// %=
mulEquals, /// *= mulEqual, /// *=
not, /// ! not, /// !
notEquals, /// != notEqual, /// !=
notGreater, /// !> notGreater, /// !>
notGreaterEqual, /// !>= notGreaterEqual, /// !>=
notLess, /// !< notLess, /// !<
notLessEqual, /// !<= notLessEqual, /// !<=
notLessEqualGreater, /// !<> notLessEqualGreater, /// !<>
plus, /// + plus, /// +
plusEquals, /// += plusEqual, /// +=
pow, /// ^^ pow, /// ^^
powEquals, /// ^^= powEqual, /// ^^=
rBrace, /// } rBrace, /// }
rBracket, /// ] rBracket, /// ]
rParen, /// $(RPAREN) rParen, /// $(RPAREN)
@ -2104,7 +2104,7 @@ enum TokenType: ushort
unsignedShiftRightEqual, /// >>>= unsignedShiftRightEqual, /// >>>=
vararg, /// ... vararg, /// ...
xor, /// ^ xor, /// ^
xorEquals, /// ^= xorEqual, /// ^=
bool_, /// $(D_KEYWORD bool) bool_, /// $(D_KEYWORD bool)
byte_, /// $(D_KEYWORD byte) byte_, /// $(D_KEYWORD byte)