String cache improvements

This commit is contained in:
Hackerpilot 2014-01-16 18:46:18 -08:00
parent a3f9be1e12
commit 281b46eea2
4 changed files with 176 additions and 59 deletions

View File

@ -20,7 +20,7 @@ void printCtags(File output, string[] fileNames)
{ {
string[] tags; string[] tags;
LexerConfig config; LexerConfig config;
StringCache cache; StringCache* cache = new StringCache;
foreach (fileName; fileNames) foreach (fileName; fileNames)
{ {
File f = File(fileName); File f = File(fileName);

9
main.d
View File

@ -93,7 +93,7 @@ int main(string[] args)
return 1; return 1;
} }
StringCache cache; StringCache* cache = new StringCache;
if (tokenDump || highlight) if (tokenDump || highlight)
{ {
@ -111,10 +111,8 @@ int main(string[] args)
} }
else if (tokenDump) else if (tokenDump)
{ {
while (!tokens.empty) foreach (token; tokens)
{ {
auto token = tokens.front();
tokens.popFront();
writeln("«", token.text is null ? str(token.type) : token.text, writeln("«", token.text is null ? str(token.type) : token.text,
"» ", token.index, " ", token.line, " ", token.column, " ", "» ", token.index, " ", token.line, " ", token.column, " ",
token.comment); token.comment);
@ -152,11 +150,14 @@ int main(string[] args)
ulong count; ulong count;
foreach (f; expandArgs(args, recursive)) foreach (f; expandArgs(args, recursive))
{ {
import core.memory;
GC.disable();
auto tokens = byToken!(ubyte[])(readFile(f)); auto tokens = byToken!(ubyte[])(readFile(f));
if (tokenCount) if (tokenCount)
count += printTokenCount(stdout, f, tokens); count += printTokenCount(stdout, f, tokens);
else else
count += printLineCount(stdout, f, tokens); count += printLineCount(stdout, f, tokens);
GC.enable();
} }
writefln("total:\t%d", count); writefln("total:\t%d", count);
} }

View File

@ -50,13 +50,13 @@ private enum dynamicTokens = [
"dstringLiteral", "stringLiteral", "wstringLiteral", "scriptLine" "dstringLiteral", "stringLiteral", "wstringLiteral", "scriptLine"
]; ];
public alias TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens) IdType; public alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens);
public alias tokenStringRepresentation!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens) str; public alias str = tokenStringRepresentation!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens);
public template tok(string token) public template tok(string token)
{ {
alias TokenId!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens, token) tok; alias tok = TokenId!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens, token);
} }
enum extraFields = q{ private enum extraFields = q{
string comment; string comment;
int opCmp(size_t i) const pure nothrow @safe { int opCmp(size_t i) const pure nothrow @safe {
@ -65,7 +65,7 @@ enum extraFields = q{
return 0; return 0;
} }
}; };
public alias stdx.lexer.TokenStructure!(IdType, extraFields) Token; public alias Token = stdx.lexer.TokenStructure!(IdType, extraFields);
/** /**
* Configure string lexing behavior * Configure string lexing behavior
@ -115,17 +115,17 @@ public struct LexerConfig
public auto byToken(R)(R range) public auto byToken(R)(R range)
{ {
LexerConfig config; LexerConfig config;
StringCache cache; StringCache* cache = new StringCache;
return byToken(range, config, cache); return byToken(range, config, cache);
} }
public auto byToken(R)(R range, StringCache cache) public auto byToken(R)(R range, StringCache* cache)
{ {
LexerConfig config; LexerConfig config;
return DLexer!(R)(range, config, cache); return DLexer!(R)(range, config, cache);
} }
public auto byToken(R)(R range, const LexerConfig config, StringCache cache) public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
{ {
return DLexer!(R)(range, config, cache); return DLexer!(R)(range, config, cache);
} }
@ -437,12 +437,13 @@ public struct DLexer(R)
mixin Lexer!(R, IdType, Token, lexIdentifier, staticTokens, mixin Lexer!(R, IdType, Token, lexIdentifier, staticTokens,
dynamicTokens, pseudoTokens, pseudoTokenHandlers, possibleDefaultTokens); dynamicTokens, pseudoTokens, pseudoTokenHandlers, possibleDefaultTokens);
private alias typeof(range).Mark Mark; private alias Mark = typeof(range).Mark;
this(R range, const LexerConfig config, StringCache cache) this(R range, const LexerConfig config, StringCache* cache)
{ {
this.range = LexerRange!(typeof(buffer(range)))(buffer(range)); this.range = LexerRange!(typeof(buffer(range)))(buffer(range));
this.config = config; this.config = config;
this.cache = cache;
popFront(); popFront();
} }
@ -1432,8 +1433,8 @@ public struct DLexer(R)
if (c >= '[' && c <= '^') return true; if (c >= '[' && c <= '^') return true;
if (c >= '{' && c <= '~') return true; if (c >= '{' && c <= '~') return true;
if (c == '`') return true; if (c == '`') return true;
// if (c & 0x80 && (range.lookahead(3).startsWith("\u2028") // if (c & 0x80 && (range.lookahead(3) == "\u2028"
// || range.lookahead(3).startsWith("\u2029"))) return true; // || range.lookahead(3) == "\u2029")) return true;
return false; return false;
} }
@ -1452,6 +1453,6 @@ public struct DLexer(R)
} }
StringCache cache; StringCache* cache;
LexerConfig config; LexerConfig config;
} }

View File

@ -20,6 +20,13 @@ import std.math;
import dpick.buffer.buffer; import dpick.buffer.buffer;
import dpick.buffer.traits; import dpick.buffer.traits;
/**
* Template for determining the type used for a token type. Selects the smallest
* unsigned integral type that is able to hold the value
* staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length.
* For example, if there are 20 static tokens, 30 dynamic tokens, and 10
* possible default tokens, this template will alias itself to ubyte, as
* 20 + 30 + 10 < ubyte.max.
*/
template TokenIdType(alias staticTokens, alias dynamicTokens, template TokenIdType(alias staticTokens, alias dynamicTokens,
alias possibleDefaultTokens) alias possibleDefaultTokens)
{ {
@ -33,6 +40,9 @@ template TokenIdType(alias staticTokens, alias dynamicTokens,
static assert (false); static assert (false);
} }
/**
* Looks up the string representation of the given token type.
*/
string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens, alias possibleDefaultTokens)(IdType type) @property string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens, alias possibleDefaultTokens)(IdType type) @property
{ {
if (type == 0) if (type == 0)
@ -47,18 +57,41 @@ string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens
return null; return null;
} }
/**
* Generates the token type identifier for the given symbol. There are two
* special cases:
* $(UL
* $(LI If symbol is "", then the token identifier will be 0)
* $(LI If symbol is "\0", then the token identifier will be the maximum
* valid token type identifier)
* )
* In all cases this template will alias itself to a constant of type IdType.
* Examples:
* ---
* enum string[] staticTokens = ["+", "-", "*", "/"];
* enum string[] dynamicTokens = ["number"];
* enum string[] possibleDefaultTokens = [];
* alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens);
* template tok(string symbol)
* {
* alias tok = TokenId!(IdType, staticTokens, dynamicTokens,
* possibleDefaultTokens, symbol);
* }
* IdType plus = tok!"+";
* ---
*/
template TokenId(IdType, alias staticTokens, alias dynamicTokens, template TokenId(IdType, alias staticTokens, alias dynamicTokens,
alias possibleDefaultTokens, string symbol) alias possibleDefaultTokens, string symbol)
{ {
static if (symbol == "") static if (symbol == "")
{ {
enum id = 0; enum id = 0;
alias id TokenId; alias TokenId = id;
} }
else static if (symbol == "\0") else static if (symbol == "\0")
{ {
enum id = 1 + staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length; enum id = 1 + staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length;
alias id TokenId; alias TokenId = id;
} }
else else
{ {
@ -66,7 +99,7 @@ template TokenId(IdType, alias staticTokens, alias dynamicTokens,
static if (i >= 0) static if (i >= 0)
{ {
enum id = i + 1; enum id = i + 1;
alias id TokenId; alias TokenId = id;
} }
else else
{ {
@ -75,7 +108,7 @@ template TokenId(IdType, alias staticTokens, alias dynamicTokens,
{ {
enum id = ii + staticTokens.length + 1; enum id = ii + staticTokens.length + 1;
static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol); static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol);
alias id TokenId; alias TokenId = id;
} }
else else
{ {
@ -84,24 +117,43 @@ template TokenId(IdType, alias staticTokens, alias dynamicTokens,
? i + staticTokens.length + possibleDefaultTokens.length + dynamicId + 1 ? i + staticTokens.length + possibleDefaultTokens.length + dynamicId + 1
: -1; : -1;
static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol); static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol);
alias id TokenId; alias TokenId = id;
} }
} }
} }
} }
/**
* The token that is returned by the lexer.
* Params:
* IDType = The D type of the "type" token type field.
* extraFields = A string containing D code for any extra fields that should
* be included in the token structure body. This string is passed
* directly to a mixin statement.
*/
struct TokenStructure(IDType, string extraFields = "") struct TokenStructure(IDType, string extraFields = "")
{ {
public:
/**
* == overload for the token type.
*/
bool opEquals(IDType type) const pure nothrow @safe bool opEquals(IDType type) const pure nothrow @safe
{ {
return this.type == type; return this.type == type;
} }
/**
* Constructs a token that has only its type set.
*/
this(IDType type) this(IDType type)
{ {
this.type = type; this.type = type;
} }
/**
* Constructs a token from its type, text, line, column, and index.
*/
this(IDType type, string text, size_t line, size_t column, size_t index) this(IDType type, string text, size_t line, size_t column, size_t index)
{ {
this.text = text; this.text = text;
@ -111,11 +163,31 @@ struct TokenStructure(IDType, string extraFields = "")
this.index = index; this.index = index;
} }
/**
* The text of the token.
*/
string text; string text;
/**
* The line number of the token, as passed to the constructor.
*/
size_t line; size_t line;
/**
* The column number of the token, as passed to the constructor.
*/
size_t column; size_t column;
/**
* The index of the token, as passed to the constructor.
*/
size_t index; size_t index;
/**
* The type of the token.
*/
IDType type; IDType type;
mixin (extraFields); mixin (extraFields);
} }
@ -223,21 +295,21 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
static string escape(string input) static string escape(string input)
{ {
string rVal; string retVal;
foreach (ubyte c; cast(ubyte[]) input) foreach (ubyte c; cast(ubyte[]) input)
{ {
switch (c) switch (c)
{ {
case '\\': rVal ~= `\\`; break; case '\\': retVal ~= `\\`; break;
case '"': rVal ~= `\"`; break; case '"': retVal ~= `\"`; break;
case '\'': rVal ~= `\'`; break; case '\'': retVal ~= `\'`; break;
case '\t': rVal ~= `\t`; break; case '\t': retVal ~= `\t`; break;
case '\n': rVal ~= `\n`; break; case '\n': retVal ~= `\n`; break;
case '\r': rVal ~= `\r`; break; case '\r': retVal ~= `\r`; break;
default: rVal ~= c; break; default: retVal ~= c; break;
} }
} }
return rVal; return retVal;
} }
Token advance() pure Token advance() pure
@ -262,10 +334,10 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
*/ */
static T[] stupidToArray(R, T = ElementType!R)(R range) static T[] stupidToArray(R, T = ElementType!R)(R range)
{ {
T[] rVal; T[] retVal;
foreach (v; range) foreach (v; range)
rVal ~= v; retVal ~= v;
return rVal; return retVal;
} }
LexerRange!(typeof(buffer(R.init))) range; LexerRange!(typeof(buffer(R.init))) range;
@ -302,20 +374,56 @@ struct LexerRange(BufferType) if (isBuffer!BufferType)
size_t line; size_t line;
} }
/**
* The string cache should be used within lexer implementations for several
* reasons:
* $(UL
* $(LI Reducing memory consumption.)
* $(LI Increasing performance in token comparisons)
* $(LI Correctly creating immutable token text if the lexing source is not
* immutable)
* )
*/
struct StringCache struct StringCache
{ {
public: public:
/**
* Equivalent to calling cache() and get().
* ---
* StringCache cache;
* ubyte[] str = ['a', 'b', 'c'];
* string s = cache.get(cache.cache(str));
* assert(s == "abc");
* ---
*/
string cacheGet(const(ubyte[]) bytes) pure nothrow @safe string cacheGet(const(ubyte[]) bytes) pure nothrow @safe
{ {
return get(cache(bytes)); return get(cache(bytes));
} }
/**
* Caches a string.
* Params: bytes = the string to cache
* Returns: A key that can be used to retrieve the cached string
* Examples:
* ---
* StringCache cache;
* ubyte[] bytes = ['a', 'b', 'c'];
* size_t first = cache.cache(bytes);
* size_t second = cache.cache(bytes);
* assert (first == second);
* ---
*/
size_t cache(const(ubyte)[] bytes) pure nothrow @safe size_t cache(const(ubyte)[] bytes) pure nothrow @safe
in in
{ {
assert (bytes.length > 0); assert (bytes.length > 0);
} }
out (retVal)
{
assert (retVal < items.length);
}
body body
{ {
immutable uint hash = hashBytes(bytes); immutable uint hash = hashBytes(bytes);
@ -325,12 +433,21 @@ public:
return found.index; return found.index;
} }
/**
* Gets a cached string based on its key.
* Params: index = the key
* Returns: the cached string
*/
string get(size_t index) const pure nothrow @safe string get(size_t index) const pure nothrow @safe
in in
{ {
assert (items.length > index); assert (items.length > index);
assert (items[index] !is null); assert (items[index] !is null);
} }
out (retVal)
{
assert (retVal !is null);
}
body body
{ {
return items[index].str; return items[index].str;
@ -345,7 +462,7 @@ private:
item.str = allocate(bytes); item.str = allocate(bytes);
item.index = items.length; item.index = items.length;
items ~= item; items ~= item;
buckets[hash % bucketCount] ~= item; buckets[hash % buckets.length] ~= item;
return item.index; return item.index;
} }
@ -361,9 +478,9 @@ private:
} }
string allocate(const(ubyte)[] bytes) pure nothrow @trusted string allocate(const(ubyte)[] bytes) pure nothrow @trusted
out (rVal) out (retVal)
{ {
assert (rVal == bytes); assert (retVal == bytes);
} }
body body
{ {
@ -391,23 +508,6 @@ private:
return cast(string) blocks[$ - 1].bytes[0 .. bytes.length]; return cast(string) blocks[$ - 1].bytes[0 .. bytes.length];
} }
Item*[] items;
Item*[][bucketCount] buckets;
Block[] blocks;
struct Item
{
size_t index;
string str;
uint hash;
}
struct Block
{
ubyte[] bytes;
size_t used;
}
static uint hashBytes(const(ubyte)[] data) pure nothrow @safe static uint hashBytes(const(ubyte)[] data) pure nothrow @safe
{ {
uint hash = 0; uint hash = 0;
@ -419,8 +519,21 @@ private:
return hash; return hash;
} }
enum pageSize = 4096 * 1024; static struct Item
enum bucketCount = 2048; {
size_t index;
string str;
uint hash;
}
static struct Block
{
ubyte[] bytes;
size_t used;
}
static enum pageSize = 4096 * 1024;
static enum bucketCount = 2048;
static enum uint[] sbox = [ static enum uint[] sbox = [
0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53, 0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
@ -488,6 +601,8 @@ private:
0xA0B38F96, 0x51D39199, 0x37A6AD75, 0xDF84EE41, 0xA0B38F96, 0x51D39199, 0x37A6AD75, 0xDF84EE41,
0x3C034CBA, 0xACDA62FC, 0x11923B8B, 0x45EF170A, 0x3C034CBA, 0xACDA62FC, 0x11923B8B, 0x45EF170A,
]; ];
Item*[] items;
Item*[][bucketCount] buckets;
Block[] blocks;
} }