/++ Future public interface to the Uri struct and encode/decode component functions. History: Added May 26, 2025 +/ module arsd.uri; import arsd.core; import arsd.conv; import arsd.string; alias encodeUriComponent = arsd.core.encodeUriComponent; alias decodeUriComponent = arsd.core.decodeUriComponent; // phobos compatibility names alias encodeComponent = encodeUriComponent; alias decodeComponent = decodeUriComponent; // FIXME: merge and pull Uri struct from http2 and cgi. maybe via core. // might also put base64 in here.... /++ Represents a URI. It offers named access to the components and relative uri resolution, though as a user of the library, you'd mostly just construct it like `Uri("http://example.com/index.html")`. History: Moved from duplication in [arsd.cgi] and [arsd.http2] to arsd.uri on November 2, 2025. +/ struct Uri { UriString toUriString() { return UriString(toString()); } alias toUriString this; // blargh idk a url really is a string, but should it be implicit? // scheme://userinfo@host:port/path?query#fragment string scheme; /// e.g. "http" in "http://example.com/" string userinfo; /// the username (and possibly a password) in the uri string host; /// the domain name. note it may be an ip address or have percent encoding too. int port; /// port number, if given. Will be zero if a port was not explicitly given string path; /// e.g. "/folder/file.html" in "http://example.com/folder/file.html" string query; /// the stuff after the ? in a uri string fragment; /// the stuff after the # in a uri. // cgi.d specific....... // idk if i want to keep these, since the functions they wrap are used many, many, many times in existing code, so this is either an unnecessary alias or a gratuitous break of compatibility // the decode ones need to keep different names anyway because we can't overload on return values... static string encode(string s) { return encodeUriComponent(s); } static string encode(string[string] s) { return encodeVariables(s); } static string encode(string[][string] s) { return encodeVariables(s); } /++ Parses an existing uri string (which should be pre-validated) into this further detailed structure. History: Added November 2, 2025. +/ this(UriString uriString) { this(uriString.toString()); } /++ Transforms an interpolated expression sequence into a uri, encoding as appropriate as it reads. History: Added November 2, 2025. +/ this(Args...)(InterpolationHeader header, Args args, InterpolationFooter footer) { // will need to use iraw here for some cases. paths may partially encoded but still allow slashes, prolly needs a type. // so like $(path(x)) or $(queryString(x)) or maybe isemi or something. or make user split it into a string[] then recombine here.... string thing; foreach(arg; args) { static if(is(typeof(arg) == InterpolationHeader)) {} else static if(is(typeof(arg) == InterpolationFooter)) {} else static if(is(typeof(arg) == InterpolatedLiteral!part, string part)) thing ~= part; else static if(is(typeof(arg) == InterpolatedExpression!code, string code)) {} else static if(is(typeof(arg) == iraw)) thing ~= iraw.s; else thing ~= encodeUriComponent(to!string(arg)); } this(thing); } unittest { string bar = "12/"; string baz = "&omg"; auto uri = Uri(i"http://example.com/foo/$bar?thing=$baz"); assert(uri.toString() == "http://example.com/foo/12%2F?thing=%26omg"); } /// Breaks down a uri string to its components this(string uri) { size_t lastGoodIndex; foreach(char ch; uri) { if(ch > 127) { break; } lastGoodIndex++; } string replacement = uri[0 .. lastGoodIndex]; foreach(char ch; uri[lastGoodIndex .. $]) { if(ch > 127) { // need to percent-encode any non-ascii in it char[3] buffer; buffer[0] = '%'; auto first = ch / 16; auto second = ch % 16; first += (first >= 10) ? ('A'-10) : '0'; second += (second >= 10) ? ('A'-10) : '0'; buffer[1] = cast(char) first; buffer[2] = cast(char) second; replacement ~= buffer[]; } else { replacement ~= ch; } } reparse(replacement); } /// Returns `port` if set, otherwise if scheme is https 443, otherwise always 80 int effectivePort() const @property nothrow pure @safe @nogc { return port != 0 ? port : scheme == "https" ? 443 : 80; } package string unixSocketPath = null; /// Indicates it should be accessed through a unix socket instead of regular tcp. Returns new version without modifying this object. Uri viaUnixSocket(string path) const { Uri copy = this; copy.unixSocketPath = path; return copy; } /// Goes through a unix socket in the abstract namespace (linux only). Returns new version without modifying this object. version(linux) Uri viaAbstractSocket(string path) const { Uri copy = this; copy.unixSocketPath = "\0" ~ path; return copy; } // these are like javascript's location.search and location.hash string search() const { return query.length ? ("?" ~ query) : ""; } string hash() const { return fragment.length ? ("#" ~ fragment) : ""; } private void reparse(string uri) { // from RFC 3986 // the ctRegex triples the compile time and makes ugly errors for no real benefit // it was a nice experiment but just not worth it. // enum ctr = ctRegex!r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?"; /* Captures: 0 = whole url 1 = scheme, with : 2 = scheme, no : 3 = authority, with // 4 = authority, no // 5 = path 6 = query string, with ? 7 = query string, no ? 8 = anchor, with # 9 = anchor, no # */ // Yikes, even regular, non-CT regex is also unacceptably slow to compile. 1.9s on my computer! // instead, I will DIY and cut that down to 0.6s on the same computer. /* Note that authority is user:password@domain:port where the user:password@ part is optional, and the :port is optional. Regex translation: Scheme cannot have :, /, ?, or # in it, and must have one or more chars and end in a :. It is optional, but must be first. Authority must start with //, but cannot have any other /, ?, or # in it. It is optional. Path cannot have any ? or # in it. It is optional. Query must start with ? and must not have # in it. It is optional. Anchor must start with # and can have anything else in it to end of string. It is optional. */ this = Uri.init; // reset all state // empty uri = nothing special if(uri.length == 0) { return; } size_t idx; scheme_loop: foreach(char c; uri[idx .. $]) { switch(c) { case ':': case '/': case '?': case '#': break scheme_loop; default: } idx++; } if(idx == 0 && uri[idx] == ':') { // this is actually a path! we skip way ahead goto path_loop; } if(idx == uri.length) { // the whole thing is a path, apparently path = uri; return; } if(idx > 0 && uri[idx] == ':') { scheme = uri[0 .. idx]; idx++; } else { // we need to rewind; it found a / but no :, so the whole thing is prolly a path... idx = 0; } if(idx + 2 < uri.length && uri[idx .. idx + 2] == "//") { // we have an authority.... idx += 2; auto authority_start = idx; authority_loop: foreach(char c; uri[idx .. $]) { switch(c) { case '/': case '?': case '#': break authority_loop; default: } idx++; } auto authority = uri[authority_start .. idx]; auto idx2 = authority.indexOf("@"); if(idx2 != -1) { userinfo = authority[0 .. idx2]; authority = authority[idx2 + 1 .. $]; } if(authority.length && authority[0] == '[') { // ipv6 address special casing idx2 = authority.indexOf("]"); if(idx2 != -1) { auto end = authority[idx2 + 1 .. $]; if(end.length && end[0] == ':') idx2 = idx2 + 1; else idx2 = -1; } } else { idx2 = authority.indexOf(":"); } if(idx2 == -1) { port = 0; // 0 means not specified; we should use the default for the scheme host = authority; } else { host = authority[0 .. idx2]; if(idx2 + 1 < authority.length) port = to!int(authority[idx2 + 1 .. $]); else port = 0; } } path_loop: auto path_start = idx; foreach(char c; uri[idx .. $]) { if(c == '?' || c == '#') break; idx++; } path = uri[path_start .. idx]; if(idx == uri.length) return; // nothing more to examine... if(uri[idx] == '?') { idx++; auto query_start = idx; foreach(char c; uri[idx .. $]) { if(c == '#') break; idx++; } query = uri[query_start .. idx]; } if(idx < uri.length && uri[idx] == '#') { idx++; fragment = uri[idx .. $]; } // uriInvalidated = false; } private string rebuildUri() const { string ret; if(scheme.length) ret ~= scheme ~ ":"; if(userinfo.length || host.length) ret ~= "//"; if(userinfo.length) ret ~= userinfo ~ "@"; if(host.length) ret ~= host; if(port) ret ~= ":" ~ to!string(port); ret ~= path; if(query.length) ret ~= "?" ~ query; if(fragment.length) ret ~= "#" ~ fragment; // uri = ret; // uriInvalidated = false; return ret; } /// Converts the broken down parts back into a complete string string toString() const { // if(uriInvalidated) return rebuildUri(); } /// Returns a new absolute Uri given a base. It treats this one as /// relative where possible, but absolute if not. (If protocol, domain, or /// other info is not set, the new one inherits it from the base.) /// /// Browsers use a function like this to figure out links in html. Uri basedOn(in Uri baseUrl) const { Uri n = this; // copies if(n.scheme == "data") return n; // n.uriInvalidated = true; // make sure we regenerate... // userinfo is not inherited... is this wrong? // if anything is given in the existing url, we don't use the base anymore. if(n.scheme.length == 0) { n.scheme = baseUrl.scheme; if(n.host.length == 0) { n.host = baseUrl.host; if(n.port == 0) { n.port = baseUrl.port; if(n.path.length > 0 && n.path[0] != '/') { auto b = baseUrl.path[0 .. baseUrl.path.lastIndexOf("/") + 1]; if(b.length == 0) b = "/"; n.path = b ~ n.path; } else if(n.path.length == 0) { n.path = baseUrl.path; } } } } n.removeDots(); // if still basically talking to the same thing, we should inherit the unix path // too since basically the unix path is saying for this service, always use this override. if(n.host == baseUrl.host && n.scheme == baseUrl.scheme && n.port == baseUrl.port) n.unixSocketPath = baseUrl.unixSocketPath; return n; } /++ Resolves ../ and ./ parts of the path. Used in the implementation of [basedOn] and you could also use it to normalize things. +/ void removeDots() { auto parts = this.path.split("/"); string[] toKeep; foreach(part; parts) { if(part == ".") { continue; } else if(part == "..") { //if(toKeep.length > 1) toKeep = toKeep[0 .. $-1]; //else //toKeep = [""]; continue; } else { //if(toKeep.length && toKeep[$-1].length == 0 && part.length == 0) //continue; // skip a `//` situation toKeep ~= part; } } auto path = toKeep.join("/"); if(path.length && path[0] != '/') path = "/" ~ path; this.path = path; } unittest { auto uri = Uri("test.html"); assert(uri.path == "test.html"); uri = Uri("path/1/lol"); assert(uri.path == "path/1/lol"); uri = Uri("http://me@example.com"); assert(uri.scheme == "http"); assert(uri.userinfo == "me"); assert(uri.host == "example.com"); uri = Uri("http://example.com/#a"); assert(uri.scheme == "http"); assert(uri.host == "example.com"); assert(uri.fragment == "a"); uri = Uri("#foo"); assert(uri.fragment == "foo"); uri = Uri("?lol"); assert(uri.query == "lol"); uri = Uri("#foo?lol"); assert(uri.fragment == "foo?lol"); uri = Uri("?lol#foo"); assert(uri.fragment == "foo"); assert(uri.query == "lol"); uri = Uri("http://127.0.0.1/"); assert(uri.host == "127.0.0.1"); assert(uri.port == 0); uri = Uri("http://127.0.0.1:123/"); assert(uri.host == "127.0.0.1"); assert(uri.port == 123); uri = Uri("http://[ff:ff::0]/"); assert(uri.host == "[ff:ff::0]"); uri = Uri("http://[ff:ff::0]:123/"); assert(uri.host == "[ff:ff::0]"); assert(uri.port == 123); } // This can sometimes be a big pain in the butt for me, so lots of copy/paste here to cover // the possibilities. unittest { auto url = Uri("cool.html"); // checking relative links assert(url.basedOn(Uri("http://test.com/what/test.html")) == "http://test.com/what/cool.html"); assert(url.basedOn(Uri("https://test.com/what/test.html")) == "https://test.com/what/cool.html"); assert(url.basedOn(Uri("http://test.com/what/")) == "http://test.com/what/cool.html"); assert(url.basedOn(Uri("http://test.com/")) == "http://test.com/cool.html"); assert(url.basedOn(Uri("http://test.com")) == "http://test.com/cool.html"); assert(url.basedOn(Uri("http://test.com/what/test.html?a=b")) == "http://test.com/what/cool.html"); assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d")) == "http://test.com/what/cool.html"); assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d#what")) == "http://test.com/what/cool.html"); assert(url.basedOn(Uri("http://test.com")) == "http://test.com/cool.html"); url = Uri("/something/cool.html"); // same server, different path assert(url.basedOn(Uri("http://test.com/what/test.html")) == "http://test.com/something/cool.html"); assert(url.basedOn(Uri("https://test.com/what/test.html")) == "https://test.com/something/cool.html"); assert(url.basedOn(Uri("http://test.com/what/")) == "http://test.com/something/cool.html"); assert(url.basedOn(Uri("http://test.com/")) == "http://test.com/something/cool.html"); assert(url.basedOn(Uri("http://test.com")) == "http://test.com/something/cool.html"); assert(url.basedOn(Uri("http://test.com/what/test.html?a=b")) == "http://test.com/something/cool.html"); assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d")) == "http://test.com/something/cool.html"); assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d#what")) == "http://test.com/something/cool.html"); assert(url.basedOn(Uri("http://test.com")) == "http://test.com/something/cool.html"); url = Uri("?query=answer"); // same path. server, protocol, and port, just different query string and fragment assert(url.basedOn(Uri("http://test.com/what/test.html")) == "http://test.com/what/test.html?query=answer"); assert(url.basedOn(Uri("https://test.com/what/test.html")) == "https://test.com/what/test.html?query=answer"); assert(url.basedOn(Uri("http://test.com/what/")) == "http://test.com/what/?query=answer"); assert(url.basedOn(Uri("http://test.com/")) == "http://test.com/?query=answer"); assert(url.basedOn(Uri("http://test.com")) == "http://test.com?query=answer"); assert(url.basedOn(Uri("http://test.com/what/test.html?a=b")) == "http://test.com/what/test.html?query=answer"); assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d")) == "http://test.com/what/test.html?query=answer"); assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d#what")) == "http://test.com/what/test.html?query=answer"); assert(url.basedOn(Uri("http://test.com")) == "http://test.com?query=answer"); url = Uri("/test/bar"); assert(Uri("./").basedOn(url) == "/test/", Uri("./").basedOn(url)); assert(Uri("../").basedOn(url) == "/"); url = Uri("http://example.com/"); assert(Uri("../foo").basedOn(url) == "http://example.com/foo"); //auto uriBefore = url; url = Uri("#anchor"); // everything should remain the same except the anchor //uriBefore.anchor = "anchor"); //assert(url == uriBefore); url = Uri("//example.com"); // same protocol, but different server. the path here should be blank. url = Uri("//example.com/example.html"); // same protocol, but different server and path url = Uri("http://example.com/test.html"); // completely absolute link should never be modified url = Uri("http://example.com"); // completely absolute link should never be modified, even if it has no path // FIXME: add something for port too } } /// Makes a data:// uri that can be used as links in most newer browsers (IE8+). string makeDataUrl()(string mimeType, in void[] data) { import std.base64; // FIXME then i can remove the () template auto data64 = Base64.encode(cast(const(ubyte[])) data); return "data:" ~ mimeType ~ ";base64," ~ cast(string)(data64); } /// breaks down a url encoded string string[][string] decodeVariables(string data, string separator = "&", string[]* namesInOrder = null, string[]* valuesInOrder = null) { auto vars = data.split(separator); string[][string] _get; foreach(var; vars) { auto equal = var.indexOf("="); string name; string value; if(equal == -1) { name = decodeUriComponent(var); value = ""; } else { //_get[decodeUriComponent(var[0..equal])] ~= decodeUriComponent(var[equal + 1 .. $].replace("+", " ")); // stupid + -> space conversion. name = decodeUriComponent(var[0..equal].replace("+", " ")); value = decodeUriComponent(var[equal + 1 .. $].replace("+", " ")); } _get[name] ~= value; if(namesInOrder) (*namesInOrder) ~= name; if(valuesInOrder) (*valuesInOrder) ~= value; } return _get; } /// breaks down a url encoded string, but only returns the last value of any array string[string] decodeVariablesSingle(string data) { string[string] va; auto varArray = decodeVariables(data); foreach(k, v; varArray) va[k] = v[$-1]; return va; } /// url encodes the whole string string encodeVariables(in string[string] data) { string ret; bool outputted = false; foreach(k, v; data) { if(outputted) ret ~= "&"; else outputted = true; ret ~= encodeUriComponent(k) ~ "=" ~ encodeUriComponent(v); } return ret; } /// url encodes a whole string string encodeVariables(in string[][string] data) { string ret; bool outputted = false; foreach(k, arr; data) { foreach(v; arr) { if(outputted) ret ~= "&"; else outputted = true; ret ~= encodeUriComponent(k) ~ "=" ~ encodeUriComponent(v); } } return ret; } /// Encodes all but the explicitly unreserved characters per rfc 3986 /// Alphanumeric and -_.~ are the only ones left unencoded /// name is borrowed from php string rawurlencode(in char[] data) { string ret; ret.reserve(data.length * 2); foreach(char c; data) { if( (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '-' || c == '_' || c == '.' || c == '~') { ret ~= c; } else { ret ~= '%'; // since we iterate on char, this should give us the octets of the full utf8 string ret ~= toHexUpper(c); } } return ret; } char[2] toHexUpper(ubyte num) { char[2] ret = 0; ret[0] = num / 16; ret[1] = num % 16; ret[0] += cast(char)(ret[0] >= 10 ? 'A' : '0'); ret[1] += cast(char)(ret[1] >= 10 ? 'A' : '0'); return ret; }