moved some functions from cgi.d

2025-11-03 19:21:02 -05:00 · 2025-11-03 19:21:02 -05:00 · c94cc54f43
parent aa2e04e6ca
commit c94cc54f43
1 changed files with 628 additions and 0 deletions
--- a/uri.d
+++ b/uri.d
@ -8,6 +8,9 @@ module arsd.uri;
 import arsd.core;
 import arsd.conv;
 import arsd.string;
 alias encodeUriComponent = arsd.core.encodeUriComponent;
 alias decodeUriComponent = arsd.core.decodeUriComponent;
@ -18,3 +21,628 @@ alias decodeComponent = decodeUriComponent;
 // FIXME: merge and pull Uri struct from http2 and cgi. maybe via core.
 // might also put base64 in here....
 /++
 	Represents a URI. It offers named access to the components and relative uri resolution, though as a user of the library, you'd mostly just construct it like `Uri("http://example.com/index.html")`.
 	History:
 		Moved from duplication in [arsd.cgi] and [arsd.http2] to arsd.uri on November 2, 2025.
 +/
 struct Uri {
 	UriString toUriString() {
 		return UriString(toString());
 	}
 	alias toUriString this; // blargh idk a url really is a string, but should it be implicit?
 	// scheme://userinfo@host:port/path?query#fragment
 	string scheme; /// e.g. "http" in "http://example.com/"
 	string userinfo; /// the username (and possibly a password) in the uri
 	string host; /// the domain name. note it may be an ip address or have percent encoding too.
 	int port; /// port number, if given. Will be zero if a port was not explicitly given
 	string path; /// e.g. "/folder/file.html" in "http://example.com/folder/file.html"
 	string query; /// the stuff after the ? in a uri
 	string fragment; /// the stuff after the # in a uri.
 	// cgi.d specific.......
 	// idk if i want to keep these, since the functions they wrap are used many, many, many times in existing code, so this is either an unnecessary alias or a gratuitous break of compatibility
 	// the decode ones need to keep different names anyway because we can't overload on return values...
 	static string encode(string s) { return encodeUriComponent(s); }
 	static string encode(string[string] s) { return encodeVariables(s); }
 	static string encode(string[][string] s) { return encodeVariables(s); }
 	/++
 		Parses an existing uri string (which should be pre-validated) into this further detailed structure.
 		History:
 			Added November 2, 2025.
 	+/
 	this(UriString uriString) {
 		this(uriString.toString());
 	}
 	/++
 		Transforms an interpolated expression sequence into a uri, encoding as appropriate as it reads.
 		History:
 			Added November 2, 2025.
 	+/
 	this(Args...)(InterpolationHeader header, Args args, InterpolationFooter footer) {
 		// will need to use iraw here for some cases. paths may partially encoded but still allow slashes, prolly needs a type.
 		// so like $(path(x)) or $(queryString(x)) or maybe isemi or something. or make user split it into a string[] then recombine here....
 		string thing;
 		foreach(arg; args) {
 			static if(is(typeof(arg) == InterpolationHeader))
 				{}
 			else
 			static if(is(typeof(arg) == InterpolationFooter))
 				{}
 			else
 			static if(is(typeof(arg) == InterpolatedLiteral!part, string part))
 				thing ~= part;
 			else
 			static if(is(typeof(arg) == InterpolatedExpression!code, string code))
 				{}
 			else
 			static if(is(typeof(arg) == iraw))
 				thing ~= iraw.s;
 			else
 				thing ~= encodeUriComponent(to!string(arg));
 		}
 		this(thing);
 	}
 	unittest {
 		string bar = "12/";
 		string baz = "&omg";
 		auto uri = Uri(i"http://example.com/foo/$bar?thing=$baz");
 		assert(uri.toString() == "http://example.com/foo/12%2F?thing=%26omg");
 	}
 	/// Breaks down a uri string to its components
 	this(string uri) {
 		size_t lastGoodIndex;
 		foreach(char ch; uri) {
 			if(ch > 127) {
 				break;
 			}
 			lastGoodIndex++;
 		}
 		string replacement = uri[0 .. lastGoodIndex];
 		foreach(char ch; uri[lastGoodIndex .. $]) {
 			if(ch > 127) {
 				// need to percent-encode any non-ascii in it
 				char[3] buffer;
 				buffer[0] = '%';
 				auto first = ch / 16;
 				auto second = ch % 16;
 				first += (first >= 10) ? ('A'-10) : '0';
 				second += (second >= 10) ? ('A'-10) : '0';
 				buffer[1] = cast(char) first;
 				buffer[2] = cast(char) second;
 				replacement ~= buffer[];
 			} else {
 				replacement ~= ch;
 			}
 		}
 		reparse(replacement);
 	}
 	/// Returns `port` if set, otherwise if scheme is https 443, otherwise always 80
 	int effectivePort() const @property nothrow pure @safe @nogc {
 		return port != 0 ? port
 			: scheme == "https" ? 443 : 80;
 	}
 	package string unixSocketPath = null;
 	/// Indicates it should be accessed through a unix socket instead of regular tcp. Returns new version without modifying this object.
 	Uri viaUnixSocket(string path) const {
 		Uri copy = this;
 		copy.unixSocketPath = path;
 		return copy;
 	}
 	/// Goes through a unix socket in the abstract namespace (linux only). Returns new version without modifying this object.
 	version(linux)
 	Uri viaAbstractSocket(string path) const {
 		Uri copy = this;
 		copy.unixSocketPath = "\0" ~ path;
 		return copy;
 	}
 	// these are like javascript's location.search and location.hash
 	string search() const {
 		return query.length ? ("?" ~ query) : "";
 	}
 	string hash() const {
 		return fragment.length ? ("#" ~ fragment) : "";
 	}
 	private void reparse(string uri) {
 		// from RFC 3986
 		// the ctRegex triples the compile time and makes ugly errors for no real benefit
 		// it was a nice experiment but just not worth it.
 		// enum ctr = ctRegex!r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?";
 		/*
 			Captures:
 				0 = whole url
 				1 = scheme, with :
 				2 = scheme, no :
 				3 = authority, with //
 				4 = authority, no //
 				5 = path
 				6 = query string, with ?
 				7 = query string, no ?
 				8 = anchor, with #
 				9 = anchor, no #
 		*/
 		// Yikes, even regular, non-CT regex is also unacceptably slow to compile. 1.9s on my computer!
 		// instead, I will DIY and cut that down to 0.6s on the same computer.
 		/*
 				Note that authority is
 					user:password@domain:port
 				where the user:password@ part is optional, and the :port is optional.
 				Regex translation:
 				Scheme cannot have :, /, ?, or # in it, and must have one or more chars and end in a :. It is optional, but must be first.
 				Authority must start with //, but cannot have any other /, ?, or # in it. It is optional.
 				Path cannot have any ? or # in it. It is optional.
 				Query must start with ? and must not have # in it. It is optional.
 				Anchor must start with # and can have anything else in it to end of string. It is optional.
 		*/
 		this = Uri.init; // reset all state
 		// empty uri = nothing special
 		if(uri.length == 0) {
 			return;
 		}
 		size_t idx;
 		scheme_loop: foreach(char c; uri[idx .. $]) {
 			switch(c) {
 				case ':':
 				case '/':
 				case '?':
 				case '#':
 					break scheme_loop;
 				default:
 			}
 			idx++;
 		}
 		if(idx == 0 && uri[idx] == ':') {
 			// this is actually a path! we skip way ahead
 			goto path_loop;
 		}
 		if(idx == uri.length) {
 			// the whole thing is a path, apparently
 			path = uri;
 			return;
 		}
 		if(idx > 0 && uri[idx] == ':') {
 			scheme = uri[0 .. idx];
 			idx++;
 		} else {
 			// we need to rewind; it found a / but no :, so the whole thing is prolly a path...
 			idx = 0;
 		}
 		if(idx + 2 < uri.length && uri[idx .. idx + 2] == "//") {
 			// we have an authority....
 			idx += 2;
 			auto authority_start = idx;
 			authority_loop: foreach(char c; uri[idx .. $]) {
 				switch(c) {
 					case '/':
 					case '?':
 					case '#':
 						break authority_loop;
 					default:
 				}
 				idx++;
 			}
 			auto authority = uri[authority_start .. idx];
 			auto idx2 = authority.indexOf("@");
 			if(idx2 != -1) {
 				userinfo = authority[0 .. idx2];
 				authority = authority[idx2 + 1 .. $];
 			}
 			if(authority.length && authority[0] == '[') {
 				// ipv6 address special casing
 				idx2 = authority.indexOf("]");
 				if(idx2 != -1) {
 					auto end = authority[idx2 + 1 .. $];
 					if(end.length && end[0] == ':')
 						idx2 = idx2 + 1;
 					else
 						idx2 = -1;
 				}
 			} else {
 				idx2 = authority.indexOf(":");
 			}
 			if(idx2 == -1) {
 				port = 0; // 0 means not specified; we should use the default for the scheme
 				host = authority;
 			} else {
 				host = authority[0 .. idx2];
 				if(idx2 + 1 < authority.length)
 					port = to!int(authority[idx2 + 1 .. $]);
 				else
 					port = 0;
 			}
 		}
 		path_loop:
 		auto path_start = idx;
 		foreach(char c; uri[idx .. $]) {
 			if(c == '?' || c == '#')
 				break;
 			idx++;
 		}
 		path = uri[path_start .. idx];
 		if(idx == uri.length)
 			return; // nothing more to examine...
 		if(uri[idx] == '?') {
 			idx++;
 			auto query_start = idx;
 			foreach(char c; uri[idx .. $]) {
 				if(c == '#')
 					break;
 				idx++;
 			}
 			query = uri[query_start .. idx];
 		}
 		if(idx < uri.length && uri[idx] == '#') {
 			idx++;
 			fragment = uri[idx .. $];
 		}
 		// uriInvalidated = false;
 	}
 	private string rebuildUri() const {
 		string ret;
 		if(scheme.length)
 			ret ~= scheme ~ ":";
 		if(userinfo.length || host.length)
 			ret ~= "//";
 		if(userinfo.length)
 			ret ~= userinfo ~ "@";
 		if(host.length)
 			ret ~= host;
 		if(port)
 			ret ~= ":" ~ to!string(port);
 		ret ~= path;
 		if(query.length)
 			ret ~= "?" ~ query;
 		if(fragment.length)
 			ret ~= "#" ~ fragment;
 		// uri = ret;
 		// uriInvalidated = false;
 		return ret;
 	}
 	/// Converts the broken down parts back into a complete string
 	string toString() const {
 		// if(uriInvalidated)
 			return rebuildUri();
 	}
 	/// Returns a new absolute Uri given a base. It treats this one as
 	/// relative where possible, but absolute if not. (If protocol, domain, or
 	/// other info is not set, the new one inherits it from the base.)
 	///
 	/// Browsers use a function like this to figure out links in html.
 	Uri basedOn(in Uri baseUrl) const {
 		Uri n = this; // copies
 		if(n.scheme == "data")
 			return n;
 		// n.uriInvalidated = true; // make sure we regenerate...
 		// userinfo is not inherited... is this wrong?
 		// if anything is given in the existing url, we don't use the base anymore.
 		if(n.scheme.length == 0) {
 			n.scheme = baseUrl.scheme;
 			if(n.host.length == 0) {
 				n.host = baseUrl.host;
 				if(n.port == 0) {
 					n.port = baseUrl.port;
 					if(n.path.length > 0 && n.path[0] != '/') {
 						auto b = baseUrl.path[0 .. baseUrl.path.lastIndexOf("/") + 1];
 						if(b.length == 0)
 							b = "/";
 						n.path = b ~ n.path;
 					} else if(n.path.length == 0) {
 						n.path = baseUrl.path;
 					}
 				}
 			}
 		}
 		n.removeDots();
 		// if still basically talking to the same thing, we should inherit the unix path
 		// too since basically the unix path is saying for this service, always use this override.
 		if(n.host == baseUrl.host && n.scheme == baseUrl.scheme && n.port == baseUrl.port)
 			n.unixSocketPath = baseUrl.unixSocketPath;
 		return n;
 	}
 	/++
 		Resolves ../ and ./ parts of the path. Used in the implementation of [basedOn] and you could also use it to normalize things.
 	+/
 	void removeDots() {
 		auto parts = this.path.split("/");
 		string[] toKeep;
 		foreach(part; parts) {
 			if(part == ".") {
 				continue;
 			} else if(part == "..") {
 				//if(toKeep.length > 1)
 					toKeep = toKeep[0 .. $-1];
 				//else
 					//toKeep = [""];
 				continue;
 			} else {
 				//if(toKeep.length && toKeep[$-1].length == 0 && part.length == 0)
 					//continue; // skip a `//` situation
 				toKeep ~= part;
 			}
 		}
 		auto path = toKeep.join("/");
 		if(path.length && path[0] != '/')
 			path = "/" ~ path;
 		this.path = path;
 	}
 	unittest {
 		auto uri = Uri("test.html");
 		assert(uri.path == "test.html");
 		uri = Uri("path/1/lol");
 		assert(uri.path == "path/1/lol");
 		uri = Uri("http://me@example.com");
 		assert(uri.scheme == "http");
 		assert(uri.userinfo == "me");
 		assert(uri.host == "example.com");
 		uri = Uri("http://example.com/#a");
 		assert(uri.scheme == "http");
 		assert(uri.host == "example.com");
 		assert(uri.fragment == "a");
 		uri = Uri("#foo");
 		assert(uri.fragment == "foo");
 		uri = Uri("?lol");
 		assert(uri.query == "lol");
 		uri = Uri("#foo?lol");
 		assert(uri.fragment == "foo?lol");
 		uri = Uri("?lol#foo");
 		assert(uri.fragment == "foo");
 		assert(uri.query == "lol");
 		uri = Uri("http://127.0.0.1/");
 		assert(uri.host == "127.0.0.1");
 		assert(uri.port == 0);
 		uri = Uri("http://127.0.0.1:123/");
 		assert(uri.host == "127.0.0.1");
 		assert(uri.port == 123);
 		uri = Uri("http://[ff:ff::0]/");
 		assert(uri.host == "[ff:ff::0]");
 		uri = Uri("http://[ff:ff::0]:123/");
 		assert(uri.host == "[ff:ff::0]");
 		assert(uri.port == 123);
 	}
 	// This can sometimes be a big pain in the butt for me, so lots of copy/paste here to cover
 	// the possibilities.
 	unittest {
 		auto url = Uri("cool.html"); // checking relative links
 		assert(url.basedOn(Uri("http://test.com/what/test.html")) == "http://test.com/what/cool.html");
 		assert(url.basedOn(Uri("https://test.com/what/test.html")) == "https://test.com/what/cool.html");
 		assert(url.basedOn(Uri("http://test.com/what/")) == "http://test.com/what/cool.html");
 		assert(url.basedOn(Uri("http://test.com/")) == "http://test.com/cool.html");
 		assert(url.basedOn(Uri("http://test.com")) == "http://test.com/cool.html");
 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b")) == "http://test.com/what/cool.html");
 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d")) == "http://test.com/what/cool.html");
 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d#what")) == "http://test.com/what/cool.html");
 		assert(url.basedOn(Uri("http://test.com")) == "http://test.com/cool.html");
 		url = Uri("/something/cool.html"); // same server, different path
 		assert(url.basedOn(Uri("http://test.com/what/test.html")) == "http://test.com/something/cool.html");
 		assert(url.basedOn(Uri("https://test.com/what/test.html")) == "https://test.com/something/cool.html");
 		assert(url.basedOn(Uri("http://test.com/what/")) == "http://test.com/something/cool.html");
 		assert(url.basedOn(Uri("http://test.com/")) == "http://test.com/something/cool.html");
 		assert(url.basedOn(Uri("http://test.com")) == "http://test.com/something/cool.html");
 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b")) == "http://test.com/something/cool.html");
 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d")) == "http://test.com/something/cool.html");
 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d#what")) == "http://test.com/something/cool.html");
 		assert(url.basedOn(Uri("http://test.com")) == "http://test.com/something/cool.html");
 		url = Uri("?query=answer"); // same path. server, protocol, and port, just different query string and fragment
 		assert(url.basedOn(Uri("http://test.com/what/test.html")) == "http://test.com/what/test.html?query=answer");
 		assert(url.basedOn(Uri("https://test.com/what/test.html")) == "https://test.com/what/test.html?query=answer");
 		assert(url.basedOn(Uri("http://test.com/what/")) == "http://test.com/what/?query=answer");
 		assert(url.basedOn(Uri("http://test.com/")) == "http://test.com/?query=answer");
 		assert(url.basedOn(Uri("http://test.com")) == "http://test.com?query=answer");
 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b")) == "http://test.com/what/test.html?query=answer");
 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d")) == "http://test.com/what/test.html?query=answer");
 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d#what")) == "http://test.com/what/test.html?query=answer");
 		assert(url.basedOn(Uri("http://test.com")) == "http://test.com?query=answer");
 		url = Uri("/test/bar");
 		assert(Uri("./").basedOn(url) == "/test/", Uri("./").basedOn(url));
 		assert(Uri("../").basedOn(url) == "/");
 		url = Uri("http://example.com/");
 		assert(Uri("../foo").basedOn(url) == "http://example.com/foo");
 		//auto uriBefore = url;
 		url = Uri("#anchor"); // everything should remain the same except the anchor
 		//uriBefore.anchor = "anchor");
 		//assert(url == uriBefore);
 		url = Uri("//example.com"); // same protocol, but different server. the path here should be blank.
 		url = Uri("//example.com/example.html"); // same protocol, but different server and path
 		url = Uri("http://example.com/test.html"); // completely absolute link should never be modified
 		url = Uri("http://example.com"); // completely absolute link should never be modified, even if it has no path
 		// FIXME: add something for port too
 	}
 }
 /// Makes a data:// uri that can be used as links in most newer browsers (IE8+).
 string makeDataUrl()(string mimeType, in void[] data) {
 	import std.base64; // FIXME then i can remove the () template
 	auto data64 = Base64.encode(cast(const(ubyte[])) data);
 	return "data:" ~ mimeType ~ ";base64," ~ cast(string)(data64);
 }
 /// breaks down a url encoded string
 string[][string] decodeVariables(string data, string separator = "&", string[]* namesInOrder = null, string[]* valuesInOrder = null) {
 	auto vars = data.split(separator);
 	string[][string] _get;
 	foreach(var; vars) {
 		auto equal = var.indexOf("=");
 		string name;
 		string value;
 		if(equal == -1) {
 			name = decodeUriComponent(var);
 			value = "";
 		} else {
 			//_get[decodeUriComponent(var[0..equal])] ~= decodeUriComponent(var[equal + 1 .. $].replace("+", " "));
 			// stupid + -> space conversion.
 			name = decodeUriComponent(var[0..equal].replace("+", " "));
 			value = decodeUriComponent(var[equal + 1 .. $].replace("+", " "));
 		}
 		_get[name] ~= value;
 		if(namesInOrder)
 			(*namesInOrder) ~= name;
 		if(valuesInOrder)
 			(*valuesInOrder) ~= value;
 	}
 	return _get;
 }
 /// breaks down a url encoded string, but only returns the last value of any array
 string[string] decodeVariablesSingle(string data) {
 	string[string] va;
 	auto varArray = decodeVariables(data);
 	foreach(k, v; varArray)
 		va[k] = v[$-1];
 	return va;
 }
 /// url encodes the whole string
 string encodeVariables(in string[string] data) {
 	string ret;
 	bool outputted = false;
 	foreach(k, v; data) {
 		if(outputted)
 			ret ~= "&";
 		else
 			outputted = true;
 		ret ~= encodeUriComponent(k) ~ "=" ~ encodeUriComponent(v);
 	}
 	return ret;
 }
 /// url encodes a whole string
 string encodeVariables(in string[][string] data) {
 	string ret;
 	bool outputted = false;
 	foreach(k, arr; data) {
 		foreach(v; arr) {
 			if(outputted)
 				ret ~= "&";
 			else
 				outputted = true;
 			ret ~= encodeUriComponent(k) ~ "=" ~ encodeUriComponent(v);
 		}
 	}
 	return ret;
 }
 /// Encodes all but the explicitly unreserved characters per rfc 3986
 /// Alphanumeric and -_.~ are the only ones left unencoded
 /// name is borrowed from php
 string rawurlencode(in char[] data) {
 	string ret;
 	ret.reserve(data.length * 2);
 	foreach(char c; data) {
 		if(
 			(c >= 'a' && c <= 'z') ||
 			(c >= 'A' && c <= 'Z') ||
 			(c >= '0' && c <= '9') ||
 			c == '-' || c == '_' || c == '.' || c == '~')
 		{
 			ret ~= c;
 		} else {
 			ret ~= '%';
 			// since we iterate on char, this should give us the octets of the full utf8 string
 			ret ~= toHexUpper(c);
 		}
 	}
 	return ret;
 }
 char[2] toHexUpper(ubyte num) {
 	char[2] ret = 0;
 	ret[0] = num / 16;
 	ret[1] = num % 16;
 	ret[0] += cast(char)(ret[0] >= 10 ? 'A' : '0');
 	ret[1] += cast(char)(ret[1] >= 10 ? 'A' : '0');
 	return ret;
 }