Created
April 15, 2022 15:43
-
-
Save racerxdl/dd17586d6b15681f41027d5b06c4edcd to your computer and use it in GitHub Desktop.
C++ Implementation of golang URL package - I did that for a project, and I decided to save it here.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "common/url.h" | |
#include <fmt/format.h> | |
#include <cctype> | |
using namespace ProtoRock::Http; | |
enum EncodingMode { | |
encodePath = 1, | |
encodePathSegment, | |
encodeHost, | |
encodeZone, | |
encodeUserPassword, | |
encodeQueryComponent, | |
encodeFragment, | |
}; | |
char unhex(char c) { | |
if ('0' <= c && c <= '9') { | |
return c - '0'; | |
} | |
if ('a' <= c && c <= 'f') { | |
return c - 'a' + 10; | |
} | |
if ('A' <= c && c <= 'F') { | |
return c - 'A' + 10; | |
} | |
return 0; | |
} | |
const char *upperhex = "0123456789ABCDEF"; | |
// Return true if the specified character should be escaped when | |
// appearing in a URL string, according to RFC 3986. | |
// | |
// Please be informed that for now shouldEscape does not check all | |
// reserved characters correctly. See golang.org/issue/5684. | |
bool shouldEscape(char c, EncodingMode mode) { | |
// §2.3 Unreserved characters (alphanum) | |
if ('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') { | |
return false; | |
} | |
if (mode == encodeHost || mode == encodeZone) { | |
// §3.2.2 Host allows | |
// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" | |
// as part of reg-name. | |
// We add : because we include :port as part of host. | |
// We add [ ] because we include [ipv6]:port as part of host. | |
// We add < > because they're the only characters left that | |
// we could possibly allow, and Parse will reject them if we | |
// escape them (because hosts can't use %-encoding for | |
// ASCII bytes). | |
switch (c) { | |
case '!': | |
case '$': | |
case '&': | |
case '\'': | |
case '(': | |
case ')': | |
case '*': | |
case '+': | |
case ',': | |
case ';': | |
case '=': | |
case ':': | |
case '[': | |
case ']': | |
case '<': | |
case '>': | |
case '"': | |
return false; | |
} | |
} | |
switch (c) { | |
// §2.3 Unreserved characters (mark) | |
case '-': | |
case '_': | |
case '.': | |
case '~': | |
return false; | |
// §2.2 Reserved characters (reserved) | |
case '$': | |
case '&': | |
case '+': | |
case ',': | |
case '/': | |
case ':': | |
case ';': | |
case '=': | |
case '?': | |
case '@': | |
// Different sections of the URL allow a few of | |
// the reserved characters to appear unescaped. | |
switch (mode) { | |
case encodePath: // §3.3 | |
// The RFC allows : @ & = + $ but saves / ; , for assigning | |
// meaning to individual path segments. This package | |
// only manipulates the path as a whole, so we allow those | |
// last three as well. That leaves only ? to escape. | |
return c == '?'; | |
case encodePathSegment: // §3.3 | |
// The RFC allows : @ & = + $ but saves / ; , for assigning | |
// meaning to individual path segments. | |
return c == '/' || c == ';' || c == ',' || c == '?'; | |
case encodeUserPassword: // §3.2.1 | |
// The RFC allows ';', ':', '&', '=', '+', '$', and ',' in | |
// userinfo, so we must escape only '@', '/', and '?'. | |
// The parsing of userinfo treats ':' as special so we must escape | |
// that too. | |
return c == '@' || c == '/' || c == '?' || c == ':'; | |
case encodeQueryComponent: // §3.4 | |
// The RFC reserves (so we must escape) everything. | |
return true; | |
case encodeFragment: // §4.1 | |
// The RFC text is silent but the grammar allows | |
// everything: case so escape nothing. | |
return false; | |
} | |
} | |
if (mode == encodeFragment) { | |
// RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are | |
// included in reserved from RFC 2396 §2.2. The remaining sub-delims do not | |
// need to be escaped. To minimize potential breakage, we apply two restrictions: | |
// (1) we always escape sub-delims outside of the fragment, and (2) we always | |
// escape single quote to avoid breaking callers that had previously assumed that | |
// single quotes would be escaped. See issue #19917. | |
switch (c) { | |
case '!': | |
case '(': | |
case ')': | |
case '*': | |
return false; | |
} | |
} | |
return true; | |
} | |
std::string escape(const std::string &s, EncodingMode mode) { | |
auto spaceCount = 0; | |
auto hexCount = 0; | |
for (auto i = 0; i < s.size(); i++) { | |
auto c = s[i]; | |
if (shouldEscape(c, mode)) { | |
if (c == ' ' && mode == encodeQueryComponent) { | |
spaceCount++; | |
} else { | |
hexCount++; | |
} | |
} | |
} | |
if (spaceCount == 0 && hexCount == 0) { | |
return s; | |
} | |
auto required = s.size() + 2 * hexCount; | |
auto t = std::vector<char>(); | |
t.reserve(required); | |
if (hexCount == 0) { | |
t.insert(t.begin(), s.begin(), s.end()); | |
for (auto i = 0; i < s.size(); i++) { | |
if (s[i] == ' ') { | |
t[i] = '+'; | |
} | |
} | |
return std::string(t.begin(), t.end()); | |
} | |
auto j = 0; | |
auto c = 0; | |
for (auto i = 0; i < s.size(); i++) { | |
auto c = s[i]; | |
if (c == ' ' && mode == encodeQueryComponent) { | |
t[j] = '+'; | |
j++; | |
} else if (shouldEscape(c, mode)) { | |
t[j] = '%'; | |
t[j + 1] = upperhex[c >> 4]; | |
t[j + 2] = upperhex[c & 15]; | |
j += 3; | |
} else { | |
t[j] = s[i]; | |
j++; | |
} | |
} | |
return std::string(t.begin(), t.end()); | |
} | |
std::string unescape(std::string s, EncodingMode mode) { | |
// Count %, check that they're well-formed. | |
auto n = 0; | |
auto hasPlus = false; | |
auto tmp = std::string(); | |
auto v = 0; | |
for (int i = 0; i < s.size();) { | |
switch (s[i]) { | |
case '%': | |
n++; | |
if (i + 2 >= s.size() || !std::isxdigit(s[i + 1]) || !std::isxdigit(s[i + 2])) { | |
s = std::string(s.begin() + 1, s.end()); | |
if (s.size() > 3) { | |
s = std::string(s.begin(), s.begin() + 3); | |
} | |
throw std::invalid_argument("escape error: " + s); | |
} | |
// Per https://tools.ietf.org/html/rfc3986#page-21 | |
// in the host component %-encoding can only be used | |
// for non-ASCII bytes. | |
// But https://tools.ietf.org/html/rfc6874#section-2 | |
// introduces %25 being allowed to escape a percent sign | |
// in IPv6 scoped-address literals. Yay. | |
tmp = std::string(s.begin() + i, s.begin() + i + 3); | |
if (mode == encodeHost && unhex(s[i + 1]) < 8 && tmp != "%25") { | |
throw std::invalid_argument("escape error: " + tmp); | |
} | |
if (mode == encodeZone) { | |
// RFC 6874 says basically "anything goes" for zone identifiers | |
// and that even non-ASCII can be redundantly escaped, | |
// but it seems prudent to restrict %-escaped bytes here to those | |
// that are valid host name bytes in their unescaped form. | |
// That is, you can use escaping in the zone identifier but not | |
// to introduce bytes you couldn't just write directly. | |
// But Windows puts spaces here! Yay. | |
v = unhex(s[i + 1]) << 4 | unhex(s[i + 2]); | |
tmp = std::string(s.begin() + i, s.begin() + i + 3); | |
if (tmp != "%25" && v != ' ' && shouldEscape(v, encodeHost)) { | |
throw std::invalid_argument("escape error: " + tmp); | |
} | |
} | |
i += 3; | |
break; | |
case '+': | |
hasPlus = mode == encodeQueryComponent; | |
i++; | |
break; | |
default: | |
if ((mode == encodeHost || mode == encodeZone) && (uint8_t)s[i] < 0x80 && shouldEscape(s[i], mode)) { | |
tmp = std::string(s.begin() + i, s.begin() + i + 1); | |
throw std::invalid_argument("invalid host: " + tmp); | |
} | |
i++; | |
} | |
} | |
if (n == 0 && !hasPlus) { | |
return s; | |
} | |
auto ss = std::stringstream(); | |
for (int i = 0; i < s.size(); i++) { | |
switch (s[i]) { | |
case '%': | |
ss << (char)(unhex(s[i + 1]) << 4 | unhex(s[i + 2])); | |
i += 2; | |
break; | |
case '+': | |
ss << ((mode == encodeQueryComponent) ? ' ' : '+'); | |
break; | |
default: | |
ss << s[i]; | |
} | |
} | |
return ss.str(); | |
} | |
bool stringContainsCTLByte(const std::string &s) { | |
for (auto c : s) { | |
if (c < ' ' || c == 0x7f) { | |
return true; | |
} | |
} | |
return false; | |
} | |
std::string getScheme(std::string &url) { | |
int i = 0; | |
std::string scheme; | |
for (auto &c : url) { | |
if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) { | |
// Do nothing | |
} else if (('0' <= c && c <= '9') || c == '+' || c == '-' || c == '.') { | |
if (i == 0) { | |
break; | |
} | |
} else if (c == ':') { | |
scheme = std::string(url.begin(), url.begin() + i); | |
url = std::string(url.begin() + i + 1, url.end()); | |
break; | |
} else { | |
// we have encountered an invalid character, | |
// so there is no valid scheme | |
break; | |
} | |
i++; | |
} | |
return scheme; | |
} | |
// validUserinfo reports whether s is a valid userinfo string per RFC 3986 | |
// Section 3.2.1: | |
// userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) | |
// unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" | |
// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" | |
// / "*" / "+" / "," / ";" / "=" | |
// | |
// It doesn't validate pct-encoded. The caller does that via func unescape. | |
bool validUserinfo(const std::string &s) { | |
for (auto r : s) { | |
if ('A' <= r && r <= 'Z') { | |
continue; | |
} | |
if ('a' <= r && r <= 'z') { | |
continue; | |
} | |
if ('0' <= r && r <= '9') { | |
continue; | |
} | |
switch (r) { | |
case '-': | |
case '.': | |
case '_': | |
case ':': | |
case '~': | |
case '!': | |
case '$': | |
case '&': | |
case '\'': | |
case '(': | |
case ')': | |
case '*': | |
case '+': | |
case ',': | |
case ';': | |
case '=': | |
case '%': | |
case '@': | |
continue; | |
default: | |
return false; | |
} | |
} | |
return true; | |
} | |
// validOptionalPort reports whether port is either an empty string | |
// or matches /^:\d*$/ | |
bool validOptionalPort(const std::string &port) { | |
if (port.empty()) { | |
return true; | |
} | |
if (port[0] != ':') { | |
return false; | |
} | |
for (auto b = port.begin() + 1; b < port.end(); b++) { | |
if (*b < '0' || *b > '9') { | |
return false; | |
} | |
} | |
return true; | |
} | |
// parseHost parses host as an authority without user | |
// information. That is, as host[:port]. | |
std::string parseHost(const std::string &host) { | |
int idx; | |
if (!host.empty() && host[0] == '[') { | |
// Parse an IP-Literal in RFC 3986 and RFC 6874. | |
// E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80". | |
idx = host.find_last_of(']'); | |
if (idx >= host.size()) { | |
throw std::invalid_argument("cannot find ']' in host"); | |
} | |
auto colonPort = std::string(host.begin() + idx + 1, host.end()); | |
if (!validOptionalPort(colonPort)) { | |
throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort)); | |
} | |
// RFC 6874 defines that %25 (%-encoded percent) introduces | |
// the zone identifier, and the zone identifier can use basically | |
// any %-encoding it likes. That's different from the host, which | |
// can only %-encode non-ASCII bytes. | |
// We do impose some restrictions on the zone, to avoid stupidity | |
// like newlines. | |
auto zone = host.find("%25"); | |
if (idx != std::string::npos) { | |
auto host1 = unescape(std::string(host.begin(), host.begin() + zone), encodeHost); | |
auto host2 = unescape(std::string(host.begin() + zone, host.begin() + idx), encodeHost); | |
auto host3 = unescape(std::string(host.begin() + idx, host.end()), encodeZone); | |
return host1 + host2 + host3; | |
} | |
} else if ((idx = host.find_last_of(':')) < host.size()) { | |
auto colonPort = std::string(host.begin() + idx, host.end()); | |
if (!validOptionalPort(colonPort)) { | |
throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort)); | |
} | |
} | |
return unescape(host, encodeHost); | |
} | |
void parseAuthority(const std::string &authority, UserInfo &ui, std::string &host) { | |
auto i = authority.find_last_of('@'); | |
if (i > authority.size()) { | |
host = parseHost(authority); | |
} else { | |
host = parseHost(std::string(authority.begin() + i + 1, authority.end())); | |
} | |
if (i > authority.size()) { | |
return; | |
} | |
auto userInfo = std::string(authority.begin(), authority.begin() + i); | |
if (!validUserinfo(userInfo)) { | |
throw std::invalid_argument("invalid userinfo"); | |
} | |
auto idx = userInfo.find(':'); | |
if (idx == std::string::npos) { | |
userInfo = unescape(userInfo, encodeUserPassword); | |
ui = UserInfo(userInfo); | |
} else { | |
auto username = std::string(userInfo.begin(), userInfo.begin() + idx); | |
auto password = std::string(userInfo.begin() + idx, userInfo.end()); | |
ui.Username = unescape(username, encodeUserPassword); | |
ui.Password = unescape(username, encodeUserPassword); | |
} | |
} | |
URL URL::Parse(std::string url) { | |
URL u; | |
std::string frag; | |
auto hashIndex = url.find("#"); | |
if (hashIndex != std::string::npos) { | |
frag = std::string(url.begin() + hashIndex, url.end()); | |
url = std::string(url.begin(), url.begin() + hashIndex); | |
} | |
u.setFragment(frag); | |
if (stringContainsCTLByte(url)) { | |
throw std::invalid_argument("invalid url: string contains control bytes"); | |
} | |
if (url == "*") { | |
u.Path = "*"; | |
return u; | |
} | |
auto rest = url; | |
u.Scheme = getScheme(rest); | |
std::transform(u.Scheme.begin(), u.Scheme.end(), u.Scheme.begin(), [](unsigned char c) -> unsigned char { return std::tolower(c); }); | |
if (!rest.empty() > 0 && rest[rest.size() - 1] == '?') { | |
u.ForceQuery = true; | |
rest.pop_back(); | |
} else { | |
auto idx = rest.find("?"); | |
if (idx != std::string::npos) { | |
u.RawQuery = std::string(rest.begin() + idx, rest.end()); | |
rest = std::string(rest.begin(), rest.begin() + idx); | |
} | |
} | |
if (!rest.empty() && rest[0] != '/') { | |
if (!u.Scheme.empty()) { | |
// We consider rootless paths per RFC 3986 as opaque. | |
u.Opaque = rest; | |
return u; | |
} | |
} | |
if (!u.Scheme.empty() || (rest.find("///") != 0 && rest.find("//") == 0)) { | |
auto authority = std::string(rest.begin() + 2, rest.end()); | |
rest = ""; | |
int i = authority.find("/"); | |
if (i != std::string::npos) { | |
rest = std::string(authority.begin() + i, authority.end()); | |
authority = std::string(authority.begin(), authority.begin() + i); | |
} | |
parseAuthority(authority, u.User, u.Host); | |
} | |
u.setPath(rest); | |
return u; | |
} | |
void URL::setFragment(const std::string &f) { | |
Fragment = unescape(f, encodeFragment); | |
auto escf = escape(Fragment, encodeFragment); | |
RawFragment = (escf == f) ? "" : f; | |
} | |
void URL::setPath(const std::string &p) { | |
Path = unescape(p, encodePath); | |
auto escp = escape(Path, encodePath); | |
RawPath = (escp == p) ? "" : p; | |
} | |
std::string URL::PathEscape(const std::string &path) { return escape(path, encodePath); } | |
std::string URL::PathUnescape(const std::string &path) { return unescape(path, encodePath); } | |
std::string URL::QueryEscape(const std::string &query) { return escape(query, encodeQueryComponent); } | |
std::string URL::QueryUnescape(const std::string &query) { return unescape(query, encodeQueryComponent); } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <algorithm> | |
#include <map> | |
#include <stdexcept> | |
#include "common/common.h" | |
// Based on Golang Implementation | |
// MIT | |
namespace ProtoRock { | |
namespace Http { | |
struct UserInfo { | |
std::string Username; | |
std::string Password; | |
UserInfo() {} | |
UserInfo(const std::string &u) : Username(u){}; | |
}; | |
struct URL { | |
private: | |
void setFragment(const std::string &); | |
void setPath(const std::string &); | |
public: | |
std::string Scheme; | |
// encoded opaque data | |
std::string Opaque; | |
// username and password information | |
UserInfo User; | |
// host or host:port | |
std::string Host; | |
// path (relative paths may omit leading slash) | |
std::string Path; | |
// encoded path hint (see EscapedPath method) | |
std::string RawPath; | |
// append a query ('?') even if RawQuery is empty | |
bool ForceQuery = false; | |
// encoded query values, without '?' | |
std::string RawQuery; | |
// fragment for references, without '#' | |
std::string Fragment; | |
// encoded fragment hint (see EscapedFragment method) | |
std::string RawFragment; | |
static URL Parse(std::string url); | |
static std::string PathEscape(const std::string &path); | |
static std::string PathUnescape(const std::string &path); | |
static std::string QueryEscape(const std::string &query); | |
static std::string QueryUnescape(const std::string &query); | |
}; | |
} // namespace Http | |
} // namespace ProtoRock |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment