Created
June 15, 2015 19:12
-
-
Save sleroux/f5f47b8d17da0d4f87b2 to your computer and use it in GitHub Desktop.
TLD Parsing code from Gecko
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Finds the base domain for a host, with the requested number of additional
// parts (labels) kept in front of the effective TLD (eTLD).
//
// Parameters:
//   aHostname        - host to examine. Modified in place: a single trailing
//                      '.' is truncated and is NOT restored, not even on the
//                      error paths that follow the truncation.
//   aAdditionalParts - number of labels to keep in front of the eTLD
//                      (0 yields the eTLD itself, 1 yields eTLD+1, ...).
//                      The special value -1 strips only the leading label of
//                      the host and returns whatever follows it, provided
//                      that remainder is still longer than the bare eTLD.
//   aBaseDomain      - receives the result on success; if the input ended in
//                      a '.', the '.' is appended to the result again.
//
// Returns:
//   NS_OK                                on success.
//   NS_ERROR_INSUFFICIENT_DOMAIN_LEVELS  if the host is empty, or has fewer
//                                        labels than were requested.
//   NS_ERROR_INVALID_ARG                 if the host is ".", ends in "..",
//                                        begins with '.', or contains "..".
//   NS_ERROR_HOST_IS_IP_ADDRESS          if PR_StringToNetAddr() accepts the
//                                        host as an address literal.
//
// NOTE(review): no normalization or character validation is performed in this
// function body; presumably callers normalize the host (to UTF8) before
// calling -- confirm against the call sites.
nsresult
nsEffectiveTLDService::GetBaseDomainInternal(nsCString  &aHostname,
                                             int32_t    aAdditionalParts,
                                             nsACString &aBaseDomain)
{
  if (aHostname.IsEmpty())
    return NS_ERROR_INSUFFICIENT_DOMAIN_LEVELS;

  // chomp any trailing dot, and keep track of it for later
  bool trailingDot = aHostname.Last() == '.';
  if (trailingDot)
    aHostname.Truncate(aHostname.Length() - 1);

  // check the edge cases of the host being '.' or having a second trailing
  // '.', since subsequent checks won't catch it.
  if (aHostname.IsEmpty() || aHostname.Last() == '.')
    return NS_ERROR_INVALID_ARG;

  // IPv4/IPv6 literals have no base domain; report them as such.
  PRNetAddr addr;
  PRStatus result = PR_StringToNetAddr(aHostname.get(), &addr);
  if (result == PR_SUCCESS)
    return NS_ERROR_HOST_IS_IP_ADDRESS;

  // Walk up the domain tree, most specific to least specific, looking for
  // matches at each level. Note that a given level may have multiple
  // attributes (e.g. IsWild() and IsNormal()), so the order of the checks
  // below matters.
  const char *prevDomain = nullptr;
  const char *currDomain = aHostname.get();
  const char *nextDot = strchr(currDomain, '.');
  const char *end = currDomain + aHostname.Length();
  const char *eTLD = currDomain;
  while (true) {
    // sanity check the string we're about to look up: it should not begin
    // with a '.'; this would mean the hostname began with a '.' or had an
    // embedded '..' sequence.
    if (*currDomain == '.')
      return NS_ERROR_INVALID_ARG;

    // perform the hash lookup on the suffix starting at currDomain.
    nsDomainEntry *entry = mHash.GetEntry(currDomain);
    if (entry) {
      if (entry->IsWild() && prevDomain) {
        // wildcard rules imply an eTLD one level inferior to the match.
        eTLD = prevDomain;
        break;
      }
      if (entry->IsNormal() || !nextDot) {
        // specific match, or we've hit the top domain level
        eTLD = currDomain;
        break;
      }
      if (entry->IsException()) {
        // exception rules imply an eTLD one level superior to the match.
        // nextDot is non-null here: the previous check handles !nextDot.
        eTLD = nextDot + 1;
        break;
      }
    }

    if (!nextDot) {
      // we've hit the top domain level; use it by default.
      eTLD = currDomain;
      break;
    }

    // step up one level: drop the leading label of the current suffix.
    prevDomain = currDomain;
    currDomain = nextDot + 1;
    nextDot = strchr(currDomain, '.');
  }

  // Locate the start of the result; 'iter' ends up pointing at it.
  const char *iter;
  if (aAdditionalParts < 0) {
    NS_ASSERTION(aAdditionalParts == -1,
                 "aAdditionalParts can't be negative and different from -1");

    // Skip the first label of the host, never advancing past the eTLD.
    iter = aHostname.get();
    while (iter != eTLD && *iter != '.') {
      ++iter;
    }

    // If we stopped on a '.', step over it to the start of the next label.
    if (iter != eTLD) {
      ++iter;
    }

    // Succeed only if something more than the bare eTLD remains; otherwise
    // aAdditionalParts stays negative and the check below reports failure.
    if (iter != eTLD) {
      aAdditionalParts = 0;
    }
  } else {
    // count off the number of requested domains, walking backwards from the
    // eTLD towards the start of the host.
    const char *begin = aHostname.get();
    iter = eTLD;
    while (iter != begin) {
      // Each '.' consumes one requested part. The '.' seen once the count is
      // already zero marks the boundary: step forward past it and undo the
      // extra decrement so the count ends at exactly zero.
      if (*(--iter) == '.' && aAdditionalParts-- == 0) {
        ++iter;
        ++aAdditionalParts;
        break;
      }
    }
  }

  // A non-zero count means the host ran out of labels before the request
  // could be satisfied (or the -1 mode had nothing to return).
  if (aAdditionalParts != 0)
    return NS_ERROR_INSUFFICIENT_DOMAIN_LEVELS;

  aBaseDomain = Substring(iter, end);

  // add on the trailing dot, if applicable
  if (trailingDot)
    aBaseDomain.Append('.');

  return NS_OK;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment