Created
February 23, 2015 22:58
-
-
Save RavenHursT/fe8a95a59109096ac1f8 to your computer and use it in GitHub Desktop.
Javascript extract root domain from URL
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Extracts the "root" domain (the last two dot-separated host labels) from an
 * absolute http(s) URL.
 *
 * This is a purely positional heuristic: multi-label public suffixes such as
 * "co.uk" are NOT recognised, so "https://www.example.co.uk" yields "co.uk".
 *
 * @param {string} url - Absolute URL starting with "http://" or "https://".
 * @returns {string} The last two labels of the host, e.g. "example.com".
 * @throws {TypeError} If `url` is not an absolute http(s) URL.
 */
var extractRootDomain = function (url) {
  // Capture the authority component: everything between "scheme://" and the
  // first "/", "?" or "#" (or the end of the string).
  const match = /^https?:\/\/([^\/?#]+)(?:[\/?#]|$)/i.exec(url);
  if (match === null) {
    // Fail with an explicit message instead of dereferencing a null match.
    throw new TypeError(`Not an absolute http(s) URL: ${url}`);
  }

  // Drop optional "user:pass@" userinfo and a trailing ":port" so that only
  // the host name takes part in the label split.
  const host = match[1].replace(/^.*@/, '').replace(/:\d+$/, '');

  return host.split('.').slice(-2).join('.');
};
thanks
Is this really the answer for "root" domains, though? I implemented the following, which seems to work in all cases I am aware of:
/**
 * Returns the root domain of an absolute URL using a label-count heuristic.
 *
 * - Fewer than three labels (e.g. "xyz.com", "localhost"): the hostname as-is.
 * - Exactly three labels (e.g. "www.xyz.com"): the last two, i.e. "xyz.com".
 * - More than three labels (e.g. "www.xyz.co.jp"): the last three, i.e.
 *   "xyz.co.jp".
 *
 * The heuristic knows nothing about real public suffixes, so a bare
 * "xyz.co.jp" (three labels) yields "co.jp" and "a.b.xyz.com" yields
 * "b.xyz.com". IPv4 address literals are returned unchanged.
 *
 * @param {string} url - Absolute URL, parseable by the `URL` constructor.
 * @returns {string|null} The root domain, or null if `url` cannot be parsed
 *   (the parse error is logged via `console.error`).
 */
function getRootDomain(url) {
  try {
    const hostname = new URL(url).hostname;

    // An IPv4 literal has dots but no domain labels; never truncate it.
    if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(hostname)) {
      return hostname;
    }

    // String#split takes a literal separator, not a regex source string:
    // "\\." would look for a backslash followed by a dot and never match.
    const labels = hostname.split(".");
    if (labels.length < 3) {
      return hostname;
    }

    const keep = labels.length === 3 ? 2 : 3;
    return labels.slice(-keep).join(".");
  } catch (error) {
    console.error("Invalid URL:", error);
    return null;
  }
}
I wrote this years ago..
At this point, I would either look for a library to solve this problem or just ask AI..
Here's what Grok spit out:
/**
 * Returns the registrable ("root") domain of a URL or bare host name.
 *
 * The scheme, a leading "www.", any path/query/fragment and a trailing port
 * are removed first. The result is then:
 * - the host itself when it has one or two labels;
 * - the last three labels when the host ends in a two-letter (country-code)
 *   TLD preceded by a common second-level category label, e.g. "co.uk",
 *   "edu.au", "co.jp";
 * - the last two labels otherwise.
 *
 * This is a heuristic, not a complete answer: exact results require the
 * Public Suffix List, which is too large to embed here.
 *
 * @param {string} url - URL (http/https) or bare host name.
 * @returns {string} The root domain, e.g. "example.co.uk" or "xyz.com".
 */
function getRootDomain(url) {
  // Second-level labels that registries commonly place under a country-code
  // TLD to form a two-label public suffix.
  const secondLevelLabels = new Set(['ac', 'co', 'com', 'edu', 'gov', 'net', 'org']);

  const host = url
    // Remove any protocol and leading "www." from the URL.
    .replace(/^(?:https?:\/\/)?(?:www\.)?/i, '')
    // Keep only the authority: cut at the first "/", "?" or "#".
    .split(/[\/?#]/, 1)[0]
    // Drop an explicit port.
    .replace(/:\d+$/, '');

  const parts = host.split('.');

  // One or two labels: nothing to trim.
  if (parts.length <= 2) {
    return host;
  }

  const tld = parts[parts.length - 1].toLowerCase();
  const secondLevel = parts[parts.length - 2].toLowerCase();

  // Country-code TLDs are exactly two letters; only then can the label before
  // the TLD be part of the public suffix.
  const hasTwoLabelSuffix = /^[a-z]{2}$/.test(tld) && secondLevelLabels.has(secondLevel);

  return parts.slice(hasTwoLabelSuffix ? -3 : -2).join('.');
}
// Test cases: log the computed root domain for each sample host, in order.
for (const sampleHost of [
  "www.xyz.com", // expected: xyz.com
  "blog.www.xyz.co.jp", // expected: xyz.co.jp
  "example.co.uk", // expected: example.co.uk
  "site.com", // expected: site.com
  "example.edu.au", // expected: example.edu.au
  "example.website", // expected: example.website
]) {
  console.log(getRootDomain(sampleHost));
}
@RavenHursT Except the test cases didn't output what it said it would, but actually printed:
"xyz.com"
"xyz.co.jp"
"co.uk"
"site.com"
"edu.au"
"example.website"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
this works -
/**
 * Extracts the host name from a URL: strips an optional http(s) scheme and
 * everything from the first "/", "?" or "#" onward.
 *
 * Subdomains are kept, so "https://www.youtube.com/x" yields
 * "www.youtube.com".
 *
 * @param {string} url - URL, with or without an "http://" / "https://" prefix.
 * @returns {string} The host portion of `url`.
 */
function extractDomain(url) {
  // Remove protocol if it exists. The slashes must be escaped inside the
  // regex literal; unescaped, "//" would start a line comment.
  const withoutProtocol = url.replace(/^https?:\/\//i, '');

  // Keep only what precedes the path, query string or fragment.
  return withoutProtocol.split(/[\/?#]/, 1)[0];
}
// Test cases: log the extracted host for each sample URL, in order.
for (const sampleUrl of [
  "https://studio.youtube.com/channel/UCntj-iDUfMBvc8_peZWbQ4g/editing/sections", // expected: studio.youtube.com
  "https://www.youtube.com/", // expected: www.youtube.com
  "https://www.youtube.com/channel/UCntj-iDUfMBvc8_peZWbQ4g", // expected: www.youtube.com
]) {
  console.log(extractDomain(sampleUrl));
}