-
-
Save RavenHursT/fe8a95a59109096ac1f8 to your computer and use it in GitHub Desktop.
/**
 * Returns the last two dot-separated labels of the host portion of a URL.
 *
 * Accepts either an http(s) URL or a bare host such as "localhost". This is a
 * purely positional heuristic: it always keeps two labels, so hosts under a
 * multi-label suffix are truncated (e.g. "http://www.google.co.uk/" yields
 * "co.uk"). A port, if present, is left attached to the host.
 *
 * @param {string} url - An http(s) URL or a bare host name.
 * @returns {string|null} The last two host labels joined by ".", or null when
 *   the input is not a string, is empty, or uses a scheme other than http(s).
 */
var extractRootDomain = function (url) {
  if (typeof url !== 'string') {
    return null;
  }

  // The scheme is optional so bare hosts work; the negative lookahead rejects
  // any other explicit scheme (e.g. "ftp://") instead of misreading "ftp:" as
  // the host.
  var hostPattern = /^(?:https?:\/\/|(?![a-z][a-z0-9+.-]*:\/\/))([^\/?#]+)(?:[\/?#]|$)/i;
  var match = hostPattern.exec(url);

  // Previously a failed match was indexed directly and threw a TypeError.
  if (match === null) {
    return null;
  }

  return match[1].split('.').slice(-2).join('.');
};
extractRootDomain("localhost")
Uncaught TypeError: Cannot read property '1' of null at extractRootDomain (<anonymous>:2:61) at <anonymous>:1:1
Probably better to change this to parse the given url first:
const parsed = new URL(window.location.href)
Then just do string and array manipulation based on what kinds of domains you're working with (-2 for .com's, -3 for .co.uk's, etc.):
parsed.hostname.split('.').slice(-2).join('.')
Not working on domains like "http://google.co.jp".
Does not work with https://www.mesdroitssociaux.gouv.fr/accueil/
Does not work with
https://www.mesdroitssociaux.gouv.fr/accueil/
You can try this one: https://github.com/scrapingapi/get-root-domain
Does not work with
https://www.mesdroitssociaux.gouv.fr/accueil/
You can try this one: https://github.com/scrapingapi/get-root-domain
Merci, mais c'est la même chose ... It would be simpler and more reliable to use an SLD list as a base and apply a regex to the URL by parsing the SLD list.
We just need to have a repo with a list which will be always up to date.
A one-liner:
/**
 * Returns the last two dot-separated labels of a URL's hostname.
 *
 * @param {string} [url=""] - Absolute URL to parse.
 * @returns {string} The last two hostname labels joined by ".".
 * @throws {TypeError} If `url` cannot be parsed by the URL constructor
 *   (this includes the default empty string).
 */
const getRoot = (url = "") => {
  const { hostname } = new URL(url);
  const labels = hostname.split('.');
  const lastTwo = labels.slice(-2);
  return lastTwo.join('.');
};
@innocentamadi that also does not work depending on how many ending segments the domain has. For www.fhstp.ac.at
(my school domain), it would only give you ac.at
this works -
/**
 * Reduces a URL (or bare host) to a shortened host name, keeping a leading
 * "www." if the host originally started with one.
 *
 * Heuristic: when the host has more than two labels, the last three are kept
 * if the final label is at most three characters long, otherwise the last two.
 * Note that this length test also matches "com"/"org"/"net", so a three-label
 * host such as "studio.youtube.com" is returned in full.
 *
 * @param {string} url - URL with or without an http(s) scheme.
 * @returns {string} The shortened host, or the scheme/"www."-stripped input
 *   unchanged when it cannot be parsed as a URL.
 */
function extractDomain(url) {
  // Remove the protocol if present.
  const withoutProtocol = url.replace(/^https?:\/\//i, '');

  // Remember whether the host itself starts with "www." so the prefix is only
  // restored in that case (not when "www." merely appears in a path or query).
  const hadWwwPrefix = /^www\./i.test(withoutProtocol);
  let domain = hadWwwPrefix ? withoutProtocol.slice('www.'.length) : withoutProtocol;

  // Let the URL parser isolate the hostname (drops path, query, port, ...).
  try {
    domain = new URL(`http://${domain}`).hostname;
  } catch (error) {
    // Deliberate best effort: on a parsing failure return the stripped input.
    return domain;
  }

  const parts = domain.split('.');
  if (parts.length > 2) {
    // A short final label is treated as part of a two-level suffix
    // (co.uk, com.au, ...), so three labels are kept instead of two.
    const lastLabel = parts[parts.length - 1];
    const keep = lastLabel.length <= 3 ? 3 : 2;
    domain = parts.slice(-keep).join('.');
  }

  return hadWwwPrefix ? `www.${domain}` : domain;
}
// Demo calls: log the result of extractDomain for a few sample URLs.
for (const sampleUrl of [
  "https://studio.youtube.com/channel/UCntj-iDUfMBvc8_peZWbQ4g/editing/sections", // Expected output: studio.youtube.com
  "https://www.youtube.com/", // Expected output: www.youtube.com
  "https://www.youtube.com/channel/UCntj-iDUfMBvc8_peZWbQ4g", // Expected output: www.youtube.com
]) {
  console.log(extractDomain(sampleUrl));
}
thanks
Is this the answer though for "root" domains? I implemented the following, which seems to work in all cases I am aware of:
/**
 * Returns the root domain of a URL using a label-count heuristic.
 *
 * - Fewer than three labels (e.g. "xyz.com", "localhost"): the hostname as is.
 * - Exactly three labels (e.g. "www.xyz.com"): the last two, i.e. "xyz.com".
 * - More than three labels (e.g. "www.xyz.co.jp"): the last three, i.e.
 *   "xyz.co.jp".
 *
 * The heuristic only counts labels; it does not know real public suffixes, so
 * "xyz.co.jp" (three labels) yields "co.jp" and "a.b.xyz.com" yields
 * "b.xyz.com".
 *
 * @param {string} url - Absolute URL, including its scheme.
 * @returns {string|null} The root domain, or null if `url` cannot be parsed
 *   (the parse error is logged with console.error).
 */
function getRootDomain(url) {
  try {
    const hostname = new URL(url).hostname;
    // Split on a literal dot. The earlier "\\." argument was a regex-style
    // escape that String.prototype.split treats as plain text, so it never
    // matched and the hostname was never split.
    const labels = hostname.split(".");
    if (labels.length < 3) {
      return hostname;
    }
    const keep = labels.length === 3 ? 2 : 3;
    return labels.slice(-keep).join(".");
  } catch (error) {
    console.error("Invalid URL:", error);
    return null;
  }
}
I wrote this years ago..
At this point, I would either look for a library to solve this problem or just ask AI..
Here's what Grok spit out:
/**
 * Heuristically extracts the root (registrable) domain from a URL or host.
 *
 * Steps: strip an optional http(s) scheme and a leading "www.", drop any port,
 * path, query or fragment, then keep
 *   - the last three labels when the host ends in a two-letter TLD preceded by
 *     a common second-level category label (co.uk, edu.au, gouv.fr, ac.at, ...),
 *   - the last two labels otherwise.
 *
 * This is an approximation: the second-level label list below is small and
 * hand-picked. Exact results for every suffix require the Public Suffix List.
 *
 * @param {string} url - URL or bare host name.
 * @returns {string} The root domain; hosts with one or two labels are returned
 *   unchanged (after the stripping described above).
 */
function getRootDomain(url) {
  // Labels that commonly form a two-level suffix together with a two-letter
  // country-code TLD.
  const secondLevelLabels = new Set(['ac', 'co', 'com', 'edu', 'gouv', 'gov', 'mil', 'net', 'org']);

  const host = url
    // Remove any protocol and leading "www.".
    .replace(/^(?:https?:\/\/)?(?:www\.)?/i, '')
    // Remove port, path, query and fragment so they cannot leak into labels.
    .replace(/[\/?#:].*$/, '');

  const labels = host.split('.');
  if (labels.length <= 2) {
    return host;
  }

  const topLevel = labels[labels.length - 1].toLowerCase();
  const secondLevel = labels[labels.length - 2].toLowerCase();
  const hasTwoLevelSuffix = topLevel.length === 2 && secondLevelLabels.has(secondLevel);

  return labels.slice(hasTwoLevelSuffix ? -3 : -2).join('.');
}
// Demo calls: log the result of getRootDomain for a few sample hosts.
// The trailing comments record the output the author intended for each input.
for (const sampleHost of [
  "www.xyz.com", // Intended output: xyz.com
  "blog.www.xyz.co.jp", // Intended output: xyz.co.jp
  "example.co.uk", // Intended output: example.co.uk
  "site.com", // Intended output: site.com
  "example.edu.au", // Intended output: example.edu.au
  "example.website", // Intended output: example.website
]) {
  console.log(getRootDomain(sampleHost));
}
extractRootDomain('http://www.google.co.uk/blah')
"co.uk"