RavenHursT/url-extract-root-domain.js

ComradeVanti · 2023-11-19T17:39:12Z

@innocentamadi that also does not work depending on how many ending segments the domain has. For www.fhstp.ac.at (my school domain), it would only give you ac.at

DebapriyaSengupta28 · 2024-02-20T18:50:58Z

this works -

/**
 * Reduces a URL to a short host name, keeping a leading "www." if the host
 * had one.
 *
 * This is a length heuristic, not a public-suffix lookup: when the host has
 * more than two labels, the last three are kept if the final label is at most
 * three characters long (so "example.co.uk" survives intact), otherwise the
 * last two. As a consequence "a.b.example.com" yields "b.example.com".
 *
 * @param {string} url - URL or bare host, with or without an http(s) scheme.
 * @returns {string} The shortened host, or the scheme/www-stripped input
 *   unchanged when it cannot be parsed as a URL.
 */
function extractDomain(url) {
  // Remove the protocol if present.
  const withoutProtocol = url.replace(/^https?:\/\//i, '');

  // Only a "www." at the very start of the host counts; a "www." that merely
  // appears later (e.g. inside the path) must not be re-added to the result.
  const hadWwwPrefix = /^www\./i.test(withoutProtocol);
  let domain = withoutProtocol.replace(/^www\./i, '');

  // Let the URL parser isolate the hostname (drops path, query, port, ...).
  try {
    domain = new URL(`http://${domain}`).hostname;
  } catch (error) {
    // Deliberate best effort: unparsable input is returned as-is.
    return domain;
  }

  const parts = domain.split('.');
  if (parts.length > 2) {
    // A short final label suggests a two-label suffix such as co.uk or com.au.
    const keptLabels = parts[parts.length - 1].length <= 3 ? 3 : 2;
    domain = parts.slice(-keptLabels).join('.');
  }

  return hadWwwPrefix ? `www.${domain}` : domain;
}

// Test cases: print the result for a few sample URLs; each trailing comment is
// the output the author expects for that call.
console.log(extractDomain("https://studio.youtube.com/channel/UCntj-iDUfMBvc8_peZWbQ4g/editing/sections")); // Output: studio.youtube.com
console.log(extractDomain("https://www.youtube.com/")); // Output: www.youtube.com
console.log(extractDomain("https://www.youtube.com/channel/UCntj-iDUfMBvc8_peZWbQ4g")); // Output: www.youtube.com

pesseyjulien · 2024-04-11T13:16:57Z

thanks

ldhasson · 2024-12-27T15:34:31Z

Is this the answer, though, for "root" domains? I implemented the following, which seems to work in all the cases I am aware of:

/**
 * Returns the root domain of an absolute URL using a label-count heuristic.
 *
 * - One or two labels (e.g. "xyz.com"): the hostname is returned unchanged.
 * - Exactly three labels (e.g. "www.xyz.com"): the last two are returned,
 *   i.e. "xyz.com".
 * - More than three labels (e.g. "www.xyz.co.jp"): the last three are assumed
 *   to be the root domain, i.e. "xyz.co.jp".
 *
 * This is not a public-suffix lookup, so hosts such as "xyz.co.jp" (three
 * labels) or "a.b.xyz.com" (four labels) are not reduced correctly.
 *
 * @param {string} url - Absolute URL including its scheme.
 * @returns {string|null} The root domain, or null when `url` cannot be parsed
 *   (the parse error is logged with console.error).
 */
function getRootDomain(url) {
  try {
    const hostname = new URL(url).hostname;
    // Split on a literal dot. String.prototype.split takes the separator
    // verbatim, so a regex-style "\\." would never match anything.
    const labels = hostname.split(".");
    if (labels.length < 3) {
      return hostname;
    }
    const keptLabels = labels.length === 3 ? 2 : 3;
    return labels.slice(-keptLabels).join(".");
  } catch (error) {
    console.error("Invalid URL:", error);
    return null;
  }
}

RavenHursT · 2024-12-27T16:24:42Z

I wrote this years ago..

At this point, I would either look for a library to solve this problem or just ask AI..

Here's what Grok spit out:

// Second-level labels that, under a two-letter country-code TLD, commonly form
// a two-label public suffix (co.uk, com.au, ac.at, co.jp, ...). This is a small
// heuristic set, not the full Public Suffix List.
const SECOND_LEVEL_SUFFIX_LABELS = new Set([
    'ac', 'co', 'com', 'edu', 'go', 'gov', 'gv', 'mil',
    'ne', 'net', 'nom', 'or', 'org', 'sch',
]);

/**
 * Returns the registrable ("root") domain of a URL or bare host name.
 *
 * The scheme, a leading "www." and anything from the first "/", "?" or "#"
 * onwards are dropped. Three labels are kept when the host ends in a
 * two-letter country code preceded by a common second-level label
 * (e.g. "example.co.uk"); otherwise the last two labels are kept.
 *
 * This is a heuristic; for complete coverage use a library backed by the
 * Public Suffix List.
 *
 * @param {string} url - URL or host name, with or without an http(s) scheme.
 * @returns {string} The root domain; hosts with one or two labels are
 *   returned as-is.
 */
function getRootDomain(url) {
    // Strip scheme and leading "www.", then cut off path, query and fragment.
    const host = url
        .replace(/^(?:https?:\/\/)?(?:www\.)?/i, '')
        .split(/[\/?#]/)[0];

    const parts = host.split('.');
    if (parts.length <= 2) {
        return host;
    }

    const topLevel = parts[parts.length - 1].toLowerCase();
    const secondLevel = parts[parts.length - 2].toLowerCase();

    // Only a country-code TLD combined with a known second-level label is
    // treated as a two-label suffix; the number of subdomains is irrelevant.
    const hasTwoLabelSuffix =
        topLevel.length === 2 && SECOND_LEVEL_SUFFIX_LABELS.has(secondLevel);

    return parts.slice(hasTwoLabelSuffix ? -3 : -2).join('.');
}

// Test cases: each trailing comment is the output the author intended for that
// input — run the calls to confirm the actual results.
console.log(getRootDomain("www.xyz.com"));        // Outputs: xyz.com
console.log(getRootDomain("blog.www.xyz.co.jp")); // Outputs: xyz.co.jp
console.log(getRootDomain("example.co.uk"));      // Outputs: example.co.uk
console.log(getRootDomain("site.com"));           // Outputs: site.com
console.log(getRootDomain("example.edu.au"));     // Outputs: example.edu.au
console.log(getRootDomain("example.website"));    // Outputs: example.website

sudosoul · 2025-05-13T18:55:34Z

@RavenHursT Except the test cases don't output what the comments say they would; they actually print:

"xyz.com"
"xyz.co.jp"
"co.uk"
"site.com"
"edu.au"
"example.website"

RavenHursT/url-extract-root-domain.js

ComradeVanti commented Nov 19, 2023

Uh oh!

DebapriyaSengupta28 commented Feb 20, 2024

Uh oh!

pesseyjulien commented Apr 11, 2024

Uh oh!

ldhasson commented Dec 27, 2024

Uh oh!

RavenHursT commented Dec 27, 2024

Uh oh!

sudosoul commented May 13, 2025

Uh oh!

	/**
	 * Extracts the last two dot-separated labels of the host from an absolute
	 * http(s) URL, e.g. "https://www.example.com/page" -> "example.com".
	 *
	 * No public-suffix handling is done, so "https://example.co.uk" yields
	 * "co.uk".
	 *
	 * @param {string} url - Absolute URL starting with http:// or https://.
	 * @returns {string|null} The last two host labels, or null when `url` is
	 *   not an http(s) URL.
	 */
	function extractRootDomain(url) {
		// The host runs up to the first "/", "?" or "#", or to the end of the
		// string. The alternation must be a bare "|": an escaped "\|" would
		// demand a literal pipe character and reject every ordinary URL.
		const match = url.match(/^https?:\/\/([^\/?#]+)(?:[\/?#]|$)/i);
		if (match === null) {
			return null;
		}
		return match[1].split('.').slice(-2).join('.');
	}