Validating URLs is hard! On the face of it, you could use a RegEx to match on the URL structure, but that is likely only going to match a narrow subset of URLs, whether intentionally or accidentally.
Beyond having a "valid" URL, an application might be applying extra conditions on top. For example, accepting only `http:` or `https:` URLs, but not `file:` and others. This might be ok depending on the application, but knowing that takes some active design work to figure out. For the purpose of handling user errors, and offering suggestions, it also helps to know in what manner the URL failed validation. For example, we might be able to offer precise suggestions for our application-specific errors, while general URL parsing issues can be out of scope.
To make life easier, let's separate the terms "valid" and "invalid" into a few more specific ones:
- Valid URL: a URL that can be parsed successfully according to the standard (which, uhm, I am reluctant to dive into, or rather I feel unqualified to give you an overview of. For the purpose of this, I will be referring to the WHATWG (Web Hypertext Application Technology Working Group) URL parser, which exists in browsers and some other JS environments. If you have a good overview of URLs and all the specs and addendums, please let me know)
- Invalid URL: a URL that gives a validation error when parsing
- Acceptable URL: a URL that is valid, and passes additional requirements set out by the application
- Unacceptable URL: a URL that is invalid, or does not pass additional requirements set out by the application
Another issue that comes up with URL parsing, then, is ensuring that a client's and server's definition of "acceptable" are in sync. It would be a problem if the server could accept URLs that the client then throws an error for, or even worse, that one client (e.g. a mobile app) finds acceptable, but another one (e.g. a web app) rejects. The more specific the definition of acceptable, the more chance of these issues popping up. This can also happen if the definition of "valid" is shaky. If the front-end uses a RegEx, but the back-end uses an RFC-compliant parser, it can be a recipe for bugs!
For example, suppose we want to verify that a URL is:
- Valid
- Absolute
- Has a top-level domain (TLD)
- Has a scheme of http: or https:
We would first validate the URL (e.g. by trying to parse, and seeing if the parser returns an error). We would then do the additional checks. If we have parsed the URL to something more structured, these checks will be simpler to perform. Finally, we should probably expose the error reason, with whatever granularity we want, or that the parser allows for (not all parsers give granular errors, and not all of them are actionable without more design):
- Invalid URL (a relative URL will be invalid if there is no base URL)
- Must have a top-level domain, for example https://example.com
- Must have a scheme of http: or https:
To make that concrete, with the requirements above:
- Acceptable: https://example.com/about, http://blog.example.org/?page=2
- Unacceptable: example.com/about (relative, so it does not even parse without a base), https://localhost (no TLD), file:///home/user/notes.txt (not http: or https:)
Some of these errors might be "recoverable" automatically. For example, you could assume a scheme of http: if one is missing, though that might be an insecure default (there is a small sketch of this below). You probably only want to do these fixes on the front-end, and have the back-end be stricter: the back-end is shared by every client and is the last place bad data can be caught before it gets stored, so silently "fixing" input there hides errors rather than surfacing them to the user.
Now, we get to JavaScript, and parsing URLs in a nice way. Browsers offer the `URL` constructor as a means of parsing and creating URL objects, which offer structured access to their parts. The constructor will `throw` if a URL is invalid, so we can use it to validate URLs! We can then use the structured access to accept or reject based on application-specific parts.
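Here is a minimal sketch of the checks listed earlier, using the `URL` constructor; the function name, error strings, and return shape are mine, not any standard API:

```js
// A sketch of "parse first, then apply application-specific checks".
// The naive TLD check (split on ".") is discussed further down.
function checkUrl(input) {
  let url;
  try {
    url = new URL(input); // throws a TypeError for invalid (including relative) URLs
  } catch {
    return { ok: false, reason: "Invalid URL" };
  }

  if (url.protocol !== "http:" && url.protocol !== "https:") {
    return { ok: false, reason: "Must have a scheme of http: or https:" };
  }

  if (url.hostname.split(".").length <= 1) {
    return { ok: false, reason: "Must have a top-level domain, for example https://example.com" };
  }

  return { ok: true, url };
}

checkUrl("https://example.com/page"); // { ok: true, url: <URL object> }
checkUrl("example.com/page");         // { ok: false, reason: "Invalid URL" }
checkUrl("file:///tmp/notes.txt");    // { ok: false, reason: "Must have a scheme of http: or https:" }
```

Because no base is passed to the constructor, relative inputs are rejected at the parsing step, which also covers the "absolute" requirement. And the "recoverable" scheme fix mentioned earlier could look something like this on the front-end, with https: assumed as the default instead of http::

```js
// Sketch: if parsing fails, retry once with a default scheme prepended,
// then let the normal checks run on the result.
function withDefaultScheme(input) {
  try {
    new URL(input);
    return input; // already an absolute URL
  } catch {
    return "https://" + input;
  }
}

checkUrl(withDefaultScheme("example.com/page")); // now passes the checks
```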
Using a RegEx is a bad idea, because people might forget cases, or conflate "invalid URL" with "unacceptable for our application". For example (a few of these are shown after this list):
- Non-ASCII characters (valid URL, optionally you could transform to Punycode)
- A lack of a TLD (valid URL, maybe not acceptable in some application)
- Single versus double slash after the protocol
- Absolute versus relative (relative is fine, if you have a base set)
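A few of these, run through the `URL` constructor (the results are shown as comments; the specific hostnames are just my examples):

```js
// Non-ASCII hostnames are valid; the parser exposes the Punycode form
new URL("https://bücher.example/").hostname;          // "xn--bcher-kva.example"

// A missing TLD is still a valid URL, even if an application rejects it
new URL("https://localhost:3000/").hostname;          // "localhost"

// A single slash after a special scheme is quietly normalized
new URL("https:/example.com").href;                   // "https://example.com/"

// Relative URLs parse fine when a base is provided
new URL("/docs?page=2", "https://example.com").href;  // "https://example.com/docs?page=2"
```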
TODO: Talk about "parse, don't validate"
There is an "error" in the TLD check logic. Take `https://www.example` as an example: the TLD is missing, but since there is a subdomain, the length of the array is > 1.
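Concretely, with the naive split-on-dot check from the sketch further up:

```js
new URL("https://www.example").hostname.split(".");    // ["www", "example"], length 2: passes
new URL("https://example").hostname.split(".").length; // 1: correctly rejected
```

The URL standard itself has no notion of a TLD, so a stricter check would probably need to compare against an actual list of known suffixes.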