Last active
January 31, 2023 19:12
-
-
Save jimniels/073beba189ff69ca1b9ef93dc4228a26 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// The idea explained here:
// https://blog.jim-nielsen.com/2021/feed-urls/ | |
// deno run --allow-net parse-feed-urls.js | |
import { DOMParser } from "https://deno.land/x/[email protected]/deno-dom-wasm.ts"; | |
import * as path from "https://deno.land/[email protected]/path/mod.ts"; | |
import { parse } from "https://cdn.skypack.dev/tldts"; | |
// Fetch every feed URL from the OPML file, break each one apart, and print
// summary statistics about names, locations, extensions, and domains.
const urls = await getFeedUrls();
const urlCollection = createUrlCollection(urls);

console.log("\nURLs parsed: %s\n", urlCollection.length);

// Most common resource names (the filename without its extension).
const topNames = getTop5ByKey(urlCollection, "name").map(
  ([name, count], i) => `${i + 1}. ${name}: ${count}`
);
log("Resource name", topNames.join("\n"));

// How many feeds live at the site root vs. nested under a path.
const rootResources = urlCollection.filter(({ dir }) => dir === "/");
const nestedResources = urlCollection.filter(({ dir }) => dir !== "/");
log(
  "Resource location",
  `Root /*: ${rootResources.length}`,
  `Nested /**/*: ${nestedResources.length}`
);

console.log();

// Feeds whose path has a file extension, grouped by extension, with the most
// common filenames listed beneath each extension.
const urlCollectionWithExt = urlCollection.filter(({ ext }) => ext);
const topExts = getTop5ByKey(urlCollectionWithExt, "ext").map(
  ([ext, count], i) => {
    const sameExt = urlCollectionWithExt.filter((u) => u.ext === ext);
    const topBases = getTop5ByKey(sameExt, "base").map(
      ([base, baseCount]) => `  ${base}: ${baseCount}`
    );
    return `${i + 1}. *${ext}: ${count}\n` + topBases.join("\n");
  }
);
log(`Resource with an extension: ${urlCollectionWithExt.length}`, topExts.join("\n"));

// Feeds with no extension, e.g. `/feed/`.
const urlCollectionWithoutExt = urlCollection.filter(({ ext }) => !ext);
const topBareNames = getTop5ByKey(urlCollectionWithoutExt, "base").map(
  ([base, count], i) => `${i + 1}. /${base}/: ${count}`
);
log(
  `Resource without an extension: ${urlCollectionWithoutExt.length}`,
  topBareNames.join("\n")
);

// Most common registrable domains hosting the feeds.
const topDomains = getTop5ByKey(urlCollection, "domain").map(
  ([domain, count], i) => `${i + 1}. ${domain}: ${count}`
);
log("Domains", topDomains.join("\n"));
/**
 * @typedef UrlCollection
 * @property {string} url - The original URL pulled from the XML
 * @property {string} domain - The domain derived from the URL
 * @property {string} base
 * @property {string} dir
 * @property {string} ext
 * @property {string} name
 *
 * base, dir, ext, and name are all extracted from Node's `parse()`
 * Here's how two example urls would break down:
 * 1. `/path/to/feed.xml`
 * 2. `/feed/`
 *    base 1) `feed.xml` 2) `feed`
 *    ext  1) `.xml`     2) ``
 *    dir  1) `/path/to` 2) `/`
 *    name 1) `feed`     2) `feed`
 */
/**
 * Fetch the OPML file of web-dev feeds and extract every feed URL.
 * @returns {Promise<Array.<string>>} the value of each `xmlUrl` attribute
 * @throws {Error} when fetching or parsing fails (wraps the original error
 *   via `cause`)
 */
async function getFeedUrls() {
  try {
    // NOTE: the source previously pointed at `raw.githubusercontent.com`, a scrape
    // artifact; the canonical host for raw GitHub content is below.
    const res = await fetch(
      "https://raw.githubusercontent.com/simevidas/web-dev-feeds/master/feeds.opml"
    );
    const text = await res.text();
    // Should be parsing this as 'application/xml' but it's not supported yet.
    // Parsing as HTML works for our purposes now tho — note HTML parsing
    // lowercases attribute names, hence `xmlurl` rather than `xmlUrl`.
    const doc = new DOMParser().parseFromString(text, "text/html");
    const urls = [];
    doc.querySelectorAll("[xmlurl]").forEach((el) => {
      urls.push(el.getAttribute("xmlurl"));
    });
    return urls;
  } catch (e) {
    // The old `.catch` logged and returned `undefined`, which made the caller
    // crash later on `urls.map(...)`. Fail loudly with context instead.
    throw new Error("Failed to fetch or parse feed URLs", { cause: e });
  }
}
/**
 * Take some URLs and turn them into a collection of records we can count on.
 * @param {Array.<string>} urls
 * @returns {UrlCollection}
 */
function createUrlCollection(urls) {
  return urls.map((url) => {
    const parsedUrl = new URL(url);
    // path.parse splits the pathname into base/dir/ext/name (see the
    // UrlCollection typedef for worked examples).
    const parts = path.parse(parsedUrl.pathname);
    return {
      url,
      // tldts extracts the registrable domain from the origin
      domain: parse(parsedUrl.origin).domain,
      base: parts.base,
      dir: parts.dir,
      ext: parts.ext,
      name: parts.name,
    };
  });
}
/**
 * Log a titled section in a consistent pattern: the title, a divider line,
 * each extra argument on its own line, then a trailing blank line.
 * @param {string} title
 * @param {Array.<string>} args
 */
function log(title, ...args) {
  const sectionLines = [
    title,
    "----------------------------",
    args.join("\n"),
    "",
  ];
  for (const line of sectionLines) {
    console.log(line);
  }
}
/**
 * Tally how often each value of `key` appears in the collection and return
 * the five most frequent as [value, count] pairs, most frequent first.
 * @param {UrlCollection} collection
 * @param {string} key
 * @returns {Array.<[string, number]>}
 */
function getTop5ByKey(collection, key) {
  const counts = {};
  for (const item of collection) {
    const value = item[key];
    counts[value] = (counts[value] ?? 0) + 1;
  }
  // Stable sort keeps first-seen order among equal counts.
  return Object.entries(counts)
    .sort((a, b) => b[1] - a[1])
    .slice(0, 5);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment