Last active
January 31, 2023 19:12
-
-
Save jimniels/073beba189ff69ca1b9ef93dc4228a26 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// The idea explained here:
// https://blog.jim-nielsen.com/2021/feed-urls/ | |
// deno run --allow-net parse-feed-urls.js | |
import { DOMParser } from "https://deno.land/x/[email protected]/deno-dom-wasm.ts"; | |
import * as path from "https://deno.land/[email protected]/path/mod.ts"; | |
import { parse } from "https://cdn.skypack.dev/tldts"; | |
// Fetch every feed URL from the OPML file, break each one apart, and print
// summary statistics about names, locations, extensions, and domains.
const urls = await getFeedUrls();
const urlCollection = createUrlCollection(urls);

console.log("\nURLs parsed: %s\n", urlCollection.length);

// Most common resource names (the filename without its extension).
const topNames = getTop5ByKey(urlCollection, "name").map(
  ([name, count], i) => `${i + 1}. ${name}: ${count}`
);
log("Resource name", topNames.join("\n"));

// How many feeds live at the site root vs. nested under a path.
const rootResources = urlCollection.filter(({ dir }) => dir === "/");
const nestedResources = urlCollection.filter(({ dir }) => dir !== "/");
log(
  "Resource location",
  `Root /*: ${rootResources.length}`,
  `Nested /**/*: ${nestedResources.length}`
);

console.log();

// Feeds whose path has a file extension, grouped by extension, with the most
// common filenames listed beneath each extension.
const urlCollectionWithExt = urlCollection.filter(({ ext }) => ext);
const topExts = getTop5ByKey(urlCollectionWithExt, "ext").map(
  ([ext, count], i) => {
    const sameExt = urlCollectionWithExt.filter((u) => u.ext === ext);
    const topBases = getTop5ByKey(sameExt, "base").map(
      ([base, baseCount]) => `  ${base}: ${baseCount}`
    );
    return `${i + 1}. *${ext}: ${count}\n` + topBases.join("\n");
  }
);
log(`Resource with an extension: ${urlCollectionWithExt.length}`, topExts.join("\n"));

// Feeds with no extension, e.g. `/feed/`.
const urlCollectionWithoutExt = urlCollection.filter(({ ext }) => !ext);
const topBareNames = getTop5ByKey(urlCollectionWithoutExt, "base").map(
  ([base, count], i) => `${i + 1}. /${base}/: ${count}`
);
log(
  `Resource without an extension: ${urlCollectionWithoutExt.length}`,
  topBareNames.join("\n")
);

// Most common registrable domains hosting the feeds.
const topDomains = getTop5ByKey(urlCollection, "domain").map(
  ([domain, count], i) => `${i + 1}. ${domain}: ${count}`
);
log("Domains", topDomains.join("\n"));
/**
 * @typedef UrlCollection
 * @property {string} url - The original URL pulled from the XML
 * @property {string} domain - The domain derived from the URL
 * @property {string} base
 * @property {string} dir
 * @property {string} ext
 * @property {string} name
 *
 * base, dir, ext, and name are all extracted from Node's `parse()`
 * Here's how two example urls would break down:
 * 1. `/path/to/feed.xml`
 * 2. `/feed/`
 *    base 1) `feed.xml` 2) `feed`
 *    ext  1) `.xml`     2) ``
 *    dir  1) `/path/to` 2) `/`
 *    name 1) `feed`     2) `feed`
 */
/**
 * Fetch the OPML file of web-dev feeds and extract every feed URL.
 * @returns {Promise<Array.<string>>} the value of each `xmlUrl` attribute
 * @throws {Error} when fetching or parsing fails (wraps the original error
 *   via `cause`)
 */
async function getFeedUrls() {
  try {
    // NOTE: the source previously pointed at `raw.githubusercontent.com`, a scrape
    // artifact; the canonical host for raw GitHub content is below.
    const res = await fetch(
      "https://raw.githubusercontent.com/simevidas/web-dev-feeds/master/feeds.opml"
    );
    const text = await res.text();
    // Should be parsing this as 'application/xml' but it's not supported yet.
    // Parsing as HTML works for our purposes now tho — note HTML parsing
    // lowercases attribute names, hence `xmlurl` rather than `xmlUrl`.
    const doc = new DOMParser().parseFromString(text, "text/html");
    const urls = [];
    doc.querySelectorAll("[xmlurl]").forEach((el) => {
      urls.push(el.getAttribute("xmlurl"));
    });
    return urls;
  } catch (e) {
    // The old `.catch` logged and returned `undefined`, which made the caller
    // crash later on `urls.map(...)`. Fail loudly with context instead.
    throw new Error("Failed to fetch or parse feed URLs", { cause: e });
  }
}
/**
 * Take some URLs and turn them into a collection of records we can count on.
 * @param {Array.<string>} urls
 * @returns {UrlCollection}
 */
function createUrlCollection(urls) {
  return urls.map((url) => {
    const parsedUrl = new URL(url);
    // path.parse splits the pathname into base/dir/ext/name (see the
    // UrlCollection typedef for worked examples).
    const parts = path.parse(parsedUrl.pathname);
    return {
      url,
      // tldts extracts the registrable domain from the origin
      domain: parse(parsedUrl.origin).domain,
      base: parts.base,
      dir: parts.dir,
      ext: parts.ext,
      name: parts.name,
    };
  });
}
/**
 * Log a titled section in a consistent pattern: the title, a divider line,
 * each extra argument on its own line, then a trailing blank line.
 * @param {string} title
 * @param {Array.<string>} args
 */
function log(title, ...args) {
  const sectionLines = [
    title,
    "----------------------------",
    args.join("\n"),
    "",
  ];
  for (const line of sectionLines) {
    console.log(line);
  }
}
/**
 * Tally how often each value of `key` appears in the collection and return
 * the five most frequent as [value, count] pairs, most frequent first.
 * @param {UrlCollection} collection
 * @param {string} key
 * @returns {Array.<[string, number]>}
 */
function getTop5ByKey(collection, key) {
  const counts = {};
  for (const item of collection) {
    const value = item[key];
    counts[value] = (counts[value] ?? 0) + 1;
  }
  // Stable sort keeps first-seen order among equal counts.
  return Object.entries(counts)
    .sort((a, b) => b[1] - a[1])
    .slice(0, 5);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment