Created
January 14, 2020 04:45
-
-
Save biancadanforth/87e7df167a3ce33e47aaf605a1379e02 to your computer and use it in GitHub Desktop.
Benchmarking RegExp vs URL class for domain matching
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Motivation: For a list of domains, is RegExp domain matching or URL class domain matching
 * more performant in Firefox's JS engine? I used the website `jsben.ch` to measure.
 * Spoiler: The URL class approach is 11% faster. Can this approach work with the
 * `matchSubdomains` option?
 */
// Setup block
// Domains to match against; read by both benchmark code blocks below.
const domains = ["nytimes.com", "www.npr.org"];
// Sample URL fed to both matching approaches (a subdomain of the first entry in `domains`).
const url = "https://www.nytimes.com/2020/01/13/us/politics/russian-hackers-burisma-ukraine.html?action=click&module=Top%20Stories&pgtype=Homepage";
// Boilerplate block
/**
 * This module provides utilities for matching URLs against domain names.
 */
/**
 * Class for testing whether a URL matches a set of domains.
 * Currently implemented with the native RegExp over the full URL, which gives good performance.
 * We might be able to speed this up by parsing the URL and then only matching domains.
 */
class UrlMatcher {
  /**
   * Create a URL matcher.
   * The regular expression is compiled once here, so repeated `testUrl` calls reuse it.
   * @param {string[]} domains - The set of domains to match against.
   * @param {boolean} [matchSubdomains=true] - Whether to match subdomains of domains in the set.
   * @throws {SyntaxError} If the generated pattern string is not a valid regular expression.
   */
  constructor(domains, matchSubdomains = true) {
    this.regExp = new RegExp(createUrlRegexString(domains, matchSubdomains));
  }

  /**
   * Test whether a URL matches a domain in the set of domains.
   * @param {string} url - The URL to test.
   * @returns {boolean} Whether the URL matched the compiled pattern.
   */
  testUrl(url) {
    // The pattern has no `g`/`y` flag, so `test` keeps no state between calls.
    return this.regExp.test(url);
  }
}
/**
 * Generate a regular expression string for matching a URL against a set of domains.
 * Will match http and https protocols. Currently case sensitive.
 * Every regular expression metacharacter in a domain is escaped, so each domain is
 * matched literally. An empty set of domains yields a valid pattern that matches no URL.
 * @param {string[]} domains - The set of domains to match against.
 * @param {boolean} [matchSubdomains=true] - Whether to match subdomains of domains in the set.
 * @returns {string} A regular expression string.
 */
function createUrlRegexString(domains, matchSubdomains = true) {
  // Zero or more "label." prefixes in front of the listed domain.
  const subdomainPrefix = matchSubdomains ? "(?:[A-Za-z0-9\\-]+\\.)*" : "";
  const escapedDomains = [];
  for (const domain of domains) {
    // Escape all metacharacters, not only ".", so a domain can never alter the pattern.
    escapedDomains.push(domain.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
  }
  // "(?!)" is an always-failing lookahead: with no domains, nothing may match
  // (an empty alternation would otherwise match every host-less URL).
  const alternation = escapedDomains.length > 0 ? escapedDomains.join("|") : "(?!)";
  return "^(?:http|https)://" + subdomainPrefix + "(?:" + alternation + ")(?:$|/.*)";
}
/**
 * Generate an array of match patterns for matching a URL against a set of domains.
 * Will match http and https protocols.
 * For each domain, the http pattern is emitted first, followed by the https pattern.
 * @param {string[]} domains - The set of domains to match against.
 * @param {boolean} [matchSubdomains=true] - Whether to match subdomains of domains in the set.
 * @returns {string[]} An array of match patterns.
 */
function createUrlMatchPatternArray(domains, matchSubdomains = true) {
  // The host prefix does not depend on the domain, so compute it once.
  const hostPrefix = matchSubdomains ? "*." : "";
  const matchPatterns = [];
  for (const domain of domains) {
    matchPatterns.push("http://" + hostPrefix + domain + "/*");
    matchPatterns.push("https://" + hostPrefix + domain + "/*");
  }
  return matchPatterns;
}
// RegExp domain matching (code block 1)
// Compiles the pattern for `domains`, then tests the sample `url`; the result is
// discarded because only the work performed is being measured.
const urlMatcher = new UrlMatcher(domains);
urlMatcher.testUrl(url);
// URL class domain matching (code block 2)
/**
 * Test whether a URL matches one of the entries in the file-level `domains` list,
 * using the URL class instead of a regular expression.
 * Only http and https URLs can match. The comparison uses the hostname (no port) and
 * requires either an exact match or a match on a "." label boundary, so
 * "notnytimes.com" does not match "nytimes.com".
 * @param {string} url - The URL to test.
 * @param {boolean} [matchSubdomains=true] - Whether to match subdomains of domains in the set.
 * @returns {boolean} Whether the URL matches a domain; false if the URL cannot be parsed.
 */
function checkForMatch(url, matchSubdomains = true) {
  let urlObj;
  try {
    urlObj = new URL(url);
  } catch (error) {
    // An unparseable URL is an expected "no match", mirroring RegExp.test returning false.
    return false;
  }
  // The protocol does not depend on the domain, so check it once up front.
  if (urlObj.protocol !== "http:" && urlObj.protocol !== "https:") {
    return false;
  }
  // `hostname` excludes the port, unlike `host`.
  const hostname = urlObj.hostname;
  for (const domain of domains) {
    if (hostname === domain) {
      return true;
    }
    // Prefixing "." anchors the suffix test to a label boundary.
    if (matchSubdomains && hostname.endsWith("." + domain)) {
      return true;
    }
  }
  return false;
}
// Benchmark invocation for code block 2; the result is discarded.
checkForMatch(url);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment