Last active
December 9, 2021 21:16
-
-
Save T99/2a2a53d63ac9e24f6e3264283ef343ea to your computer and use it in GitHub Desktop.
Decode HTML entities in TypeScript
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * A regular expression that matches a single HTML entity (character reference).
 *
 * Three forms are matched:
 *  - named references, e.g. '&amp;'
 *  - decimal numeric references, e.g. '&#1234;'
 *  - hexadecimal numeric references, e.g. '&#x4D2;'
 *
 * Match layout:
 *  - Index 0 (full match): The entire matched HTML entity, including the '&' and ';' characters.
 *  - 1st Capture Group: The inner text of the entity (everything inside the '&' and ';' characters).
 *  - 2nd Capture Group: The decimal digits of the entity, if it is a decimal numeric reference
 *    (i.e. '&#1234;' --> '1234'). This group is undefined for named and hexadecimal references.
 *
 * The expression deliberately carries no 'g' flag, so it holds no 'lastIndex' state between uses; build a global
 * copy with `new RegExp(HTML_ENTITY_REGEX, "g")` when every occurrence must be visited.
 */
const HTML_ENTITY_REGEX: RegExp = /&(#([0-9]+)|#[xX][0-9a-fA-F]+|[a-zA-Z0-9]+);/;
/**
 * Returns the input string after having converted all recognized HTML entities to their respective unicode characters.
 *
 * Every substring matched by HTML_ENTITY_REGEX is handed to JSDOM for decoding. Matches that JSDOM leaves untouched
 * are decoded manually only when they are decimal numeric references; anything else (for example an unknown named
 * entity) is left in the output exactly as it appeared in the input.
 *
 * @param {string} content The string (potentially) containing HTML entities to decode.
 * @returns {string} The input string after having converted all recognized HTML entities to their respective unicode
 * characters.
 */
function decodeHTMLEntities(content: string): string {
    // Created on first use and then shared by every entity decoded during this call, so that input without any
    // entities never touches JSDOM and input with many entities does not build one fragment per entity.
    let textArea: HTMLTextAreaElement | undefined;

    /**
     * Replacement callback for String.prototype.replace: decodes one matched HTML entity.
     *
     * @param {string} entity The entire matched entity, including the '&' and ';' characters.
     * @param {string} [_innerText] The 1st capture group (unused; present only to keep the argument positions).
     * @param {string} [decimalValue] The 2nd capture group: the decimal digits of a numeric reference, if any.
     * @returns {string} The decoded character(s), or the original entity text if it could not be decoded.
     */
    function decodeHTMLEntity(entity: string, _innerText?: string, decimalValue?: string): string {
        // Try the <textarea> trick to get JSDOM's virtual DOM to decode the HTML entity for us.
        if (textArea === undefined) {
            textArea = JSDOM.fragment("<textarea>").children[0] as HTMLTextAreaElement;
        }
        textArea.innerHTML = entity;

        // The <textarea> trick successfully decoded the entity.
        if (textArea.value !== entity) return textArea.value;

        // Only a decimal numeric reference ('&#1234;') can be decoded by hand. Named entities that JSDOM did not
        // recognize (including digit-only names such as '&65;') are not character references and stay unchanged.
        if (decimalValue === undefined) return entity;

        const codePoint: number = parseInt(decimalValue, 10);

        // String.fromCodePoint throws a RangeError above U+10FFFF, so out-of-range values are left undecoded.
        if (isNaN(codePoint) || codePoint > 0x10FFFF) return entity;

        // fromCodePoint (unlike fromCharCode) yields a proper surrogate pair for code points above U+FFFF.
        return String.fromCodePoint(codePoint);
    }

    // HTML_ENTITY_REGEX is not global; a global copy is required for replace() to visit every entity.
    return content.replace(new RegExp(HTML_ENTITY_REGEX, "g"), decodeHTMLEntity);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment