Last active
August 29, 2015 14:18
-
-
Save JakobOvrum/b7cb2213af46d238a49e to your computer and use it in GitHub Desktop.
std.regex and HTML entities
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import std.regex;
/// Maps an HTML named character reference (the name only, without the
/// leading '&' and trailing ';') to the text it stands for.
immutable string[string] namedEntities;

// Construct the AA at startup: an immutable module-level variable may be
// initialized from a shared module constructor.
shared static this()
{
    namedEntities = [
        // The predefined XML entities; these appear in virtually all HTML.
        "amp": "&",
        "gt": ">",
        "lt": "<",
        "quot": "\"",
        "apos": "'",
        // Example of a name that expands to more than one code point.
        "acE": "\u223E\u0333"
        // Generate this table from the HTML5 JSON document of named entities.
        // Probably best to put it in its own module that is auto-generated in
        // its entirety. String imports could also be used to parse the JSON at
        // compile time, but since the JSON document doesn't change, that is
        // overkill.
    ];
}
/**
 * Replacement callback for `replaceAll`: turns one matched HTML character
 * reference into the UTF-8 text it denotes.
 *
 * Supported forms of the submatch (the part between '&' and ';'):
 *   - "#NNN"            decimal numeric reference
 *   - "#xHHH" / "#XHHH" hexadecimal numeric reference
 *   - "name"            named reference looked up in `namedEntities`
 *
 * Params:
 *   capture = regex captures; capture[0] is the whole match and capture[1]
 *             is the first submatch (the part in parentheses)
 *
 * Returns: the decoded text, or the whole match unchanged when the reference
 *          cannot be decoded (malformed number, invalid code point, or a
 *          name missing from `namedEntities`), so that a single bad
 *          reference does not abort the entire replacement.
 */
string entityToUTF8(Captures!string capture)
{
    import std.algorithm : startsWith;
    import std.conv : ConvException, text, to; // text converts a dchar (single code point) to a UTF-8 string
    import std.utf : isValidDchar;

    auto submatch = capture[1];

    if (submatch.startsWith('#'))
    {
        auto digits = submatch[1 .. $];
        uint radix = 10;
        if (digits.startsWith('x') || digits.startsWith('X'))
        {
            digits = digits[1 .. $];
            radix = 16;
        }

        try
        {
            immutable codePoint = digits.to!uint(radix);
            // Surrogates and values above U+10FFFF cannot be encoded as UTF-8.
            if (isValidDchar(cast(dchar) codePoint))
                return (cast(dchar) codePoint).text();
        }
        catch (ConvException)
        {
            // Empty, non-numeric or overflowing digits: fall through and
            // leave the reference as it was written.
        }
        return capture[0];
    }

    // `in` yields a pointer to the value, or null when the name is unknown;
    // indexing directly would raise a RangeError for unknown names.
    if (auto replacement = submatch in namedEntities)
        return *replacement;
    return capture[0];
}
// Decodes a string containing a named reference, a decimal numeric reference
// and a named reference that expands to two code points.
unittest
{
    // Compile the regex to machine code at compile time
    static entityPattern = ctRegex!`&([#a-zA-Z0-9]+);`;

    // The input must contain the *encoded* references; otherwise the pattern
    // never matches and entityToUTF8 is never exercised.
    auto text = "&lt;3 &#39;apostrophe&#39; &acE;";
    text = text.replaceAll!entityToUTF8(entityPattern);
    assert(text == "<3 'apostrophe' \u223E\u0333");
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment