Last active
May 13, 2020 10:05
-
-
Save Oldes/01fb15df1e7d2abd04ed83ebdaedb0cb to your computer and use it in GitHub Desktop.
HTML entities
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Red [ | |
Title: "HTML entities" | |
Purpose: "To decode HTML entities in a text" | |
Author: "Oldes" | |
Date: 12-May-2020 | |
Version: 1.0.2 | |
License: MIT | |
Usage: [ | |
"Test: ♠ & ¢ <a> and Δδ ¾" = | |
decode-html-entities {Test: ♠ & ¢ <a> and Δδ ¾} | |
] | |
TODO: {Encoder?} | |
] | |
html-entities: #( | |
;@@ https://eastmanreference.com/list-of-html-entity-names-and-numbers | |
;-- Punctuation, programming, and other common symbols | |
"lt" #"^(3C)" ; 60 Open tag | |
"gt" #"^(3E)" ; 62 Close tag | |
"quot" #"^(22)" ; 34 Double quote | |
"apos" #"^(27)" ; 39 Apostrophe / single quote | |
"amp" #"^(26)" ; 38 Ampersand | |
"nbsp" #"^(A0)" ; 160 Space (non-breaking) | |
"brvbar" #"^(A6)" ; 166 Broken bar | |
"iexcl" #"^(A1)" ; 161 Upside down exclamation mark | |
"iquest" #"^(BF)" ; 191 Upside down question mark | |
"sect" #"^(A7)" ; 167 Section symbol | |
"uml" #"^(A8)" ; 168 Umlaut | |
"ordf" #"^(AA)" ; 170 Feminine ordinal indicator | |
"ordm" #"^(BA)" ; 186 Masculine ordinal indicator | |
"laquo" #"^(AB)" ; 171 Open double angles | |
"raquo" #"^(BB)" ; 187 Close double angles | |
"not" #"^(AC)" ; 172 Not sign | |
"shy" #"^(AD)" ; 173 Soft hyphen | |
"macr" #"^(AF)" ; 175 Overline | |
"acute" #"^(B4)" ; 180 Acute accent | |
"para" #"^(B6)" ; 182 Pilcrow (paragraph) | |
"middot" #"^(B7)" ; 183 Georgian comma | |
"cedil" #"^(B8)" ; 184 Cedilla | |
;-- Math symbols | |
"minus" #"^(2212)" ; 8722 Minus sign (subtraction) | |
"times" #"^(D7)" ; 215 Multiplication sign | |
"divide" #"^(F7)" ; 247 Division sign | |
"plusmn" #"^(B1)" ; 177 Plus / minus | |
"le" #"^(2264)" ; 8804 Less or equal | |
"ge" #"^(2265)" ; 8805 Greater or equal | |
"sup1" #"^(B9)" ; 185 Superscript 1 | |
"sup2" #"^(B2)" ; 178 Superscript 2 | |
"sup3" #"^(B3)" ; 179 Superscript 3 | |
"frac14" #"^(BC)" ; 188 1/4 | |
"frac12" #"^(BD)" ; 189 1/2 | |
"frac34" #"^(BE)" ; 190 3/4 | |
"forall" #"^(2200)" ; 8704 For all | |
"part" #"^(2202)" ; 8706 Part | |
"exist" #"^(2203)" ; 8707 Exist | |
"empty" #"^(2205)" ; 8709 Empty | |
"nabla" #"^(2207)" ; 8711 Nabla | |
"isin" #"^(2208)" ; 8712 Is in | |
"notin" #"^(2209)" ; 8713 Not in | |
"ni" #"^(220B)" ; 8715 Ni | |
"prod" #"^(220F)" ; 8719 Product | |
"sum" #"^(2211)" ; 8721 Sum | |
"lowast" #"^(2217)" ; 8727 Asterisk (Lowast) | |
"radic" #"^(221A)" ; 8730 Square root | |
"prop" #"^(221D)" ; 8733 Proportional to | |
"infin" #"^(221E)" ; 8734 Infinity | |
"ang" #"^(2220)" ; 8736 Angle | |
"and" #"^(2227)" ; 8743 And | |
"or" #"^(2228)" ; 8744 Or | |
"cap" #"^(2229)" ; 8745 Cap | |
"cup" #"^(222A)" ; 8746 Cup | |
"int" #"^(222B)" ; 8747 Integral | |
"there4" #"^(2234)" ; 8756 Therefore | |
"sim" #"^(223C)" ; 8764 Similar to | |
"cong" #"^(2245)" ; 8773 Congurent to | |
"asymp" #"^(2248)" ; 8776 Almost equal | |
"ne" #"^(2260)" ; 8800 Not equal | |
"equiv" #"^(2261)" ; 8801 Equivalent | |
"sub" #"^(2282)" ; 8834 Subset of | |
"sup" #"^(2283)" ; 8835 Superset of | |
"nsub" #"^(2284)" ; 8836 Not subset of | |
"sube" #"^(2286)" ; 8838 Subset or equal | |
"supe" #"^(2287)" ; 8839 Superset or equal | |
"oplus" #"^(2295)" ; 8853 Circled plus | |
"otimes" #"^(2297)" ; 8855 Circled times | |
;-- Unit of measure symbols | |
"deg" #"^(B0)" ; 176 Degrees | |
"micro" #"^(B5)" ; 181 Micro | |
;-- Copyright, registered, trademark | |
"copy" #"^(A9)" ; 169 Copyright | |
"reg" #"^(AE)" ; 174 Registered trademark | |
"trade" #"^(2122)" ; 8482 Trademark | |
;-- Currency symbols | |
"curren" #"^(A4)" ; 164 Currency sign | |
"cent" #"^(A2)" ; 162 Cents | |
"pound" #"^(A3)" ; 163 British pounds | |
"euro" #"^(20AC)" ; 8364 Euro | |
"yen" #"^(A5)" ; 165 Yen | |
;-- Greek alphabet | |
"Alpha" #"^(391)" ; 913 UPPERCASE ALPHA | |
"alpha" #"^(3B1)" ; 945 lowercase alpha | |
"Beta" #"^(392)" ; 914 UPPERCASE BETA | |
"beta" #"^(3B2)" ; 946 lowercase beta | |
"Gamma" #"^(393)" ; 915 UPPERCASE GAMMA | |
"gamma" #"^(3B3)" ; 947 lowercase gamma | |
"Delta" #"^(394)" ; 916 UPPERCASE DELTA | |
"delta" #"^(3B4)" ; 948 lowercase delta | |
"Epsilon" #"^(395)" ; 917 UPPERCASE EPSILON | |
"epsilon" #"^(3B5)" ; 949 lowercase epsilon | |
"Zeta" #"^(396)" ; 918 UPPERCASE ZETA | |
"zeta" #"^(3B6)" ; 950 lowercase zeta | |
"Eta" #"^(397)" ; 919 UPPERCASE ETA | |
"eta" #"^(3B7)" ; 951 lowercase eta | |
"Theta" #"^(398)" ; 920 UPPERCASE THETA | |
"theta" #"^(3B8)" ; 952 lowercase theta | |
"thetasym" #"^(3D1)" ; 977 alternate lowercase theta | |
"Iota" #"^(399)" ; 921 UPPERCASE IOTA | |
"iota" #"^(3B9)" ; 953 lowercase iota | |
"Kappa" #"^(39A)" ; 922 UPPERCASE KAPPA | |
"kappa" #"^(3BA)" ; 954 lowercase kappa | |
"Lambda" #"^(39B)" ; 923 UPPERCASE LAMBDA | |
"lambda" #"^(3BB)" ; 955 lowercase lambda | |
"Mu" #"^(39C)" ; 924 UPPERCASE MU | |
"mu" #"^(3BC)" ; 956 lowercase mu | |
"Nu" #"^(39D)" ; 925 UPPERCASE NU | |
"nu" #"^(3BD)" ; 957 lowercase nu | |
"Xi" #"^(39E)" ; 926 UPPERCASE XI | |
"xi" #"^(3BE)" ; 958 lowercase xi | |
"Omicron" #"^(39F)" ; 927 UPPERCASE OMICRON | |
"omicron" #"^(3BF)" ; 959 lowercase omicron | |
"Pi" #"^(3A0)" ; 928 UPPERCASE PI | |
"pi" #"^(3C0)" ; 960 lowercase pi | |
"piv" #"^(3D6)" ; 982 alternative lowercase pi | |
"Rho" #"^(3A1)" ; 929 UPPERCASE RHO | |
"rho" #"^(3C1)" ; 961 lowercase rho | |
"Sigma" #"^(3A3)" ; 931 UPPERCASE SIGMA | |
"sigma" #"^(3C3)" ; 963 lowercase sigma | |
"sigmaf" #"^(3C2)" ; 962 final form lowercase sigma | |
"Tau" #"^(3A4)" ; 932 UPPERCASE TAU | |
"tau" #"^(3C4)" ; 964 lowercase tau | |
"Upsilon" #"^(3A5)" ; 933 UPPERCASE UPSILON | |
"upsilon" #"^(3C5)" ; 965 lowercase upsilon | |
"upsih" #"^(3D2)" ; 978 alternative lowercase upsilon | |
"Phi" #"^(3A6)" ; 934 UPPERCASE PHI | |
"phi" #"^(3C6)" ; 966 lowercase phi | |
"Chi" #"^(3A7)" ; 935 UPPERCASE CHI | |
"chi" #"^(3C7)" ; 967 lowercase chi | |
"Psi" #"^(3A8)" ; 936 UPPERCASE PSI | |
"psi" #"^(3C8)" ; 968 lowercase psi | |
"Omega" #"^(3A9)" ; 937 UPPERCASE OMEGA | |
"omega" #"^(3C9)" ; 969 lowercase omega | |
;-- Arrows | |
"larr" #"^(2190)" ; 8592 Left arrow | |
"uarr" #"^(2191)" ; 8593 Up arrow | |
"rarr" #"^(2192)" ; 8594 Right arrow | |
"darr" #"^(2193)" ; 8595 Down arrow | |
"harr" #"^(2194)" ; 8596 Left & right arrow | |
"crarr" #"^(21B5)" ; 8629 Carriage return arrow | |
;-- Spade, club, heart, diamond | |
"spades" #"^(2660)" ; 9824 Spade | |
"clubs" #"^(2663)" ; 9827 Club | |
"hearts" #"^(2665)" ; 9829 Heart | |
"diams" #"^(2666)" ; 9830 Diamond | |
;-- Accented letters | |
"Agrave" #"^(C0)" ; 192 CAPITAL A GRAVE ACCENT | |
"agrave" #"^(E0)" ; 224 lowercase a grave accent | |
"Aacute" #"^(C1)" ; 193 CAPITAL A ACUTE ACCENT | |
"aacute" #"^(E1)" ; 225 lowercase a acute accent | |
"Acirc" #"^(C2)" ; 194 CAPITAL A CIRCUMFLEX ACCENT | |
"acirc" #"^(E2)" ; 226 lowercase a circumflex accent | |
"Atilde" #"^(C3)" ; 195 CAPITAL A TILDE ACCENT | |
"atilde" #"^(E3)" ; 227 lowercase a tilde accent | |
"Auml" #"^(C4)" ; 196 CAPITAL A UMLAUT ACCENT | |
"auml" #"^(E4)" ; 228 lowercase a umlaut accent | |
"Aring" #"^(C5)" ; 197 CAPITAL A RING ABOVE ACCENT | |
"aring" #"^(E5)" ; 229 lowercase a ring accent | |
"AElig" #"^(C6)" ; 198 CAPITAL AE | |
"aelig" #"^(E6)" ; 230 lowercase ae | |
"Ccedil" #"^(C7)" ; 199 CAPITAL C CEDILLA ACCENT | |
"ccedil" #"^(E7)" ; 231 lowercase c cedilla accent | |
"Egrave" #"^(C8)" ; 200 CAPITAL E GRAVE ACCENT | |
"egrave" #"^(E8)" ; 232 lowercase e grave accent | |
"Eacute" #"^(C9)" ; 201 CAPITAL E ACUTE ACCENT | |
"eacute" #"^(E9)" ; 233 lowercase e acute accent | |
"Ecirc" #"^(CA)" ; 202 CAPITAL E CIRCUMFLEX ACCENT | |
"ecirc" #"^(EA)" ; 234 lowercase e circumflex accent | |
"ecirc" #"^(EA)" ; 234 lowercase e circumflex accent | |
"Euml" #"^(CB)" ; 203 CAPITAL E UMLAUT ACCENT | |
"euml" #"^(EB)" ; 235 lowercase e umlaut accent | |
"Igrave" #"^(CC)" ; 204 CAPITAL I GRAVE ACCENT | |
"igrave" #"^(EC)" ; 236 lowercase i grave accent | |
"Iacute" #"^(CD)" ; 205 CAPITAL I ACUTE ACCENT | |
"iacute" #"^(ED)" ; 237 lowercase i acute accent | |
"Icirc" #"^(CE)" ; 206 CAPITAL I CIRCUMFLEX ACCENT | |
"icirc" #"^(EE)" ; 238 lowercase i circumflex accent | |
"Iuml" #"^(CF)" ; 207 CAPITAL I UMLAUT ACCENT | |
"iuml" #"^(EF)" ; 239 lowercase i umlaut accent | |
"ETH" #"^(D0)" ; 208 CAPITAL ICELANDIC ETH | |
"eth" #"^(F0)" ; 240 lowercase Icelandic eth | |
"Ntilde" #"^(D1)" ; 209 CAPITAL N TILDE ACCENT | |
"ntilde" #"^(F1)" ; 241 lowercase n tilde accent | |
"Ograve" #"^(D2)" ; 210 CAPITAL O GRAVE ACCENT | |
"ograve" #"^(F2)" ; 242 lowercase o grave accent | |
"Oacute" #"^(D3)" ; 211 CAPITAL O ACUTE ACCENT | |
"oacute" #"^(F3)" ; 243 lowercase o acute accent | |
"Ocirc" #"^(D4)" ; 212 CAPITAL O CIRCUMFLEX ACCENT | |
"ocirc" #"^(F4)" ; 244 lowercase o circumflex accent | |
"Otilde" #"^(D5)" ; 213 CAPITAL O TILDE ACCENT | |
"otilde" #"^(F5)" ; 245 lowercase o tilde accent | |
"Ouml" #"^(D6)" ; 214 CAPITAL O UMLAUT ACCENT | |
"ouml" #"^(F6)" ; 246 lowercase o umlaut accent | |
"Oslash" #"^(D8)" ; 216 CAPITAL O SLASH ACCENT | |
"oslash" #"^(F8)" ; 248 lowercase o slash | |
"Ugrave" #"^(D9)" ; 217 CAPITAL U GRAVE ACCENT | |
"ugrave" #"^(F9)" ; 249 lowercase u grave accent | |
"Uacute" #"^(DA)" ; 218 CAPITAL U ACUTE ACCENT | |
"uacute" #"^(FA)" ; 250 lowercase u acute accent | |
"Ucirc" #"^(DB)" ; 219 CAPITAL U CIRCUMFLEX ACCENT | |
"ucirc" #"^(FB)" ; 251 lowercase u circumflex accent | |
"Uuml" #"^(DC)" ; 220 CAPITAL U UMLAUT | |
"uuml" #"^(FC)" ; 252 lowercase u umlaut accent | |
"Yacute" #"^(DD)" ; 221 CAPITAL Y ACUTE ACCENT | |
"yacute" #"^(FD)" ; 253 lowercase y acute accent | |
"yuml" #"^(FF)" ; 255 lowercase y umlaut accent | |
"THORN" #"^(DE)" ; 222 CAPITAL ICELANDIC THORN | |
"thorn" #"^(FE)" ; 254 lowercase Icelandic thorn | |
"szlig" #"^(DF)" ; 223 lowercase German sharp s | |
;-- Miscellaneous | |
"bull" #"^(2022)" ; 8226 Bullet | |
"hellip" #"^(2026)" ; 8230 Horizontal ellipsis | |
"fnof" #"^(192)" ; 402 lowercase Latin f with hook | |
"perp" #"^(22A5)" ; 8869 Perpendicular | |
"sdot" #"^(22C5)" ; 8901 Dot operator | |
"OElig" #"^(152)" ; 338 UPPERCASE LATIN OE LIGATURE | |
"oelig" #"^(153)" ; 339 lowercase Latin oe ligature | |
"Scaron" #"^(160)" ; 352 UPPERCASE S WITH CARON | |
"scaron" #"^(161)" ; 353 lowercase s with caron | |
"Yuml" #"^(178)" ; 376 CAPITAL Y WITH DIAERES | |
"circ" #"^(2C6)" ; 710 Circumflex accent | |
"tilde" #"^(2DC)" ; 732 Tilde (different from the tilde my keyboard generates) | |
"ndash" #"^(2013)" ; 8211 En dash | |
"mdash" #"^(2014)" ; 8212 Em dash | |
"lsquo" #"^(2018)" ; 8216 Left single quotation mark | |
"rsquo" #"^(2019)" ; 8217 Right single quotation mark | |
"sbquo" #"^(201A)" ; 8218 Single low-9 quotation mark | |
"ldquo" #"^(201C)" ; 8220 Left double quotation mark | |
"rdquo" #"^(201D)" ; 8221 Right double quotation mark | |
"bdquo" #"^(201E)" ; 8222 Double low-9 quotation mark | |
"dagger" #"^(2020)" ; 8224 Dagger | |
"Dagger" #"^(2021)" ; 8225 Double dagger | |
"permil" #"^(2030)" ; 8240 Per mille | |
"prime" #"^(2032)" ; 8242 Minutes (Degrees) | |
"Prime" #"^(2033)" ; 8243 Seconds (Degrees) | |
"lsaquo" #"^(2039)" ; 8249 Single left angle quotation | |
"rsaquo" #"^(2039)" ; 8249 Single right angle quotation | |
"oline" #"^(203E)" ; 8254 Overline | |
"lceil" #"^(2308)" ; 8968 Left ceiling | |
"rceil" #"^(2309)" ; 8969 Right ceiling | |
"lfloor" #"^(230A)" ; 8970 Left floor | |
"rfloor" #"^(230B)" ; 8971 Right floor | |
"loz" #"^(25CA)" ; 9674 Lozenge | |
"ensp" #"^(2002)" ; 8194 En space | |
"emsp" #"^(2003)" ; 8195 Em space | |
"thinsp" #"^(2009)" ; 8201 Thin space | |
"zwnj" #"^(200C)" ; 8204 Zero width non-joiner | |
"zwj" #"^(200D)" ; 8205 Zero width joiner | |
"lrm" #"^(200E)" ; 8206 Left-to-right mark | |
"rlm" #"^(200F)" ; 8207 Right-to-left mark | |
) | |
context [ | |
any-except-&: complement charset "&" | |
alphanum: charset [#"0" - #"9" #"a" - #"z" #"A" - #"Z"] | |
digits: charset [#"0" - #"9"] | |
set 'decode-html-entities func [ | |
{Creates a new string with possible HTML entities converted to chars} | |
val [string!] {Input string} | |
/local out s e char | |
][ | |
out: make string! length? val | |
parse val [ | |
any [ | |
s: some any-except-& e: ( append/part out s e ) | |
| #"&" [ | |
#"#" copy char 1 4 digits #";" ( | |
append out to char! to integer! char | |
) | |
| s: copy char 1 10 alphanum #";" e: ( | |
char: select/case html-entities char | |
unless char [ char: #"&" e: :s ] | |
append out char | |
) :e | |
| (append out #"&") | |
] | |
] | |
] | |
out | |
] | |
] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment