Last active
December 31, 2020 10:22
-
-
Save LouisdeBruijn/1cc8403a76dae8b2f3e25e95fdd370ac to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import html | |
def unescape_html(text: str) -> str:
    r"""Convert any HTML entities found in text to the characters they represent.

    Thin wrapper that delegates to :func:`html.unescape`.

    Examples of HTML entities found during annotation, with their result::

        html_entities = [
            ("&nbsp;", "\u00a0"),  # non-breaking space, not a plain " "
            ("&amp;", "&"),
            ("&gt;", ">"),
            ("&lt;", "<"),
            ("&le;", "≤"),
            ("&ge;", "≥"),
        ]

    :param text: utterance that may contain HTML entities
    :type text: str
    :return: utterance without HTML entities
    :rtype: str
    """
    return html.unescape(text)
# Demo: unescape an utterance and round-trip the result through JSON.
import json  # needed by the demo below; the file's import block only provides html

# The input carries HTML entities (&nbsp;, &amp;) alongside a Python escape
# (\u2014, an em dash) so the effect of unescape_html is visible.
s = "Ik wil de te naamstelling van &nbsp; mijn betaalrekening &amp; pas aanpassen Mej. \u2014-> Mw."

# json.dumps escapes non-ASCII characters by default, so the non-breaking
# space and the em dash show up as \u00a0 and \u2014 in the dumped string.
json_dumped_s = json.dumps(unescape_html(s))
print(json_dumped_s)
# prints: "Ik wil de te naamstelling van \u00a0 mijn betaalrekening & pas aanpassen Mej. \u2014-> Mw."

# json.loads turns the escapes back into the real characters.
print(json.loads(json_dumped_s))
# prints: Ik wil de te naamstelling van   mijn betaalrekening & pas aanpassen Mej. —-> Mw.
# (the middle character of the gap after "van" is a non-breaking space, U+00A0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment