Last active
May 25, 2023 01:53
-
-
Save chrisveness/bcb00eb717e6382c5608 to your computer and use it in GitHub Desktop.
Utf8 string encode/decode using regular expressions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Encodes a JavaScript (UTF-16) string as UTF-8, returning a string in which every
 * character holds one byte value (U+0000 - U+00FF).
 *
 * Code points U+0080 - U+07FF are encoded in 2 chars, U+0800 - U+FFFF in 3 chars, and
 * U+10000 - U+10FFFF (stored in the input as a surrogate pair) in 4 chars. ASCII
 * characters are passed through unchanged.
 *
 * An unpaired surrogate is encoded on its own as a 3-char sequence rather than rejected.
 *
 * For well-formed strings this can be achieved in JavaScript by
 * unescape(encodeURIComponent(str)), but this approach may be useful in other languages.
 *
 * @param   {string} unicodeString - Unicode string to be encoded as UTF-8.
 * @returns {string} UTF8-encoded string.
 * @throws  {TypeError} If unicodeString is not a string.
 */
function utf8Encode(unicodeString) {
    if (typeof unicodeString !== 'string') throw new TypeError('parameter ‘unicodeString’ is not a string');

    // Single pass: the surrogate-pair alternative is listed first so that a pair is consumed
    // as one match (and encoded as one 4-byte sequence) before the catch-all alternative can
    // see its halves. No 'u' flag: the character classes must match individual code units.
    return unicodeString.replace(
        /[\ud800-\udbff][\udc00-\udfff]|[\u0080-\uffff]/g,
        (c) => {
            // For a matched pair this is the combined code point; otherwise the single code unit.
            const cp = c.codePointAt(0);

            if (cp < 0x800) {
                // U+0080 - U+07FF => 2 bytes 110yyyyy, 10zzzzzz
                return String.fromCharCode(0xc0 | (cp >> 6), 0x80 | (cp & 0x3f));
            }
            if (cp < 0x10000) {
                // U+0800 - U+FFFF => 3 bytes 1110xxxx, 10yyyyyy, 10zzzzzz
                return String.fromCharCode(
                    0xe0 | (cp >> 12),
                    0x80 | ((cp >> 6) & 0x3f),
                    0x80 | (cp & 0x3f)
                );
            }
            // U+10000 - U+10FFFF => 4 bytes 11110www, 10xxxxxx, 10yyyyyy, 10zzzzzz
            return String.fromCharCode(
                0xf0 | (cp >> 18),
                0x80 | ((cp >> 12) & 0x3f),
                0x80 | ((cp >> 6) & 0x3f),
                0x80 | (cp & 0x3f)
            );
        }
    );
}
/**
 * Decodes a UTF-8 encoded string (one byte value per character) back into Unicode characters.
 *
 * 2-byte, 3-byte and 4-byte sequences are recognised; 4-byte sequences are decoded only when
 * they yield a code point in U+10000 - U+10FFFF, and are otherwise left unchanged. Characters
 * that are not part of a recognised sequence are passed through unchanged.
 *
 * For well-formed input this can be achieved in JavaScript by decodeURIComponent(escape(str)),
 * but this approach may be useful in other languages.
 *
 * @param   {string} utf8String - UTF-8 string to be decoded back to Unicode.
 * @returns {string} Decoded Unicode string.
 * @throws  {TypeError} If utf8String is not a string.
 */
function utf8Decode(utf8String) {
    if (typeof utf8String !== 'string') throw new TypeError('parameter ‘utf8String’ is not a string');

    // All sequence lengths are decoded in ONE pass (longest alternative first), so the output
    // of one decoding step is never rescanned and mistaken for another encoded sequence.
    return utf8String.replace(
        /[\u00f0-\u00f4][\u0080-\u00bf]{3}|[\u00e0-\u00ef][\u0080-\u00bf]{2}|[\u00c0-\u00df][\u0080-\u00bf]/g,
        (seq) => {
            const lead = seq.charCodeAt(0);
            // Payload (low 6 bits) of the continuation byte at index i.
            const tail = (i) => seq.charCodeAt(i) & 0x3f;

            if (seq.length === 2) {
                // 110yyyyy, 10zzzzzz
                return String.fromCharCode(((lead & 0x1f) << 6) | tail(1));
            }
            if (seq.length === 3) {
                // 1110xxxx, 10yyyyyy, 10zzzzzz
                return String.fromCharCode(((lead & 0x0f) << 12) | (tail(1) << 6) | tail(2));
            }
            // 11110www, 10xxxxxx, 10yyyyyy, 10zzzzzz
            const cp = ((lead & 0x07) << 18) | (tail(1) << 12) | (tail(2) << 6) | tail(3);
            // Out-of-range values cannot be represented (String.fromCodePoint would throw
            // above U+10FFFF), so such sequences are returned untouched.
            return (cp >= 0x10000 && cp <= 0x10ffff) ? String.fromCodePoint(cp) : seq;
        }
    );
}
Hi, nice work, but you don't handle Codepoints above 'U+FFFF' correctly, see https://github.com/TSlivede/utf8-regex-encode-decode-js/
@DhamoR If your String only contains ASCII and octal UTF-8 sequences you can do this:
str="\\320\\223...";
utf8Decode(str.replace(/\\[0-9][0-9][0-9]/g,function(s){return String.fromCharCode(parseInt(s.substr(1),8));}))
If your string can already contain UTF-16 chars you need to first encode those into UTF-8:
str="\\320\\223"+String.fromCodePoint(0x1F60E)+"...";
utf8Decode(utf8Encode(str).replace(/\\[0-9][0-9][0-9]/g,function(s){return String.fromCharCode(parseInt(s.substr(1),8));}))
this does not support emoji :(
Excellent work !
Sorry, touch of a newbie question here. When I try to use the code, I get errors on lines 22/43 with the function(c); call. Can someone tell me where to get this piece of code? Thanks!
I need to make it url by adding -
muito bom.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, this is a brilliant piece of code. I have a string containing the octal representation of a UTF-8 string (like "\320\223...") that I want to decode to a Unicode string. How do I do that?