Last active
December 19, 2015 21:49
-
-
Save hayes/6022839 to your computer and use it in GitHub Desktop.
convert a string to a byte array
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Encode a string as an array of UTF-8 byte values.
 *
 * The string is walked one UTF-16 code unit at a time; fixedCharCodeAt()
 * turns each position into a full code point (joining surrogate pairs) and
 * reports `false` for low-surrogate positions, which are skipped here because
 * the preceding high surrogate already produced the whole code point.
 *
 * @param {string} str - text to encode
 * @returns {number[]} the UTF-8 bytes, each an integer in the range 0-255
 * @throws {Error} whatever fixedCharCodeAt() throws for a high surrogate that
 *   is not followed by a low surrogate
 */
function to_bytes(str) {
  var arr = []
    , code

  for(var i = 0, len = str.length; i < len; ++i) {
    code = fixedCharCodeAt(str, i)

    if(code === false) {
      // low-surrogate position: nothing to emit for this index
      continue
    }

    if(code <= 0x7F) {
      // single byte: 0xxxxxxx
      arr.push(code)
    } else if(code <= 0x7FF) {
      // two bytes: 110xxxxx 10xxxxxx
      arr.push(
          0xC0 | (code >> 6)
        , 0x80 | (code & 0x3F)
      )
    } else if(code <= 0xFFFF) {
      // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
      arr.push(
          0xE0 | (code >> 12)
        , 0x80 | ((code >> 6) & 0x3F)
        , 0x80 | (code & 0x3F)
      )
    } else {
      // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      arr.push(
          0xF0 | (code >> 18)
        , 0x80 | ((code >> 12) & 0x3F)
        , 0x80 | ((code >> 6) & 0x3F)
        , 0x80 | (code & 0x3F)
      )
    }
  }

  return arr
}
// from https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/charCodeAt#Example_2.3A_Fixing_charCodeAt_to_handle_non-Basic-Multilingual-Plane_characters_if_their_presence_earlier_in_the_string_is_unknown
// a version of charCodeAt that also handles non-Basic-Multilingual-Plane characters
/**
 * Return the Unicode code point that starts at UTF-16 index `idx` of `str`.
 *
 * @param {string} str - string to read from
 * @param {number} [idx=0] - index of the UTF-16 code unit to inspect
 * @returns {number|false} the code point at `idx`, combining the surrogate
 *   pair when `idx` is on a high surrogate; `false` when `idx` is on a low
 *   surrogate (whether or not a high surrogate precedes it), so a loop over
 *   every index can skip that position; NaN when `idx` is past the end of
 *   the string, as returned by charCodeAt
 * @throws {Error} when `idx` is on a high surrogate that is not immediately
 *   followed by a low surrogate (including at the end of the string)
 */
function fixedCharCodeAt(str, idx) {
  idx = idx || 0

  var code = str.charCodeAt(idx)
    , low
    , hi

  if(0xD800 <= code && code <= 0xDBFF) {
    hi = code
    low = str.charCodeAt(idx + 1)

    // a range check rather than isNaN(): NaN (end of string) fails both
    // comparisons, and so does any following unit that is not a low surrogate
    if(!(0xDC00 <= low && low <= 0xDFFF)) {
      throw new Error('High surrogate not followed by low surrogate in fixedCharCodeAt()')
    }

    return ((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000
  }

  if(0xDC00 <= code && code <= 0xDFFF) {
    return false
  }

  return code
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment