-
-
Save bytespider/1007834 to your computer and use it in GitHub Desktop.
// http://en.wikipedia.org/wiki/UTF-8
/**
 * Convert a string to an array of UTF-8 byte values.
 *
 * Only `a` is meant to be supplied by callers. The remaining parameters are
 * scratch variables declared in the signature (a code-golf trick that avoids
 * `var`); any values passed for them are overwritten.
 *
 * Surrogate pairs are combined into one code point and emitted as a single
 * 4-byte sequence. An unpaired surrogate is emitted as a 3-byte sequence.
 *
 * @param {string} a - the string to encode
 * @returns {number[]} the encoded bytes, each in the range 0-255
 */
function stringToByteArray(a,b,c,d,e,f,g,h,i){
  for(
    b = [],                       // output byte array (empty for an empty string)
    g = 128,                      // 0x80: ASCII limit and continuation-byte marker
    e = d = 0                     // e: next write slot in b, d: read index in a
    ;
    d < a.length                  // test the index, so a NUL character does not end the loop
    ;
  ){
    c = a.charCodeAt(d++);        // get the next UTF-16 code unit

    // A high surrogate (0xD800-0xDBFF) followed by a low surrogate
    // (0xDC00-0xDFFF) forms one code point above 0xFFFF.
    // 55232 = 0xD800 - 0x40 folds the +0x10000 offset into the shift;
    // 56320 = 0xDC00.
    if (c >> 10 === 54 && a.charCodeAt(d) >> 10 === 55) {
      c = (c - 55232 << 10) + a.charCodeAt(d++) - 56320;
    }

    for(
      h = e++,                    // reserve the slot for the lead byte
      i = f =                     // number of continuation bytes
        c < g     ? 0 :           // under 128: 1 byte (ASCII)
        c < 2048  ? 1 :           // under 2048: 2 bytes
        c < 65536 ? 2 :           // under 65536: 3 bytes
                    3,            // otherwise: 4 bytes
      // lead byte: 256 - (128 >> i) yields the prefixes 192, 224, 240
      b[h] = i ? 256 - (g >> i) + (c >> i*6) : c
      ;
      f--                         // one pass per remaining continuation byte
      ;
    ) {
      b[e++] = g + (c >> f*6 & 63); // shift f * 6 bits, keep 6 bits, add 128
    }
  }
  return b;                       // return the byte array
}
function(a,b,c,d,e,f,g,h,i){for(b=[e=d=0];c=a.charCodeAt(d++);){g=128;c<g?b[e]=c:c<2048?f=1:c<65536?f=2:c<2<<20?f=3:0;for(h=e++,i=f;f-- >0;b[h]=g+(2<<6-i)+(c>>i*6))b[e++]=g+(c>>f*6&63)}return b} |
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE | |
Version 2, December 2004 | |
Copyright (C) 2011 YOUR_NAME_HERE <YOUR_URL_HERE> | |
Everyone is permitted to copy and distribute verbatim or modified | |
copies of this license document, and changing it is allowed as long | |
as the name is changed. | |
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE | |
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION | |
0. You just DO WHAT THE FUCK YOU WANT TO. |
{ | |
"name": "stringToByteArray", | |
"description": "Convert a string of characters to an array of UTF-8 bytes", | |
"keywords": [ | |
"cryptography", | |
"utf8" | |
] | |
} |
/**
 * Convert a string to an array of UTF-8 byte values.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * one code point and encoded as a single 4-byte sequence. An unpaired
 * surrogate is encoded on its own as a 3-byte sequence.
 *
 * @param {string} str - the string to encode
 * @returns {number[]} the encoded bytes, each in the range 0-255
 */
function stringToByteArray(str) {
    var bytes = [], code, next, i;
    for (i = 0; i < str.length; i++) {
        code = str.charCodeAt(i);

        // charCodeAt only returns UTF-16 code units (max 0xFFFF), so code
        // points above the BMP have to be rebuilt from their surrogate pair.
        if (code >= 0xD800 && code <= 0xDBFF && i + 1 < str.length) {
            next = str.charCodeAt(i + 1);
            if (next >= 0xDC00 && next <= 0xDFFF) {
                code = 0x10000 + ((code - 0xD800) << 10) + (next - 0xDC00);
                i++; // the low surrogate has been consumed
            }
        }

        if (code < 128) {
            // 1 byte: 0xxxxxxx
            bytes.push(code);
        } else if (code < 2048) {
            // 2 bytes: 110xxxxx 10xxxxxx
            bytes.push(192 + (code >> 6), 128 + (code & 63));
        } else if (code < 65536) {
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            bytes.push(224 + (code >> 12), 128 + ((code >> 6) & 63), 128 + (code & 63));
        } else {
            // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            bytes.push(240 + (code >> 18), 128 + ((code >> 12) & 63), 128 + ((code >> 6) & 63), 128 + (code & 63));
        }
    }
    return bytes;
}
<!DOCTYPE html> | |
<title>stringToByteArray</title> | |
<div>Expected value: <b id="ret"></b></div> | |
<div>Actual value: <b id="ret2"></b></div> | |
<script> | |
// write a small example that shows off the API for your example | |
// and tests it in one fell swoop. | |
/**
 * Readable reference UTF-8 encoder used to produce the expected value.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * one code point and encoded as a single 4-byte sequence. An unpaired
 * surrogate is encoded on its own as a 3-byte sequence.
 *
 * @param {string} str - the string to encode
 * @returns {number[]} the encoded bytes, each in the range 0-255
 */
function stringToByteArrayOld(str) {
    var bytes = [], code, next, i;
    for (i = 0; i < str.length; i++) {
        code = str.charCodeAt(i);

        // charCodeAt only returns UTF-16 code units (max 0xFFFF), so code
        // points above the BMP have to be rebuilt from their surrogate pair.
        if (code >= 0xD800 && code <= 0xDBFF && i + 1 < str.length) {
            next = str.charCodeAt(i + 1);
            if (next >= 0xDC00 && next <= 0xDFFF) {
                code = 0x10000 + ((code - 0xD800) << 10) + (next - 0xDC00);
                i++; // the low surrogate has been consumed
            }
        }

        if (code < 128) {
            // 1 byte: 0xxxxxxx
            bytes.push(code);
        } else if (code < 2048) {
            // 2 bytes: 110xxxxx 10xxxxxx
            bytes.push(192 + (code >> 6), 128 + (code & 63));
        } else if (code < 65536) {
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            bytes.push(224 + (code >> 12), 128 + ((code >> 6) & 63), 128 + (code & 63));
        } else {
            // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            bytes.push(240 + (code >> 18), 128 + ((code >> 12) & 63), 128 + ((code >> 6) & 63), 128 + (code & 63));
        }
    }
    return bytes;
}
// Minified UTF-8 encoder under test: pass the string as `a`; b..i are scratch variables. Returns an array of byte values.
// Lead byte is 256-(128>>i); surrogate pairs are combined; NUL characters and the empty string are handled.
function stringToByteArray(a,b,c,d,e,f,g,h,i){for(b=[],g=128,e=d=0;d<a.length;){c=a.charCodeAt(d++);c>>10==54&&a.charCodeAt(d)>>10==55&&(c=(c-55232<<10)+a.charCodeAt(d++)-56320);for(h=e++,i=f=c<g?0:c<2048?1:c<65536?2:3,b[h]=i?256-(g>>i)+(c>>i*6):c;f--;)b[e++]=g+(c>>f*6&63)}return b};
// Render both results as plain text so they can be compared by eye.
// textContent is used because the values are comma-joined numbers, not markup.
document.getElementById("ret").textContent = stringToByteArrayOld("hello☺䭢 it works");
document.getElementById("ret2").textContent = stringToByteArray("hello☺䭢 it works");
</script> |
looks like you can drop the parens around (f=3), no?
No, you'll get an "Invalid left-hand side in assignment" error. I just realised my demo also fails.
also, since you're no longer modifying g, instead of doing "g =128; c < g ...", you can do "c < g=128 .." I believe...
Oh, same problem :-\
Also, it looks like the compressed version is different from the annotated version...
compressed has: "b[h]=(2<<5-f)+(c>>6*i)"
annotated has: "b[h] = (2<<(5-f)) + (c >> 6*i)"
compressed reveals the truth: multiplication and addition/subtraction have higher precedence than bitwise shifts
Oh, so I've been chasing a non-existent bug!? Damn!
another byte gone:
old:
"c<2<<20&&(f=3);"
new
"c<2<<20?f=3:0;"
FOUND THE BUG: g*4 is not 2048 :-.
old:
"c<g*4"
new:
"c<2048"
deleted
updated with fix: https://gist.github.com/1008438
Can get it to 184bytes if we assume that we'll only have 4byte characters
Could save 4 bytes by doing "f=c<2048?1:..." instead of "c<2048?f=1:...". Also another 2 bytes if you do "i=f=c<2048..." and get rid of ",i=f"
Another byte: "65536" -> "g<<9"
@bytespider https://gist.github.com/1008764 the encodeURIComponent based version. It's 132 bytes in length (wrapped) and works well.
hey @bytespider, could you take the trailing comma out of your package.json
keywords?
Yes I took my working code and applied what I could see the major changes from yours were to find out what bits were breaking.