-
-
Save bytespider/1007834 to your computer and use it in GitHub Desktop.
// http://en.wikipedia.org/wiki/UTF-8 | |
/**
 * Convert a string to an array of UTF-8 byte values.
 *
 * JavaScript strings are sequences of UTF-16 code units, so a high/low
 * surrogate pair is first combined into a single code point and then
 * encoded as one 4-byte sequence. An unpaired surrogate is encoded as a
 * 3-byte sequence of its own code unit value.
 *
 * Only the first argument is used. The earlier golfed signature listed
 * extra parameters purely as local-variable placeholders; any extra
 * arguments passed by a caller are ignored.
 *
 * @param {string} a - The string to encode.
 * @returns {number[]} Byte values (0-255) of the UTF-8 encoding; [] for "".
 */
function stringToByteArray(a) {
  var bytes = [];
  var index = 0;
  var code, low, extra;

  // Iterate by index rather than by truthiness of the char code, so that a
  // NUL character (code 0) does not end the loop early.
  while (index < a.length) {
    code = a.charCodeAt(index++);

    // Combine a surrogate pair into the code point it represents.
    if (code >= 0xD800 && code <= 0xDBFF && index < a.length) {
      low = a.charCodeAt(index);
      if (low >= 0xDC00 && low <= 0xDFFF) {
        code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00);
        index++;
      }
    }

    // Under 128 is the ASCII range: a single byte.
    if (code < 128) {
      bytes.push(code);
      continue;
    }

    // Number of continuation bytes that follow the lead byte.
    extra = code < 2048 ? 1 : code < 65536 ? 2 : 3;

    // Lead byte: (extra + 1) high bits set, i.e. 192, 224 or 240, plus the
    // most significant payload bits of the code point.
    bytes.push(256 - (256 >> (extra + 1)) + (code >> (extra * 6)));

    // Continuation bytes: 10xxxxxx, six payload bits each, high bits first.
    while (extra-- > 0) {
      bytes.push(128 + ((code >> (extra * 6)) & 63));
    }
  }

  return bytes;
}
// Minified build of the string-to-UTF-8-bytes routine, as an anonymous function expression.
// `a` is the input string; `b`..`i` are declared as parameters only so they can be used as locals.
// NOTE(review): the lead-byte term g+(2<<6-i) evaluates to 192 for i=1, but to 160 for i=2 and
// 144 for i=3, whereas the readable version in this file uses 224 and 240 for those cases.
// NOTE(review): b starts as [0], so an empty string returns [0]; and the loop condition
// c=a.charCodeAt(d++) is falsy for a NUL character, which ends encoding at that point.
function(a,b,c,d,e,f,g,h,i){for(b=[e=d=0];c=a.charCodeAt(d++);){g=128;c<g?b[e]=c:c<2048?f=1:c<65536?f=2:c<2<<20?f=3:0;for(h=e++,i=f;f-- >0;b[h]=g+(2<<6-i)+(c>>i*6))b[e++]=g+(c>>f*6&63)}return b} |
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE | |
Version 2, December 2004 | |
Copyright (C) 2011 YOUR_NAME_HERE <YOUR_URL_HERE> | |
Everyone is permitted to copy and distribute verbatim or modified | |
copies of this license document, and changing it is allowed as long | |
as the name is changed. | |
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE | |
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION | |
0. You just DO WHAT THE FUCK YOU WANT TO. |
{ | |
"name": "stringToByteArray", | |
"description": "Convert a string of characters to an array of UTF-8 bytes", | |
"keywords": [ | |
"cryptography", | |
"utf8" | |
] | |
} |
/**
 * Encode a string as an array of UTF-8 byte values.
 *
 * `charCodeAt` yields UTF-16 code units, which are always below 65536, so a
 * character outside the Basic Multilingual Plane arrives as a surrogate
 * pair. The pair is combined into one code point before encoding so that it
 * produces a single 4-byte sequence instead of two 3-byte sequences.
 * An unpaired surrogate is encoded as a 3-byte sequence of its own value.
 *
 * @param {string} str - The string to encode.
 * @returns {number[]} Byte values (0-255) of the UTF-8 encoding; [] for "".
 */
function stringToByteArray(str) {
  var bytes = [], code, next, i;
  for (i = 0; i < str.length; i++) {
    code = str.charCodeAt(i);

    // High surrogate followed by a low surrogate: decode the pair and
    // consume the second code unit.
    if (code >= 0xD800 && code <= 0xDBFF && i + 1 < str.length) {
      next = str.charCodeAt(i + 1);
      if (next >= 0xDC00 && next <= 0xDFFF) {
        code = 0x10000 + ((code - 0xD800) << 10) + (next - 0xDC00);
        i++;
      }
    }

    if (code < 128) {
      // 1 byte: 0xxxxxxx
      bytes.push(code);
    } else if (code < 2048) {
      // 2 bytes: 110xxxxx 10xxxxxx
      bytes.push(192 + (code >> 6), 128 + (code & 63));
    } else if (code < 65536) {
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      bytes.push(224 + (code >> 12), 128 + ((code >> 6) & 63), 128 + (code & 63));
    } else {
      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      bytes.push(
        240 + (code >> 18),
        128 + ((code >> 12) & 63),
        128 + ((code >> 6) & 63),
        128 + (code & 63)
      );
    }
  }
  return bytes;
}
<!DOCTYPE html> | |
<title>stringToByteArray</title> | |
<div>Expected value: <b id="ret"></b></div> | |
<div>Actual value: <b id="ret2"></b></div> | |
<script> | |
// write a small example that shows off the API for your example | |
// and tests it in one fell swoop. | |
/**
 * Reference encoder used by the demo to produce the expected value:
 * encodes a string as an array of UTF-8 byte values.
 *
 * `charCodeAt` yields UTF-16 code units (always below 65536), so a surrogate
 * pair is combined into one code point before encoding; otherwise characters
 * outside the Basic Multilingual Plane would come out as two 3-byte
 * sequences instead of one 4-byte sequence. An unpaired surrogate is encoded
 * as a 3-byte sequence of its own value.
 *
 * @param {string} str - The string to encode.
 * @returns {number[]} Byte values (0-255) of the UTF-8 encoding; [] for "".
 */
function stringToByteArrayOld(str) {
  var bytes = [], code, next, i;
  for (i = 0; i < str.length; i++) {
    code = str.charCodeAt(i);

    // High surrogate followed by a low surrogate: decode the pair and
    // consume the second code unit.
    if (code >= 0xD800 && code <= 0xDBFF && i + 1 < str.length) {
      next = str.charCodeAt(i + 1);
      if (next >= 0xDC00 && next <= 0xDFFF) {
        code = 0x10000 + ((code - 0xD800) << 10) + (next - 0xDC00);
        i++;
      }
    }

    if (code < 128) {
      // 1 byte: 0xxxxxxx
      bytes.push(code);
    } else if (code < 2048) {
      // 2 bytes: 110xxxxx 10xxxxxx
      bytes.push(192 + (code >> 6), 128 + (code & 63));
    } else if (code < 65536) {
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      bytes.push(224 + (code >> 12), 128 + ((code >> 6) & 63), 128 + (code & 63));
    } else {
      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      bytes.push(
        240 + (code >> 18),
        128 + ((code >> 12) & 63),
        128 + ((code >> 6) & 63),
        128 + (code & 63)
      );
    }
  }
  return bytes;
}
/**
 * Convert a string to an array of UTF-8 byte values.
 *
 * Surrogate pairs are combined into a single code point and encoded as one
 * 4-byte sequence; an unpaired surrogate is encoded as a 3-byte sequence of
 * its own code unit value. Only the first argument is used: the earlier
 * golfed signature listed extra parameters purely as local-variable
 * placeholders, and any extra arguments are ignored.
 *
 * @param {string} a - The string to encode.
 * @returns {number[]} Byte values (0-255) of the UTF-8 encoding; [] for "".
 */
function stringToByteArray(a) {
  var bytes = [];
  var index = 0;
  var code, low, extra;

  // Iterate by index so that a NUL character (code 0) does not end the loop.
  while (index < a.length) {
    code = a.charCodeAt(index++);

    // Combine a surrogate pair into the code point it represents.
    if (code >= 0xD800 && code <= 0xDBFF && index < a.length) {
      low = a.charCodeAt(index);
      if (low >= 0xDC00 && low <= 0xDFFF) {
        code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00);
        index++;
      }
    }

    // ASCII range: a single byte.
    if (code < 128) {
      bytes.push(code);
      continue;
    }

    // Number of continuation bytes that follow the lead byte.
    extra = code < 2048 ? 1 : code < 65536 ? 2 : 3;

    // Lead byte marker is 192, 224 or 240 for 1, 2 or 3 continuation bytes.
    bytes.push(256 - (256 >> (extra + 1)) + (code >> (extra * 6)));

    // Continuation bytes: 10xxxxxx, six payload bits each, high bits first.
    while (extra-- > 0) {
      bytes.push(128 + ((code >> (extra * 6)) & 63));
    }
  }

  return bytes;
}
// Encode the same sample with both implementations and show the resulting
// byte lists side by side ("ret" = expected, "ret2" = actual).
(function (sample) {
  document.getElementById("ret").innerHTML = stringToByteArrayOld(sample);
  document.getElementById("ret2").innerHTML = stringToByteArray(sample);
})("hello☺䭢 it works");
</script> |
I don’t get it. Are you confusing ^
with Math.pow()
, e.g. doesn’t 2^21
equal 23
instead of 2097152
?
You're right, I am. Damn, I'll have to recheck my tests
FWIW, the demo doesn’t seem to work for me.
Something like encodeURIComponent("Привет мир").split('%')) may help you with non-ascii utf8
@subzey: I'll try that route, but it's possible that it may make things more complicated
I thought about using >>= to shift 6 bits at a time, leaving only the bits required for the next operation
Hey, got it down to 202 bytes by refactoring the complex b[e] calcs to happen in the for loop... the comments are all mangled and nonsense now, but that shaves off about 10 bytes. I'm sure there are more savings in there with some trickery: https://gist.github.com/1008218
Hey, I see this has been updated! You're adding refactors similar to what I just did! Looking for more...
Yes, I took my working code and applied what I could see were the major changes from yours, to find out which bits were breaking.
looks like you can drop the parens around (f=3), no?
No, you'll get an "Invalid left-hand side in assignment" error. I just realised my demo also fails.
also, since you're no longer modifying g, instead of doing "g =128; c < g ...", you can do "c < g=128 .." I believe...
Oh, same problem :-\
Also, it looks like the compressed version is different from the annotated version...
compressed has: "b[h]=(2<<5-f)+(c>>6*i)"
annotated has: "b[h] = (2<<(5-f)) + (c >> 6*i)"
compressed reveals the truth: multiplication and addition/subtraction have higher precedence than bitwise shifts
Oh, so I've been chasing a nonexistent bug!? Damn!
another byte gone:
old:
"c<2<<20&&(f=3);"
new
"c<2<<20?f=3:0;"
FOUND THE BUG: g*4 is not 2048 :-.
old:
"c<g*4"
new:
"c<2048"
deleted
updated with fix: https://gist.github.com/1008438
Can get it to 184bytes if we assume that we'll only have 4byte characters
Could save 4 bytes by doing "f=c<2048?1:..." instead of "c<2048?f=1:...". Also another 2 bytes if you do "i=f=c<2048..." and get rid of ",i=f"
Another byte: "65536" -> "g<<9"
@bytespider https://gist.github.com/1008764 the encodeURIComponent based version. It's 132 bytes in length (wrapped) and works well.
hey @bytespider, could you take the trailing comma out of your package.json
keywords?
Looks good, but ...
2^16
is not 65536. line 40 could be optimized, I guess. Your annotated code misses a ; before the second for loop.