-
-
Save enepomnyaschih/72c423f727d395eeaa09697058238727 to your computer and use it in GitHub Desktop.
/*
MIT License
Copyright (c) 2020 Egor Nepomnyaschih
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/**
 * The standard base64 alphabet: element `i` is the character that encodes
 * the 6-bit value `i` (0..63).
 *
 * The table is written out literally, but it can also be computed with the
 * following algorithm:
 *
 *   const base64abc = [],
 *     A = "A".charCodeAt(0),
 *     a = "a".charCodeAt(0),
 *     n = "0".charCodeAt(0);
 *   for (let i = 0; i < 26; ++i) {
 *     base64abc.push(String.fromCharCode(A + i));
 *   }
 *   for (let i = 0; i < 26; ++i) {
 *     base64abc.push(String.fromCharCode(a + i));
 *   }
 *   for (let i = 0; i < 10; ++i) {
 *     base64abc.push(String.fromCharCode(n + i));
 *   }
 *   base64abc.push("+");
 *   base64abc.push("/");
 */
const base64abc = [
  "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
  "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
  "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
  "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
  "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "/"
];
/**
 * Reverse lookup table: element `c` is the 6-bit value encoded by the
 * character with char code `c`, or 255 when that character is not part of
 * the base64 alphabet. The padding character "=" maps to 0 so that it can be
 * pushed through the decoder without raising an error; the bits it
 * contributes are discarded afterwards.
 *
 * The literal below stops after "z" (char code 122), so it has 123 entries;
 * any higher char code is simply out of range. An equivalent table padded to
 * 256 entries can be computed with the following algorithm:
 *
 *   const l = 256, base64codes = new Uint8Array(l);
 *   for (let i = 0; i < l; ++i) {
 *     base64codes[i] = 255; // invalid character
 *   }
 *   base64abc.forEach((char, index) => {
 *     base64codes[char.charCodeAt(0)] = index;
 *   });
 *   base64codes["=".charCodeAt(0)] = 0; // ignored anyway, so we just need to prevent an error
 */
const base64codes = [
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63,
  52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 0, 255, 255,
  255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 255,
  255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
  41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51
];
/**
 * Looks up the 6-bit value of a base64 character.
 *
 * @param {number} charCode - UTF-16 code unit of the character to decode.
 * @returns {number} The value stored for that character in `base64codes`
 *   (0..63 for alphabet characters, 0 for the padding character "=").
 * @throws {Error} If `charCode` has no entry in `base64codes` or its entry
 *   is the invalid-character marker 255.
 */
function getBase64Code(charCode) {
  const code = base64codes[charCode];
  // `undefined` covers every index without an entry: codes past the end of
  // the table as well as NaN, negative and fractional values.
  if (code === undefined || code === 255) {
    throw new Error("Unable to parse base64 string.");
  }
  return code;
}
/**
 * Encodes a sequence of bytes as a padded base64 string.
 *
 * @param {Uint8Array|number[]} bytes - Indexable sequence with a `length`;
 *   each element is expected to be an integer in 0..255.
 * @returns {string} The base64 text, padded with "=" to a multiple of four
 *   characters.
 */
export function bytesToBase64(bytes) {
  const length = bytes.length;
  const remainder = length % 3; // octets left over after the full triplets
  const fullEnd = length - remainder;
  let result = "";

  // Every complete group of 3 octets becomes 4 base64 characters.
  for (let i = 0; i < fullEnd; i += 3) {
    const b0 = bytes[i];
    const b1 = bytes[i + 1];
    const b2 = bytes[i + 2];
    result += base64abc[b0 >> 2];
    result += base64abc[((b0 & 0x03) << 4) | (b1 >> 4)];
    result += base64abc[((b1 & 0x0F) << 2) | (b2 >> 6)];
    result += base64abc[b2 & 0x3F];
  }

  if (remainder === 1) {
    // 1 octet yet to write: 2 characters plus "==".
    const b0 = bytes[fullEnd];
    result += base64abc[b0 >> 2];
    result += base64abc[(b0 & 0x03) << 4];
    result += "==";
  } else if (remainder === 2) {
    // 2 octets yet to write: 3 characters plus "=".
    const b0 = bytes[fullEnd];
    const b1 = bytes[fullEnd + 1];
    result += base64abc[b0 >> 2];
    result += base64abc[((b0 & 0x03) << 4) | (b1 >> 4)];
    result += base64abc[(b1 & 0x0F) << 2];
    result += "=";
  }
  return result;
}
/**
 * Decodes a padded base64 string into bytes.
 *
 * @param {string} str - Base64 text. Its length must be a multiple of 4 and
 *   "=" may only appear as the final one or two characters.
 * @returns {Uint8Array} The decoded bytes, as a view that leaves out the
 *   octets accounted for by the trailing padding.
 * @throws {Error} If the length is not a multiple of 4, if "=" appears
 *   anywhere other than in the trailing padding, or if `getBase64Code`
 *   rejects one of the characters.
 */
export function base64ToBytes(str) {
  const n = str.length;
  if (n % 4 !== 0) {
    throw new Error("Unable to parse base64 string.");
  }

  const missingOctets = str.endsWith("==") ? 2 : str.endsWith("=") ? 1 : 0;

  // The first "=" must be exactly where the trailing padding starts. This
  // rejects inputs such as "AA=A" or "A===", where "=" would otherwise be
  // decoded as a data character with value 0.
  const firstPadding = str.indexOf("=");
  if (firstPadding !== -1 && firstPadding !== n - missingOctets) {
    throw new Error("Unable to parse base64 string.");
  }

  const result = new Uint8Array(3 * (n / 4));
  for (let i = 0, j = 0; i < n; i += 4, j += 3) {
    // Pack four 6-bit values into one 24-bit group, then split it into octets.
    const buffer =
      (getBase64Code(str.charCodeAt(i)) << 18) |
      (getBase64Code(str.charCodeAt(i + 1)) << 12) |
      (getBase64Code(str.charCodeAt(i + 2)) << 6) |
      getBase64Code(str.charCodeAt(i + 3));
    result[j] = buffer >> 16;
    result[j + 1] = (buffer >> 8) & 0xFF;
    result[j + 2] = buffer & 0xFF;
  }
  return result.subarray(0, result.length - missingOctets);
}
/**
 * Encodes text as base64: the string is first turned into bytes with
 * `encoder` (a default-constructed `TextEncoder` produces UTF-8) and those
 * bytes are then passed to `bytesToBase64`.
 *
 * @param {string} str - Text to encode.
 * @param {{encode(input: string): Uint8Array}} [encoder] - Object whose
 *   `encode` method converts the text to bytes; a new `TextEncoder` is
 *   created per call when omitted.
 * @returns {string} The base64 representation of the encoded bytes.
 */
export function base64encode(str, encoder = new TextEncoder()) {
  return bytesToBase64(encoder.encode(str));
}
/**
 * Decodes base64 into text: the input is turned into bytes with
 * `base64ToBytes` and those bytes are then converted to a string with
 * `decoder` (a default-constructed `TextDecoder` reads UTF-8).
 *
 * @param {string} str - Base64 text to decode.
 * @param {{decode(input: Uint8Array): string}} [decoder] - Object whose
 *   `decode` method converts the bytes to text; a new `TextDecoder` is
 *   created per call when omitted.
 * @returns {string} The decoded text.
 * @throws {Error} Whatever `base64ToBytes` throws for malformed input.
 */
export function base64decode(str, decoder = new TextDecoder()) {
  return decoder.decode(base64ToBytes(str));
}
@jordanbtucker great explanation
Which one is correct? Well, they both are.
I would argue that both `btoa` and `base64encode` are incorrect (and confusing).
`btoa` is supposed to stand for "binary to ASCII", but `btoa` actually takes a string (and not binary / bytes) as an input. So, it is not actually `btoa`, but more like `stoa`, with a hidden mechanism for converting each character of the input string to binary - this char-to-byte mechanism is not clear from the function signature, and therefore causes confusion.
Your implementation of `base64encode` suffers from the same problem.
I think it is better not to create string -> string "base64" functions - base64 encoding / decoding is supposed to be to / from an array / stream of bytes to / from an array / stream of ASCII characters. In other words, it is great that you have created `bytesToBase64` and `base64ToBytes` - in my humble opinion, I think you should not publish the `base64encode` and `base64decode` functions - these latter functions just make things confusing.
@vijtheveg I agree that the name `btoa` is far from intuitive. I would never expect a function with "to ASCII" in its name to return a string. ASCII is an encoding for converting text into bytes, so I would expect a "to ASCII" function to return bytes, not a string. Of course, once you realize that "ASCII" in this context really means "Base64", it makes sense why it returns a string, since Base64 is meant to encode binary data as text, but naming it `btoa` (binary to ASCII) is definitely confusing.
However, `stoa` is an even less intuitive name for what it actually does. Typically when we think of a string, we think of something that holds text. So, if I have a function essentially called "string to ASCII", I would assume that it would take a string as text that contains only ASCII supported characters and encode it into ASCII, returning an array of bytes. Again, I would not expect it to encode anything to Base64 based on its name.
Even if `btoa` was renamed to "string to Base64", it would still be confusing considering what it really does. I would expect it to take a string as text and encode it with a scheme like UTF-8 and then encode that as Base64. Which is exactly what this gist's `base64encode` does.
What I'm getting at is that the B in `btoa` is what tips you off to the fact that the string it takes as input is not text. It's a "binary string", meaning that it must only contain characters with code units in the range of 0-255. Probably the best name for it would be `binaryStringToBase64`.
I also agree that this gist's `base64encode` is not a good name for what it does. In my opinion, it should be called `textToBase64` since that's its purpose, and its documentation should mention that it first encodes the text as UTF-8 before encoding it to Base64.
I think it's important to understand the differences and limitations of this implementation vs `btoa` and `atob`.
The first comment gives an example that `btoa` cannot handle.
However, some may be misled to believe that this is a superior implementation or a drop-in replacement for `btoa`/`atob`. In reality, they are just two different implementations of Base64 encoding/decoding with different features and limitations.
The NPM page for this implementation states:
However, this isn't entirely true as explained below.
Input String Handling
The first main difference is how these two implementations handle input strings. Given the same string, each implementation may encode it differently.
Which one is correct? Well, they both are. The difference is that `base64encode` treats strings as text and encodes them to UTF-8 before encoding that UTF-8 to Base64. However, `btoa` expects a binary string, which is a sequence of UTF-16 code units in the range `0x00` to `0xFF` inclusive, essentially bytes, which it then encodes to Base64. The string `"aeiou"` as a binary string should really be thought of as `"\x61\x65\x69\x6F\x75"` rather than letters.
Take the character `a` for example. It has the Unicode code point value `0x0061`. Since this value is within the byte range, `btoa` will treat it as the byte `0x61`. In UTF-8, `a` is also represented as the single byte `0x61`, so both `btoa` and `base64encode` encode it the same. Note that you could also represent the string `"a"` as `"\x61"` to illustrate this point.
However, the character `ä` has the Unicode code point value `0x00E4`. Since this value is within the byte range, `btoa` will treat it as the byte `0xE4`. However, in UTF-8 `ä` is represented as two bytes `0xC3`, `0xA4`, so `base64encode` encodes those two bytes instead. Even though the identical strings `"ä"` and `"\xE4"` are equivalent for the purpose of `btoa`, they should not be thought of as the same when using `base64encode`.
This doesn't mean that `base64encode` is incapable of encoding the byte `0xE4`. It just means that you can't pass it the string `"\xE4"` and expect it to encode the byte `0xE4` to Base64. Instead, you have to use `bytesToBase64` and pass it the array `[0xE4]` or an equivalent `Uint8Array`.
Since `atob` (the reverse of `btoa`) returns a binary string, you can't reliably pass that string to `base64encode`.
None of this is to say that either `btoa` or `base64encode` is more correct. You just have to know what strings they each expect to receive: binary in the case of `btoa` and text in the case of `base64encode`.
The critical takeaway is not that `btoa` can only handle ASCII characters (technically it can handle ASCII and Latin-1 Supplement characters). It's that `btoa` does not expect you to give it text. It expects a string of bytes.
Base64 Decoding
The second main difference is that `atob` implements a "forgiving-base64 decode" algorithm while `base64decode` and `base64ToBytes` do not.
The forgiving-base64 decode algorithm:
- […] (`/[\t\n\f\r ]/`).
- […] `=` characters.
Alternatives
You don't need a library to handle Base64 encoding/decoding. You can use
btoa
andatob
. Here are a couple of short alternatives that usebtoa
andatob
under the hood. And as a bonus, you get the forgiving-base64 decode algorithm.You can find a TypeScript version of this alternative including base64url support in both JS and TS in this gist.