-
-
Save enepomnyaschih/72c423f727d395eeaa09697058238727 to your computer and use it in GitHub Desktop.
/*
MIT License
Copyright (c) 2020 Egor Nepomnyaschih
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
// This constant can also be computed with the following algorithm:
const base64abc = [],
  A = "A".charCodeAt(0),
  a = "a".charCodeAt(0),
  n = "0".charCodeAt(0);
for (let i = 0; i < 26; ++i) {
  base64abc.push(String.fromCharCode(A + i));
}
for (let i = 0; i < 26; ++i) {
  base64abc.push(String.fromCharCode(a + i));
}
for (let i = 0; i < 10; ++i) {
  base64abc.push(String.fromCharCode(n + i));
}
base64abc.push("+");
base64abc.push("/");
*/
/**
 * Base64 alphabet: the array index is the 6-bit value and the entry is the
 * character that encodes it ("A"-"Z", "a"-"z", "0"-"9", "+", "/").
 */
const base64abc = [
  "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
  "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
  "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
  "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
  "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "/"
];
/*
// This constant can also be computed with the following algorithm:
const l = 256, base64codes = new Uint8Array(l);
for (let i = 0; i < l; ++i) {
  base64codes[i] = 255; // invalid character
}
base64abc.forEach((char, index) => {
  base64codes[char.charCodeAt(0)] = index;
});
base64codes["=".charCodeAt(0)] = 0; // ignored anyway, so we just need to prevent an error
*/
/**
 * Reverse lookup table: the index is a character code and the entry is that
 * character's 6-bit base64 value. 255 marks a character that is not part of
 * the alphabet; "=" (code 61) maps to 0 so padding does not raise an error.
 * The literal stops at "z" (code 122), so it has 123 entries; getBase64Code
 * rejects any code at or beyond the table length before indexing.
 */
const base64codes = [
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63,
  52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 0, 255, 255,
  255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 255,
  255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
  41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51
];
/**
 * Looks up the 6-bit base64 value of a single character code.
 *
 * @param {number} charCode - Character code as returned by String.prototype.charCodeAt.
 * @returns {number} The value stored in base64codes for that character.
 * @throws {Error} If the code lies beyond the lookup table or is marked invalid (255).
 */
function getBase64Code(charCode) {
  // The table is shorter than the full code-unit range, so bounds-check first.
  if (charCode >= base64codes.length) {
    throw new Error("Unable to parse base64 string.");
  }
  const code = base64codes[charCode];
  if (code === 255) {
    throw new Error("Unable to parse base64 string.");
  }
  return code;
}
/**
 * Encodes a sequence of bytes as a padded base64 string.
 *
 * @param {Uint8Array|number[]} bytes - Indexable byte sequence with a `length`;
 *   values are presumably expected to be in 0-255 (not validated here).
 * @returns {string} Base64 text, padded with "=" to a multiple of four characters.
 */
export function bytesToBase64(bytes) {
  const l = bytes.length;
  let result = "";
  let i;
  // `i` points at the LAST byte of each complete 3-byte group.
  for (i = 2; i < l; i += 3) {
    result += base64abc[bytes[i - 2] >> 2];
    result += base64abc[((bytes[i - 2] & 0x03) << 4) | (bytes[i - 1] >> 4)];
    result += base64abc[((bytes[i - 1] & 0x0F) << 2) | (bytes[i] >> 6)];
    result += base64abc[bytes[i] & 0x3F];
  }
  if (i === l + 1) { // 1 octet yet to write
    result += base64abc[bytes[i - 2] >> 2];
    result += base64abc[(bytes[i - 2] & 0x03) << 4];
    result += "==";
  }
  if (i === l) { // 2 octets yet to write
    result += base64abc[bytes[i - 2] >> 2];
    result += base64abc[((bytes[i - 2] & 0x03) << 4) | (bytes[i - 1] >> 4)];
    result += base64abc[(bytes[i - 1] & 0x0F) << 2];
    result += "=";
  }
  return result;
}
/**
 * Decodes a padded base64 string into bytes.
 *
 * @param {string} str - Base64 text whose length is a multiple of four.
 * @returns {Uint8Array} The decoded bytes.
 * @throws {Error} If the length is not a multiple of four, if "=" appears
 *   anywhere other than as a trailing "=" or "==", or if getBase64Code rejects
 *   a character.
 */
export function base64ToBytes(str) {
  if (str.length % 4 !== 0) {
    throw new Error("Unable to parse base64 string.");
  }
  // Padding is only valid as the final "=" or "==". Checking the whole tail
  // (rather than just the position of the first "=") rejects inputs such as
  // "TQ=u", where a data character follows a padding character.
  const firstPad = str.indexOf("=");
  let missingOctets = 0;
  if (firstPad !== -1) {
    const padding = str.slice(firstPad);
    if (padding !== "=" && padding !== "==") {
      throw new Error("Unable to parse base64 string.");
    }
    missingOctets = padding.length;
  }
  const n = str.length;
  const result = new Uint8Array(3 * (n / 4));
  for (let i = 0, j = 0; i < n; i += 4, j += 3) {
    // Pack four 6-bit values into one 24-bit group, then split into 3 bytes.
    const buffer =
      getBase64Code(str.charCodeAt(i)) << 18 |
      getBase64Code(str.charCodeAt(i + 1)) << 12 |
      getBase64Code(str.charCodeAt(i + 2)) << 6 |
      getBase64Code(str.charCodeAt(i + 3));
    result[j] = buffer >> 16;
    result[j + 1] = (buffer >> 8) & 0xFF;
    result[j + 2] = buffer & 0xFF;
  }
  // Bytes produced from padding characters are trimmed off the end.
  return result.subarray(0, result.length - missingOctets);
}
/**
 * Encodes text to base64 by first converting it to bytes with `encoder`.
 *
 * @param {string} str - Text to encode.
 * @param {{encode(input: string): Uint8Array}} [encoder] - Text-to-bytes encoder;
 *   defaults to a new TextEncoder.
 * @returns {string} Result of bytesToBase64 applied to the encoded bytes.
 */
export function base64encode(str, encoder = new TextEncoder()) {
  return bytesToBase64(encoder.encode(str));
}
/**
 * Decodes base64 to text by converting the decoded bytes with `decoder`.
 *
 * @param {string} str - Base64 text, passed to base64ToBytes.
 * @param {{decode(input: Uint8Array): string}} [decoder] - Bytes-to-text decoder;
 *   defaults to a new TextDecoder.
 * @returns {string} The decoded text.
 */
export function base64decode(str, decoder = new TextDecoder()) {
  return decoder.decode(base64ToBytes(str));
}
This really came in handy, thanks!
Awesome!
You saved me a lot of time, thanks!
base64abc should be compile-time const, dude. all sup
What about a reverse?
@davidcallanan - I've got it in TypeScript https://gist.github.com/enepomnyaschih/54c437997f8202871278d0fdf68148ca. Will add it in JS shortly.
@fatso83 - Created a micro-lib https://www.npmjs.com/package/byte-base64
@andrey-zakharov - Changed base64abc to a constant.
@davidcallanan - Added base64ToBytes and base64decode.
Thank you all for your feedback!
This is simply the BEST solution. Very nice indeed. Thank you brother!
There is a small mistake
if (charCode > base64codes.length) {
should be:
if (charCode >= base64codes.length) {
Or else converting something with an "{" inside will not throw the corresponding "Unable to parse base64 string." error.
@peaBerberian - Thanks! Fixed.
This is awesome. Thank you very much for sharing.
If the goal is to create a data-url there's an easier approach
const reader = new FileReader();
reader.addEventListener("load", function () {
const dataUrl = reader.result
....
}, false);
reader.readAsDataURL(new Blob(data))
https://developer.mozilla.org/en-US/docs/Web/API/FileReader/readAsDataURL
Awesome!
Happy to find this.
So, I have this repository that I call alpha-copious. It is supposed to keep base code that gets placed in templates for generating a website. So, I was just putting in hashes and went looking for base64. This works.
I put it into a modules. Directory. And, I put in another string for the url version.
I just wrote some of these in C++. You can find both items at copious-world Might wrap those in an encsripten package.
CODINGS is where I am putting C++.
alpha-copious is the nascent packaging program. I suppose I will be stuck putting things into modules. Will have to work on that fairly soon.
I don't have any money for offering bounties. In fact, I am worried about my grocery bill at the moment. So, looking for help in all areas. But, I have a number of crypto.subtle wrappers that I will probably move into alpha-copious. Alpha-copious is named so just because that is where I am putting basic helper code that can be rolled into templates, which are then populated with stuff that appears on a web site. So, its where a lot of things start. And, many of those things might be accessible globally.
So, a hash is very basic. And, I like to call something like do_hash("this") and note .do_hash("oh i forgot what went here").
I imagine a web programmer selecting the modules they want in their template (say with a checkbox html page) and then those modules get generated. "export" and not "exports" because the first is standard. Also, trying to make a number of things accessible from the global context for the web pages - looking for a good balance. Trying to avoid things being too wrapped up. But, it is good to keep the junk in the modules and out of site so to speak.
Using Svelte at the moment. So, running webpack is OK. At least they output modules that can be lazy loaded from a database. While lots of packagers seem to be on the verge of compiling into rigid objects like binaries. They also have huge amounts of code, often. Although I think I saw that in one (webpack ?) that they have a simple small module, which should be about right.
nice
<3
Typescript URL-safe version:
/*
MIT License
Copyright (c) 2020 Egor Nepomnyaschih
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
// This constant can also be computed with the following algorithm:
const base64abc = [],
  A = "A".charCodeAt(0),
  a = "a".charCodeAt(0),
  n = "0".charCodeAt(0);
for (let i = 0; i < 26; ++i) {
  base64abc.push(String.fromCharCode(A + i));
}
for (let i = 0; i < 26; ++i) {
  base64abc.push(String.fromCharCode(a + i));
}
for (let i = 0; i < 10; ++i) {
  base64abc.push(String.fromCharCode(n + i));
}
base64abc.push("-");
base64abc.push("_");
*/
/**
 * URL-safe base64 alphabet ("base64url", RFC 4648 section 5): the array index
 * is the 6-bit value and the entry is the character that encodes it.
 * Value 62 is '-' and value 63 is '_' (the replacements for '+' and '/'
 * respectively); this order is required for interoperability with other
 * base64url implementations.
 */
const base64abc = [
  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_',
]
/**
 * Reverse lookup table built once at module load: the index is a character
 * code (0-255) and the entry is that character's position in base64abc.
 * 255 marks a character that is not part of the alphabet.
 */
const base64codes = (() => {
  // Every slot starts out as "invalid character".
  const table = new Uint8Array(256).fill(255)
  for (const [value, char] of base64abc.entries()) {
    table[char.charCodeAt(0)] = value
  }
  // Padding is ignored anyway, so it only needs a value that prevents an error.
  table['='.charCodeAt(0)] = 0
  return table
})()
/**
 * Looks up the 6-bit base64 value of a single character code.
 *
 * @param charCode Character code as returned by String.prototype.charCodeAt.
 * @returns The value stored in base64codes for that character.
 * @throws Error if the code lies beyond the table or is marked invalid (255).
 */
function getBase64Code(charCode: number) {
  const tableSize = base64codes.length
  if (charCode >= tableSize) {
    throw new Error('Unable to parse base64 string (code beyond length).')
  }
  const value = base64codes[charCode]!
  if (value !== 255) {
    return value
  }
  throw new Error('Unable to parse base64 string (invalid code).')
}
/**
 * Encodes bytes as a padded base64 string using the base64abc alphabet.
 *
 * @param bytes Bytes to encode.
 * @returns Encoded text, padded with '=' to a multiple of four characters.
 */
export function bytesToBase64(bytes: Uint8Array) {
  const total = bytes.length
  const tail = total % 3 // bytes left over after the complete 3-byte groups
  const groupsEnd = total - tail
  let out = ''

  for (let pos = 0; pos < groupsEnd; pos += 3) {
    const first = bytes[pos]!
    const second = bytes[pos + 1]!
    const third = bytes[pos + 2]!
    out += base64abc[first >> 2]
    out += base64abc[((first & 0x03) << 4) | (second >> 4)]
    out += base64abc[((second & 0x0f) << 2) | (third >> 6)]
    out += base64abc[third & 0x3f]
  }

  if (tail === 1) {
    // One trailing octet: two characters plus two padding characters.
    const first = bytes[groupsEnd]!
    out += base64abc[first >> 2]
    out += base64abc[(first & 0x03) << 4]
    out += '=='
  } else if (tail === 2) {
    // Two trailing octets: three characters plus one padding character.
    const first = bytes[groupsEnd]!
    const second = bytes[groupsEnd + 1]!
    out += base64abc[first >> 2]
    out += base64abc[((first & 0x03) << 4) | (second >> 4)]
    out += base64abc[(second & 0x0f) << 2]
    out += '='
  }
  return out
}
/**
 * Decodes a padded base64 string into bytes.
 *
 * @param str Base64 text whose length is a multiple of four.
 * @returns The decoded bytes.
 * @throws Error if the length is not a multiple of four, if '=' appears
 *   anywhere other than as a trailing '=' or '==', or if getBase64Code rejects
 *   a character.
 */
export function base64ToBytes(str: string) {
  if (str.length % 4 !== 0) {
    throw new Error('Unable to parse base64 string (invalid length).')
  }
  // Padding is only valid as the final '=' or '=='. Checking the whole tail
  // (rather than just the position of the first '=') rejects inputs such as
  // 'TQ=u', where a data character follows a padding character.
  const firstPad = str.indexOf('=')
  let missingOctets = 0
  if (firstPad !== -1) {
    const padding = str.slice(firstPad)
    if (padding !== '=' && padding !== '==') {
      throw new Error('Unable to parse base64 string (octets).')
    }
    missingOctets = padding.length
  }
  const n = str.length
  const result = new Uint8Array(3 * (n / 4))
  for (let i = 0, j = 0; i < n; i += 4, j += 3) {
    // Pack four 6-bit values into one 24-bit group, then split into 3 bytes.
    const buffer =
      (getBase64Code(str.charCodeAt(i)) << 18) |
      (getBase64Code(str.charCodeAt(i + 1)) << 12) |
      (getBase64Code(str.charCodeAt(i + 2)) << 6) |
      getBase64Code(str.charCodeAt(i + 3))
    result[j] = buffer >> 16
    result[j + 1] = (buffer >> 8) & 0xff
    result[j + 2] = buffer & 0xff
  }
  // Bytes produced from padding characters are trimmed off the end.
  return result.subarray(0, result.length - missingOctets)
}
/**
 * Encodes text to base64: the text is turned into bytes by `encoder`
 * (a new TextEncoder by default) and the bytes go through bytesToBase64.
 */
export function base64encode(str: string, encoder = new TextEncoder()) {
  const encodedBytes = encoder.encode(str)
  return bytesToBase64(encodedBytes)
}
/**
 * Decodes base64 to text: the input goes through base64ToBytes and the
 * resulting bytes are turned into text by `decoder` (a new TextDecoder by default).
 */
export function base64decode(str: string, decoder = new TextDecoder()) {
  const decodedBytes = base64ToBytes(str)
  return decoder.decode(decodedBytes)
}
I think it's important to understand the differences and limitations of this implementation vs btoa
and atob
.
The first comment gives an example that btoa
cannot handle.
expect(base64encode("Man Ё𤭢")).toBe("TWFuINCB8KStog==");
// Simple btoa fails here.
However, some may be misled to believe that this is a superior implementation or a drop-in replacement for btoa
/atob
. In reality, they are just two different implementations of Base64 encoding/decoding with different features and limitations.
The NPM page for this implementation states:
In particular, atob and btoa only support ASCII strings.
However, this isn't entirely true as explained below.
Input String Handling
The first main difference is how these two implementations handle input strings. Given the same string, each implementation may encode it differently.
// Same encoding
btoa("aeiou"); // "YWVpb3U="
base64encode("aeiou"); // "YWVpb3U="
// Different encoding
btoa("äëïöü"); // "5Ovv9vw="
base64encode("äëïöü"); // "w6TDq8Ovw7bDvA=="
Which one is correct? Well, they both are. The difference is that base64encode
treats strings as text and encodes them to UTF-8 before encoding that UTF-8 to Base64. However, btoa
expects a binary string, which is a sequence of UTF-16 code units in the range of 0x00
and 0xFF
inclusive, essentially bytes, which it then encodes to Base64. The string "aeiou"
as a binary string should really be thought of as "\x61\x65\x69\x6F\x75"
rather than letters.
Take the character a
for example. It has the Unicode code point value 0x0061
. Since this value is within the byte range, btoa
will treat it as the byte 0x61
. In UTF-8, a
is also represented as the single byte 0x61
, so both btoa
and base64encode
encode it the same. Note that you could also represent the string "a"
as "\x61"
to illustrate this point.
However, the character ä
has the Unicode code point value 0x00E4
. Since this value is within the byte range, btoa
will treat it as the byte 0xE4
. However, in UTF-8 ä
is represented as two bytes 0xC3
, 0xA4
, so base64encode
encodes those two bytes instead. Even though the identical strings "ä"
and "\xE4"
are equivalent for the purpose of btoa
, they should not be thought of as the same when using base64encode
.
This doesn't mean that base64encode
is incapable of encoding the byte 0xE4
. It just means that you can't pass it the string "\xE4"
and expect it to encode the byte 0xE4
to Base64. Instead, you have to use bytesToBase64
and pass it the array [0xE4]
or an equivalent Uint8Array
.
Since atob
(the reverse of btoa
) returns a binary string, you can't reliably pass that string to base64encode
.
// Round trip
btoa(atob("5Ovv9vw=")); // "5Ovv9vw="
base64encode(atob("5Ovv9vw=")); // "w6TDq8Ovw7bDvA=="
None of this is to say that either btoa
or base64encode
is more correct. You just have to know what strings they each expect to receive. Binary in the case of btoa
and text in the case of base64encode
.
The critical takeaway is not that btoa
can only handle ASCII characters (technically it can handle ASCII and Latin-1 Supplement characters). It's that btoa
does not expect you to give it text. It expects a string of bytes.
Base64 Decoding
The second main difference is that atob
implements a "forgiving-base64 decode" algorithm while base64decode
and base64ToBytes
do not.
The forgiving-base64 decode algorithm:
- Allows ASCII whitespace anywhere in the input string (tab, line feed, form feed, carriage return, and space, i.e. /[\t\n\f\r ]/).
- Does not require the input string to have padding = characters.
const input = `
5Ovv
9vw
`
atob(input); // "äëïöü"
base64decode(input); // Throws "Unable to parse base64 string."
Alternatives
You don't need a library to handle Base64 encoding/decoding. You can use btoa
and atob
. Here are a couple of short alternatives that use btoa
and atob
under the hood. And as a bonus, you get the forgiving-base64 decode algorithm.
/**
 * Encodes bytes as base64 using the built-in btoa.
 *
 * The bytes are converted to a binary string in fixed-size chunks. Spreading
 * the whole input into a single String.fromCharCode call passes one argument
 * per byte and throws a RangeError once the input exceeds the engine's
 * argument limit.
 *
 * @param {Iterable<number>} bytes - Byte values; a Uint8Array, an array, or any iterable.
 * @returns {string} Base64 text produced by btoa.
 */
export function bytesToBase64(bytes) {
  const CHUNK_SIZE = 0x8000;
  // Arrays and Uint8Arrays can be sliced directly; other iterables are materialised first.
  const list = bytes instanceof Uint8Array || Array.isArray(bytes) ? bytes : Array.from(bytes);
  let binary = "";
  for (let start = 0; start < list.length; start += CHUNK_SIZE) {
    binary += String.fromCharCode.apply(null, list.slice(start, start + CHUNK_SIZE));
  }
  return btoa(binary);
}
/**
 * Decodes base64 into bytes using the built-in atob.
 *
 * @param {string} str - Base64 text, passed to atob unchanged.
 * @returns {Uint8Array} One byte per character of atob's binary-string result.
 */
export function base64ToBytes(str) {
  const binary = atob(str);
  const decoded = new Uint8Array(binary.length);
  for (let i = 0; i < binary.length; ++i) {
    decoded[i] = binary.charCodeAt(i);
  }
  return decoded;
}
/**
 * Encodes text to base64: the text is turned into bytes by `encoder`
 * (a new TextEncoder by default) and the bytes go through bytesToBase64.
 */
export function base64encode(str, encoder = new TextEncoder()) {
  const encodedBytes = encoder.encode(str);
  return bytesToBase64(encodedBytes);
}
/**
 * Decodes base64 to text: the input goes through base64ToBytes and the
 * resulting bytes are turned into text by `decoder` (a new TextDecoder by default).
 */
export function base64decode(str, decoder = new TextDecoder()) {
  const decodedBytes = base64ToBytes(str);
  return decoder.decode(decodedBytes);
}
You can find a TypeScript version of this alternative including base64url support in both JS and TS in this gist.
@jordanbtucker great explanation
Which one is correct? Well, they both are.
I would argue that both btoa
and base64encode
are incorrect (and confusing).
btoa
is supposed to stand for "binary to ASCII", but btoa
actually takes a string (and not binary / bytes) as an input. So, it is not actually btoa
, but more like stoa
, with a hidden mechanism for converting each character of the input string to binary - this char to byte mechanism is not clear from the function signature, and therefore causes confusion.
Your implementation of base64encode
suffers from the same problem.
I think it is better not to create string -> string "base64" functions - base64 encoding / decoding is supposed to be to / from an array / stream of bytes to / from an array / stream of ASCII characters. In other words, it is great that you have created bytesToBase64
and base64ToBytes
- in my humble opinion, I think you should not publish the base64encode
and base64decode
functions - these latter functions just make things confusing.
@vijtheveg I agree that the name btoa
is far from intuitive. I would never expect a function with "to ASCII" in its name to return a string. ASCII is an encoding for converting text into bytes, so I would expect a "to ASCII" function to return bytes, not a string. Of course, once you realize that "ASCII" in this context really means "Base64", it makes sense why it returns a string, since Base64 is meant to encode binary data as text, but naming it btoa
(binary to ASCII) is definitely confusing.
However, stoa
is an even less intuitive name for what it actually does. Typically when we think of a string, we think of something that holds text. So, if I have a function essentially called "string to ASCII", I would assume that it would take a string as text that contains only ASCII supported characters and encode it into ASCII, returning an array of bytes. Again, I would not expect it to encode anything to Base64 based on its name.
Even if btoa
was renamed to "string to Base64", it would still be confusing considering what it really does. I would expect it to take a string as text and encode it with a scheme like UTF-8 and then encode that as Base64. Which is exactly what this gist's base64encode
does.
What I'm getting at is that the B in btoa
is what tips you off to the fact that the string it takes as input is not text. It's a "binary string", meaning that it must only contain characters with code units in the range of 0-255. Probably the best name for it would be binaryStringToBase64
.
I also agree that this gist's base64encode
is not a good name for what it does. In my opinion, it should be called textToBase64
since that's its purpose, and its documentation should mention that it first encodes the text as UTF-8 before encoding it to Base64.
Push this as a micro-lib - or someone will 😃