Last active
July 16, 2021 15:40
-
-
Save drewwiens/99e94c7f0eb599807a8c91ad6688b270 to your computer and use it in GitHub Desktop.
JavaScript String Compression of Filenames in a URL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// JavaScript String Compression of Filenames in a URL | |
// | |
// Problem: Reduce length of URL caused by storing a list of long filenames in URL query parameter. | |
// | |
// Solution: Switch between a 5-bit alphabet of letters and 4-bit alphabet of numbers. Use a special | |
// character to switch between lowercase and uppercase ("shift"). Represent the resulting bit | |
// string as a URL-safe base62 encoded string. | |
// | |
// Results: From my test set of ~100 strings from real filenames on a project: | |
// avg compression ratio: 0.87 | |
// max compression ratio: 0.97 | |
// min compression ratio: 0.80 | |
// | |
// Key points: | |
// 1. Letters and numbers inside the filenames tend to be grouped together. | |
// 2. The only symbol chars are ".", "-", " ", and "_". | |
// 3. Unicode and UTF-8 chars are passed via a special escape character. | |
// 4. The decode function is TODO. | |
// | |
const _ = require("lodash"); | |
const baseX = require("base-x"); | |
/** | |
* Convert a number to a bit string e.g. "10101...", padding 0's at the front to | |
* make the resulting bit string length equal to len. | |
* | |
* @param {number} num Number to convert into a bit string. | |
* @param {number} len Desired length of resulting bit string. | |
*/ | |
function toBitString(num, len) { | |
const bitString = num.toString(2); | |
let padByte = len - bitString.length; | |
return "0".repeat(padByte) + bitString; | |
} | |
const letters = _.mapValues( | |
{ | |
switchToNums: 0, | |
shift: 1, | |
".": 2, | |
"-": 3, | |
_: 4, | |
a: 5, | |
b: 6, | |
c: 7, | |
d: 8, | |
e: 9, | |
f: 10, | |
g: 11, | |
h: 12, | |
i: 13, | |
j: 14, | |
k: 15, | |
l: 16, | |
m: 17, | |
n: 18, | |
o: 19, | |
p: 20, | |
q: 21, | |
r: 22, | |
s: 23, | |
t: 24, | |
u: 25, | |
v: 26, | |
w: 27, | |
x: 28, | |
y: 29, | |
z: 30, | |
" ": 31, | |
}, | |
(k) => toBitString(k, 5) // Use 5 bits per "letter" | |
); | |
const numbers = _.mapValues( | |
{ | |
0: 0, | |
1: 1, | |
2: 2, | |
3: 3, | |
4: 4, | |
5: 5, | |
6: 6, | |
7: 7, | |
8: 8, | |
9: 9, | |
".": 10, | |
"-": 11, | |
_: 12, | |
switchToLetters: 13, | |
" ": 14, | |
escapeNextChar: 15, // Allow Unicode & UTF-8 as a fall-back | |
}, | |
(k) => toBitString(k, 4) // Use 4 bits per "number" | |
); | |
const onlyLetters = new Set([ | |
"a", | |
"b", | |
"c", | |
"d", | |
"e", | |
"f", | |
"g", | |
"h", | |
"i", | |
"j", | |
"k", | |
"l", | |
"m", | |
"n", | |
"o", | |
"p", | |
"q", | |
"r", | |
"s", | |
"t", | |
"u", | |
"v", | |
"w", | |
"x", | |
"y", | |
"z", | |
]); | |
const letterChars = new Set(_.keys(letters).filter((l) => l.length === 1)); | |
const numberChars = new Set(_.keys(numbers).filter((n) => n.length === 1)); | |
// The alphabet for base62 as per the docs: https://www.npmjs.com/package/base-x | |
const base62 = baseX( | |
"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" | |
); | |
/** | |
* Convert a bit string e.g. "10101..." into base62 | |
* | |
* @param {string} bitString | |
* @returns base62 | |
*/ | |
function bitStringToBase62(bitString) { | |
let padByte = 8 - (bitString.length % 8), | |
encoded = []; | |
bitString = "0".repeat(padByte) + bitString; | |
for (let i = bitString.length - 1; i >= 0; i -= 8) { | |
encoded.unshift(parseInt(bitString.substr(i, 8), 2)); | |
} | |
const byteArray = new Uint8Array(encoded); | |
return base62.encode(byteArray); | |
} | |
/** | |
* Convert `input` using custom alphabet and encoding scheme. | |
* | |
* @param {string} input | |
* @returns base62 of the encoded string | |
*/ | |
function encode(input) { | |
let output = ""; | |
// Record initial casing: | |
const firstLetter = input | |
.split("") | |
.find((c) => onlyLetters.has(c.toLowerCase())); | |
let uppercase = !!firstLetter && firstLetter === firstLetter.toUpperCase(); | |
output += uppercase ? "1" : "0"; | |
// Record initial alphabet: | |
let letter = letterChars.has(input[0].toLowerCase()); | |
output += letter ? "1" : "0"; | |
for (let char of input) { | |
const charLower = char.toLowerCase(); | |
// Check if char needs to be escaped to Unicode or UTF-8: | |
if (!letterChars.has(charLower) && !numberChars.has(char)) { | |
if (letter) { | |
output += letters.switchToNums; | |
output += numbers.escapeNextChar; | |
const charCode = char.charCodeAt(0); | |
const isUnicode = charCode > 255; | |
output += isUnicode ? "1" : "0"; // Indicate Unicode (1) or UTF-8 (0) | |
output += toBitString(char.charCodeAt(0), isUnicode ? 16 : 8); | |
} | |
} | |
// Check if need to switch char set: | |
if (letter && !letterChars.has(charLower)) { | |
output += letters.switchToNums; | |
letter = false; | |
} else if (!letter && !numberChars.has(char)) { | |
output += numbers.switchToLetters; | |
letter = true; | |
} | |
// Check if need to switch casing: | |
if (letter) { | |
if (!uppercase && char !== char.toLowerCase()) { | |
output += letters.shift; | |
uppercase = true; | |
} else if (uppercase && char !== char.toUpperCase()) { | |
output += letters.shift; | |
uppercase = false; | |
} | |
} | |
// Encode the character: | |
output += (letter ? letters : numbers)[charLower]; | |
} | |
// Return the encoded string in URL-safe base62: | |
return bitStringToBase62(output); | |
} | |
const testStrs = [ /* Your test strings here */ ] | |
let avg = 0, | |
max = 0, | |
min = Infinity; | |
for (let testStr of testStrs) { | |
const encoded = encode(testStr); | |
console.warn(`"${testStr}"\t\t`, testStr.length); | |
console.warn(`"${encoded}"\t\t`, encoded.length); | |
const comprRatio = encoded.length / testStr.length; | |
max = Math.max(max, comprRatio); | |
min = Math.min(min, comprRatio); | |
avg += comprRatio / testStrs.length; | |
} | |
console.warn("================="); | |
console.warn("avg compression ratio:", avg); | |
console.warn("max compression ratio:", max); | |
console.warn("min compression ratio:", min); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment