drewwiens · July 16, 2021 15:40
diff --git a/compress-filename-in-url.js b/compress-filename-in-url.js
 // JavaScript String Compression of Filenames in a URL
 //
 // Problem: Reduce length of URL caused by storing a list of long filenames in URL query parameter.
 //
 // Solution: Switch between a 5-bit alphabet of letters and 4-bit alphabet of numbers. Use a special
 //   character to switch between lowercase and uppercase ("shift"). Represent the resulting bit
 //   string as a URL-safe base62 encoded string.
 //
 // Results: From my test set of ~100 strings from real filenames on a project:
 //   avg compression ratio: 0.87
 //   max compression ratio: 0.97
 //   min compression ratio: 0.80
 //
 // Key points:
 //   1. Letters and numbers inside the filenames tend to be grouped together.
 //   2. The only symbol chars are ".", "-", " ", and "_".
 //   3. Unicode and UTF-8 chars are passed via a special escape character.
 //   4. The decode function is TODO.
 //

 const _ = require("lodash");
 const baseX = require("base-x");

 /**
 * Convert a number to a bit string e.g. "10101...", padding 0's at the front to
 * make the resulting bit string length equal to len.
 *
 * @param {number} num Number to convert into a bit string.
 * @param {number} len Desired length of resulting bit string.
 */
 function toBitString(num, len) {
  const bitString = num.toString(2);
  let padByte = len - bitString.length;
  return "0".repeat(padByte) + bitString;
 }

 const letters = _.mapValues(
  {
    switchToNums: 0,
    shift: 1,
    ".": 2,
    "-": 3,
    _: 4,
    a: 5,
    b: 6,
    c: 7,
    d: 8,
    e: 9,
    f: 10,
    g: 11,
    h: 12,
    i: 13,
    j: 14,
    k: 15,
    l: 16,
    m: 17,
    n: 18,
    o: 19,
    p: 20,
    q: 21,
    r: 22,
    s: 23,
    t: 24,
    u: 25,
    v: 26,
    w: 27,
    x: 28,
    y: 29,
    z: 30,
    " ": 31,
  },
  (k) => toBitString(k, 5) // Use 5 bits per "letter"
 );

 const numbers = _.mapValues(
  {
    0: 0,
    1: 1,
    2: 2,
    3: 3,
    4: 4,
    5: 5,
    6: 6,
    7: 7,
    8: 8,
    9: 9,
    ".": 10,
    "-": 11,
    _: 12,
    switchToLetters: 13,
    " ": 14,
    escapeNextChar: 15, // Allow Unicode & UTF-8 as a fall-back
  },
  (k) => toBitString(k, 4) // Use 4 bits per "number"
 );

 const onlyLetters = new Set([
  "a",
  "b",
  "c",
  "d",
  "e",
  "f",
  "g",
  "h",
  "i",
  "j",
  "k",
  "l",
  "m",
  "n",
  "o",
  "p",
  "q",
  "r",
  "s",
  "t",
  "u",
  "v",
  "w",
  "x",
  "y",
  "z",
 ]);

 const letterChars = new Set(_.keys(letters).filter((l) => l.length === 1));
 const numberChars = new Set(_.keys(numbers).filter((n) => n.length === 1));

 // The alphabet for base62 as per the docs: https://www.npmjs.com/package/base-x
 const base62 = baseX(
  "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
 );

 /**
 * Convert a bit string e.g. "10101..." into base62
 *
 * @param {string} bitString
 * @returns base62
 */
 function bitStringToBase62(bitString) {
  let padByte = 8 - (bitString.length % 8),
    encoded = [];
  bitString = "0".repeat(padByte) + bitString;

  for (let i = bitString.length - 1; i >= 0; i -= 8) {
    encoded.unshift(parseInt(bitString.substr(i, 8), 2));
  }

  const byteArray = new Uint8Array(encoded);
  return base62.encode(byteArray);
 }

 /**
 * Convert `input` using custom alphabet and encoding scheme.
 *
 * @param {string} input
 * @returns base62 of the encoded string
 */
 function encode(input) {
  let output = "";

  // Record initial casing:
  const firstLetter = input
    .split("")
    .find((c) => onlyLetters.has(c.toLowerCase()));
  let uppercase = !!firstLetter && firstLetter === firstLetter.toUpperCase();
  output += uppercase ? "1" : "0";

  // Record initial alphabet:
  let letter = letterChars.has(input[0].toLowerCase());
  output += letter ? "1" : "0";

  for (let char of input) {
    const charLower = char.toLowerCase();

    // Check if char needs to be escaped to Unicode or UTF-8:
    if (!letterChars.has(charLower) && !numberChars.has(char)) {
      if (letter) {
        output += letters.switchToNums;
        output += numbers.escapeNextChar;
        const charCode = char.charCodeAt(0);
        const isUnicode = charCode > 255;
        output += isUnicode ? "1" : "0"; // Indicate Unicode (1) or UTF-8 (0)
        output += toBitString(char.charCodeAt(0), isUnicode ? 16 : 8);
      }
    }

    // Check if need to switch char set:
    if (letter && !letterChars.has(charLower)) {
      output += letters.switchToNums;
      letter = false;
    } else if (!letter && !numberChars.has(char)) {
      output += numbers.switchToLetters;
      letter = true;
    }

    // Check if need to switch casing:
    if (letter) {
      if (!uppercase && char !== char.toLowerCase()) {
        output += letters.shift;
        uppercase = true;
      } else if (uppercase && char !== char.toUpperCase()) {
        output += letters.shift;
        uppercase = false;
      }
    }

    // Encode the character:
    output += (letter ? letters : numbers)[charLower];
  }

  // Return the encoded string in URL-safe base62:
  return bitStringToBase62(output);
 }

 const testStrs = [ /* Your test strings here */ ]

 let avg = 0,
  max = 0,
  min = Infinity;
 for (let testStr of testStrs) {
  const encoded = encode(testStr);
  console.warn(`"${testStr}"\t\t`, testStr.length);
  console.warn(`"${encoded}"\t\t`, encoded.length);
  const comprRatio = encoded.length / testStr.length;
  max = Math.max(max, comprRatio);
  min = Math.min(min, comprRatio);
  avg += comprRatio / testStrs.length;
 }
 console.warn("=================");
 console.warn("avg compression ratio:", avg);
 console.warn("max compression ratio:", max);
 console.warn("min compression ratio:", min);
	// JavaScript String Compression of Filenames in a URL
	//
	// Problem: Reduce length of URL caused by storing a list of long filenames in URL query parameter.
	//
	// Solution: Switch between a 5-bit alphabet of letters and 4-bit alphabet of numbers. Use a special
	// character to switch between lowercase and uppercase ("shift"). Represent the resulting bit
	// string as a URL-safe base62 encoded string.
	//
	// Results: From my test set of ~100 strings from real filenames on a project:
	// avg compression ratio: 0.87
	// max compression ratio: 0.97
	// min compression ratio: 0.80
	//
	// Key points:
	// 1. Letters and numbers inside the filenames tend to be grouped together.
	// 2. The only symbol chars are ".", "-", " ", and "_".
	// 3. Unicode and UTF-8 chars are passed via a special escape character.
	// 4. The decode function is TODO.
	//

	const _ = require("lodash");
	const baseX = require("base-x");

	/**
	* Convert a number to a bit string e.g. "10101...", padding 0's at the front to
	* make the resulting bit string length equal to len.
	*
	* @param {number} num Number to convert into a bit string.
	* @param {number} len Desired length of resulting bit string.
	*/
	function toBitString(num, len) {
	const bitString = num.toString(2);
	let padByte = len - bitString.length;
	return "0".repeat(padByte) + bitString;
	}

	const letters = _.mapValues(
	{
	switchToNums: 0,
	shift: 1,
	".": 2,
	"-": 3,
	_: 4,
	a: 5,
	b: 6,
	c: 7,
	d: 8,
	e: 9,
	f: 10,
	g: 11,
	h: 12,
	i: 13,
	j: 14,
	k: 15,
	l: 16,
	m: 17,
	n: 18,
	o: 19,
	p: 20,
	q: 21,
	r: 22,
	s: 23,
	t: 24,
	u: 25,
	v: 26,
	w: 27,
	x: 28,
	y: 29,
	z: 30,
	" ": 31,
	},
	(k) => toBitString(k, 5) // Use 5 bits per "letter"
	);

	const numbers = _.mapValues(
	{
	0: 0,
	1: 1,
	2: 2,
	3: 3,
	4: 4,
	5: 5,
	6: 6,
	7: 7,
	8: 8,
	9: 9,
	".": 10,
	"-": 11,
	_: 12,
	switchToLetters: 13,
	" ": 14,
	escapeNextChar: 15, // Allow Unicode & UTF-8 as a fall-back
	},
	(k) => toBitString(k, 4) // Use 4 bits per "number"
	);

	const onlyLetters = new Set([
	"a",
	"b",
	"c",
	"d",
	"e",
	"f",
	"g",
	"h",
	"i",
	"j",
	"k",
	"l",
	"m",
	"n",
	"o",
	"p",
	"q",
	"r",
	"s",
	"t",
	"u",
	"v",
	"w",
	"x",
	"y",
	"z",
	]);

	const letterChars = new Set(_.keys(letters).filter((l) => l.length === 1));
	const numberChars = new Set(_.keys(numbers).filter((n) => n.length === 1));

	// The alphabet for base62 as per the docs: https://www.npmjs.com/package/base-x
	const base62 = baseX(
	"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
	);

	/**
	* Convert a bit string e.g. "10101..." into base62
	*
	* @param {string} bitString
	* @returns base62
	*/
	function bitStringToBase62(bitString) {
	let padByte = 8 - (bitString.length % 8),
	encoded = [];
	bitString = "0".repeat(padByte) + bitString;

	for (let i = bitString.length - 1; i >= 0; i -= 8) {
	encoded.unshift(parseInt(bitString.substr(i, 8), 2));
	}

	const byteArray = new Uint8Array(encoded);
	return base62.encode(byteArray);
	}

	/**
	* Convert `input` using custom alphabet and encoding scheme.
	*
	* @param {string} input
	* @returns base62 of the encoded string
	*/
	function encode(input) {
	let output = "";

	// Record initial casing:
	const firstLetter = input
	.split("")
	.find((c) => onlyLetters.has(c.toLowerCase()));
	let uppercase = !!firstLetter && firstLetter === firstLetter.toUpperCase();
	output += uppercase ? "1" : "0";

	// Record initial alphabet:
	let letter = letterChars.has(input[0].toLowerCase());
	output += letter ? "1" : "0";

	for (let char of input) {
	const charLower = char.toLowerCase();

	// Check if char needs to be escaped to Unicode or UTF-8:
	if (!letterChars.has(charLower) && !numberChars.has(char)) {
	if (letter) {
	output += letters.switchToNums;
	output += numbers.escapeNextChar;
	const charCode = char.charCodeAt(0);
	const isUnicode = charCode > 255;
	output += isUnicode ? "1" : "0"; // Indicate Unicode (1) or UTF-8 (0)
	output += toBitString(char.charCodeAt(0), isUnicode ? 16 : 8);
	}
	}

	// Check if need to switch char set:
	if (letter && !letterChars.has(charLower)) {
	output += letters.switchToNums;
	letter = false;
	} else if (!letter && !numberChars.has(char)) {
	output += numbers.switchToLetters;
	letter = true;
	}

	// Check if need to switch casing:
	if (letter) {
	if (!uppercase && char !== char.toLowerCase()) {
	output += letters.shift;
	uppercase = true;
	} else if (uppercase && char !== char.toUpperCase()) {
	output += letters.shift;
	uppercase = false;
	}
	}

	// Encode the character:
	output += (letter ? letters : numbers)[charLower];
	}

	// Return the encoded string in URL-safe base62:
	return bitStringToBase62(output);
	}

	const testStrs = [ /* Your test strings here */ ]

	let avg = 0,
	max = 0,
	min = Infinity;
	for (let testStr of testStrs) {
	const encoded = encode(testStr);
	console.warn(`"${testStr}"\t\t`, testStr.length);
	console.warn(`"${encoded}"\t\t`, encoded.length);
	const comprRatio = encoded.length / testStr.length;
	max = Math.max(max, comprRatio);
	min = Math.min(min, comprRatio);
	avg += comprRatio / testStrs.length;
	}
	console.warn("=================");
	console.warn("avg compression ratio:", avg);
	console.warn("max compression ratio:", max);
	console.warn("min compression ratio:", min);