revolunet/lzw_encoder.js

Andrews54757 · 2018-12-27T21:04:59Z

It's now 2019! Use a `Map()`!

Most modern browsers now support the Map() object. This means, that instead of using a plain JS object for the dict, one could use a Map. This would solve the problem described by @jayphelps without using the prefixes in @lerouxb 's solution.

function lzw_encode(s) {
    if (!s) return s;
    var dict = new Map(); // Use a Map!
    var data = (s + "").split("");
    var out = [];
    var currChar;
    var phrase = data[0];
    var code = 256;
    for (var i = 1; i < data.length; i++) {
        currChar = data[i];
        if (dict.has(phrase + currChar)) {
            phrase += currChar;
        } else {
            out.push(phrase.length > 1 ? dict.get(phrase) : phrase.charCodeAt(0));
            dict.set(phrase + currChar, code);
            code++;
            phrase = currChar;
        }
    }
    out.push(phrase.length > 1 ? dict.get(phrase) : phrase.charCodeAt(0));
    for (var i = 0; i < out.length; i++) {
        out[i] = String.fromCharCode(out[i]);
    }
    return out.join("");
}

function lzw_decode(s) {
    var dict = new Map(); // Use a Map!
    var data = (s + "").split("");
    var currChar = data[0];
    var oldPhrase = currChar;
    var out = [currChar];
    var code = 256;
    var phrase;
    for (var i = 1; i < data.length; i++) {
        var currCode = data[i].charCodeAt(0);
        if (currCode < 256) {
            phrase = data[i];
        } else {
            phrase = dict.has(currCode) ? dict.get(currCode) : (oldPhrase + currChar);
        }
        out.push(phrase);
        currChar = phrase.charAt(0);
        dict.set(code, oldPhrase + currChar);
        code++;
        oldPhrase = phrase;
    }
    return out.join("");
}

In addition, encoding the string as UTF8 should not be done with the escape and unescape functions anymore as they are now deprecated. Instead, use this solution described in MDN docs.

function encode_utf8(s) {
    return encodeURIComponent(s).replace(/%([0-9A-F]{2})/g,
        function toSolidBytes(match, p1) {
            return String.fromCharCode('0x' + p1);
        });
}

function decode_utf8(s) {
    return decodeURIComponent(s.split('').map(function(c) {
        return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
    }).join(''));
}

martingraham · 2019-12-09T15:46:56Z

Just a note because I happened to run across this. If your data consists of utf32 characters, start your dictionary index at 0xe000, that is:
var code = 57344;
This skips over the endpoint values from 0xd800 to 0xdfff which will cause illegal character exceptions in your encoder/decoder. Effectively, this increases your maximum dictionary size from about 55k to about 1M.

My data was UTF-16 and I had a similar problem. Firstly, once the code entries got above #10000 the encoding routine only stored the first 16 bits. So I switched Andrews54757 code above to use codePoint functions to return proper values (surrogate pairs) for codes above #10000. I then in turn had to get the incrementing code values in both routines to jump between $d800 and $e000 to avoid code values in that region that would get incorrectly interpreted as surrogate pairs.

// from https://gist.github.com/revolunet/843889

lzw_encode: function (s) {
      if (!s) return s;
      var dict = new Map(); // Use a Map!
      var data = (s + "").split("");
      var out = [];
      var currChar;
      var phrase = data[0];
      var code = 256;
      for (var i = 1; i < data.length; i++) {
          currChar = data[i];
          if (dict.has(phrase + currChar)) {
              phrase += currChar;
          } else {
              out.push (phrase.length > 1 ? dict.get(phrase) : phrase.codePointAt(0));
              dict.set(phrase + currChar, code);
              code++;
              if (code === 0xd800) { code = 0xe000; }
              phrase = currChar;
          }
      }
      out.push (phrase.length > 1 ? dict.get(phrase) : phrase.codePointAt(0));
      for (var i = 0; i < out.length; i++) {
          out[i] = String.fromCodePoint(out[i]);
      }
      //console.log ("LZW MAP SIZE", dict.size, out.slice (-50), out.length, out.join("").length);
      return out.join("");
  },

  lzw_decode: function (s) {
      var dict = new Map(); // Use a Map!
      var data = Array.from(s + "");
      //var data = (s + "").split("");
      var currChar = data[0];
      var oldPhrase = currChar;
      var out = [currChar];
      var code = 256;
      var phrase;
      for (var i = 1; i < data.length; i++) {
          var currCode = data[i].codePointAt(0);
          if (currCode < 256) {
              phrase = data[i];
          } else {
              phrase = dict.has(currCode) ? dict.get(currCode) : (oldPhrase + currChar);
          }
          out.push(phrase);
          var cp = phrase.codePointAt(0);
          currChar = String.fromCodePoint(cp); //phrase.charAt(0);
          dict.set(code, oldPhrase + currChar);
          code++;
          if (code === 0xd800) { code = 0xe000; }
          oldPhrase = phrase;
      }
      return out.join("");
  },

hrieke · 2020-06-10T13:51:14Z

Awesome code, under which license have you released this under?
Thanks & keep on rock'n.

revolunet · 2020-06-10T14:25:34Z

this code has been took from somewhere and i dont remember where or why... i'm not the author of this code

antonylesuisse · 2021-04-06T03:15:03Z

A version designed for long string storage in UTF-8 encoded files using a base64 encoding. Works with unicode (including surrogates).

https://github.com/antonylesuisse/lzwjs/blob/master/lzw.js

function lzw64_encode(s) {
    if (!s) return s;
    var b64="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
    var d=new Map();
    var s=unescape(encodeURIComponent(s)).split("");
    var word=s[0];
    var num=256;
    var key;
    var o=[];
    function out(word,num) {
        key=word.length>1 ? d.get(word) : word.charCodeAt(0);
        o.push(b64[key&0x3f]);
        o.push(b64[(key>>6)&0x3f]);
        o.push(b64[(key>>12)&0x3f]);
    }
    for (var i=1; i<s.length; i++) {
        var c=s[i];
        if (d.has(word+c)) {
            word+=c;
        } else {
            d.set(word+c,num++);
            out(word,num);
            word=c;
            if(num==(1<<18)-1) {
                d.clear();
                num=256;
            }
        }
    }
    out(word);
    return o.join("");
}

function lzw64_decode(s) {
    var b64="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
    var b64d={};
    for(var i=0; i<64; i++){
        b64d[b64.charAt(i)]=i;
    }
    var d=new Map();
    var num=256;
    var word=String.fromCharCode(b64d[s[0]]+(b64d[s[1]]<<6)+(b64d[s[2]]<<12));
    var prev=word;
    var o=[word];
    for(var i=3; i<s.length; i+=3) {
        var key=b64d[s[i]]+(b64d[s[i+1]]<<6)+(b64d[s[i+2]]<<12);
        word=key<256 ? String.fromCharCode(key) : d.has(key) ? d.get(key) : word+word.charAt(0);
        o.push(word);
        d.set(num++, prev+word.charAt(0));
        prev=word;
        if(num==(1<<18)-1) {
            d.clear();
            num=256;
        }
    }
    return decodeURIComponent(escape(o.join("")));
}

revolunet/lzw_encoder.js

Select an option

No results found

Select an option

No results found

Andrews54757 commented Dec 27, 2018 •

edited

Loading

Uh oh!

martingraham commented Dec 9, 2019 •

edited

Loading

Uh oh!

hrieke commented Jun 10, 2020

Uh oh!

revolunet commented Jun 10, 2020

Uh oh!

antonylesuisse commented Apr 6, 2021 •

edited

Loading

Uh oh!

	// LZW-compress a string
	function lzw_encode(s) {
	var dict = {};
	var data = (s + "").split("");
	var out = [];
	var currChar;
	var phrase = data[0];
	var code = 256;
	for (var i=1; i<data.length; i++) {
	currChar=data[i];
	if (dict[phrase + currChar] != null) {
	phrase += currChar;
	}
	else {
	out.push(phrase.length > 1 ? dict[phrase] : phrase.charCodeAt(0));
	dict[phrase + currChar] = code;
	code++;
	phrase=currChar;
	}
	}
	out.push(phrase.length > 1 ? dict[phrase] : phrase.charCodeAt(0));
	for (var i=0; i<out.length; i++) {
	out[i] = String.fromCharCode(out[i]);
	}
	return out.join("");
	}

	// Decompress an LZW-encoded string
	function lzw_decode(s) {
	var dict = {};
	var data = (s + "").split("");
	var currChar = data[0];
	var oldPhrase = currChar;
	var out = [currChar];
	var code = 256;
	var phrase;
	for (var i=1; i<data.length; i++) {
	var currCode = data[i].charCodeAt(0);
	if (currCode < 256) {
	phrase = data[i];
	}
	else {
	phrase = dict[currCode] ? dict[currCode] : (oldPhrase + currChar);
	}
	out.push(phrase);
	currChar = phrase.charAt(0);
	dict[code] = oldPhrase + currChar;
	code++;
	oldPhrase = phrase;
	}
	return out.join("");
	}

revolunet/lzw_encoder.js

Andrews54757 commented Dec 27, 2018 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

It's now 2019! Use a Map()!

Uh oh!

martingraham commented Dec 9, 2019 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

hrieke commented Jun 10, 2020

Uh oh!

revolunet commented Jun 10, 2020

Uh oh!

antonylesuisse commented Apr 6, 2021 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Andrews54757 commented Dec 27, 2018 •

edited

Loading

It's now 2019! Use a `Map()`!

martingraham commented Dec 9, 2019 •

edited

Loading

antonylesuisse commented Apr 6, 2021 •

edited

Loading