Created
November 13, 2017 00:28
-
-
Save braska/fc45a300adc2466fae32c4b256f008a8 to your computer and use it in GitHub Desktop.
Huffman text encoder/decoder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"flag" | |
"fmt" | |
"os" | |
"bufio" | |
"math" | |
"bytes" | |
"encoding/gob" | |
"github.com/olekukonko/tablewriter" | |
"github.com/dustin/go-humanize" | |
) | |
func usage() { | |
fmt.Fprintf(os.Stderr, "usage: %s [command] [inputfile] [outputfile]\n", os.Args[0]) | |
flag.PrintDefaults() | |
os.Exit(2) | |
} | |
func check(e error) { | |
if e != nil { | |
panic(e) | |
} | |
} | |
type EncodingMode int | |
const ( | |
MODE_ONE EncodingMode = iota | |
MODE_TWO EncodingMode = iota | |
) | |
type huffmanCode struct { | |
Code uint32 | |
CodeLen uint8 | |
} | |
func (hcode huffmanCode) asString() string { | |
var buffer bytes.Buffer | |
for i := hcode.CodeLen; i > 0; i-- { | |
n := hcode.Code & (1 << (i - 1)) | |
var t string | |
if n > 0 { | |
t = "1" | |
} else { | |
t = "0" | |
} | |
buffer.WriteString(t) | |
} | |
return buffer.String() | |
} | |
type huffmanCodes map[string]*huffmanCode | |
func findMinKeyInMap(m map[string]int) (k string) { | |
var minValue int | |
isFirstIter := true | |
for key, value := range m { | |
if isFirstIter { | |
minValue = value | |
k = key | |
isFirstIter = false | |
} | |
if value < minValue { | |
minValue = value | |
k = key | |
} | |
} | |
return | |
} | |
func encode(mode EncodingMode, inputf *os.File, outputf *os.File) { | |
table := tablewriter.NewWriter(os.Stdout) | |
countRunes := 0 | |
vertices := make(map[string]int) | |
combinations := make(map[string]int) | |
var prevRune string | |
scanner := bufio.NewScanner(inputf) | |
scanner.Split(bufio.ScanRunes) | |
for scanner.Scan() { | |
s := scanner.Text() | |
vertices[s] += 1 | |
countRunes += 1 | |
if prevRune != "" { | |
combination := prevRune + s | |
combinations[combination] += 1 | |
} | |
prevRune = s | |
} | |
countCombinations := countRunes - 1 | |
table.Append([]string{"Runes", fmt.Sprint(countRunes)}) | |
table.Append([]string{"Alphabet power", fmt.Sprint(len(vertices))}) | |
entropy := 0.0 | |
for _, value := range vertices { | |
p_i := float64(value) / float64(countRunes) | |
entropy -= p_i * math.Log2(p_i) | |
} | |
table.Append([]string{"Entropy", fmt.Sprint(entropy)}) | |
conditionalEntropy := 0.0 | |
for s1, count1 := range vertices { | |
p_i := float64(count1) / float64(countRunes) | |
for s2 := range vertices { | |
countForCombination, ok := combinations[s1 + s2] | |
if ok { | |
p_i_j := float64(countForCombination) / float64(countCombinations) | |
conditionalEntropy -= p_i_j * math.Log2(p_i_j / p_i) | |
} | |
} | |
} | |
table.Append([]string{"Conditional entropy", fmt.Sprint(conditionalEntropy)}) | |
tableOfCodes := tablewriter.NewWriter(os.Stdout) | |
codes := make(huffmanCodes) | |
countBitsForAllText := 0 | |
var huffmanTreeVertices map[string]int | |
if mode == MODE_ONE { | |
huffmanTreeVertices = vertices | |
} else { | |
huffmanTreeVertices = combinations | |
} | |
for len(huffmanTreeVertices) > 1 { | |
key1 := findMinKeyInMap(huffmanTreeVertices) | |
value1 := huffmanTreeVertices[key1] | |
delete(huffmanTreeVertices, key1) | |
key2 := findMinKeyInMap(huffmanTreeVertices) | |
value2 := huffmanTreeVertices[key2] | |
delete(huffmanTreeVertices, key2) | |
// if this is just leaf in huffman tree (single rune) | |
if ((mode == MODE_ONE && len([]rune(key1)) == 1) || (mode == MODE_TWO && len([]rune(key1)) == 2)) { | |
// key1 is a string from left branch => code=0 | |
codes[key1] = &huffmanCode{Code: 0, CodeLen: 1} | |
countBitsForAllText += value1 | |
} else { | |
if mode == MODE_ONE { | |
// for all runes from key1 add ZERO as a prefix for code | |
// adding zero as a prefix is the same as increment codeLen | |
for _, r := range key1 { | |
codes[string(r)].CodeLen += 1 | |
} | |
} else { | |
var prevSym string | |
for _, r := range key1 { | |
if prevSym != "" { | |
combination := prevSym + string(r) | |
codes[combination].CodeLen += 1 | |
prevSym = "" | |
} else { | |
prevSym = string(r) | |
} | |
} | |
} | |
} | |
if ((mode == MODE_ONE && len([]rune(key2)) == 1) || (mode == MODE_TWO && len([]rune(key2)) == 2)) { | |
codes[key2] = &huffmanCode{Code: 1, CodeLen: 1} | |
countBitsForAllText += value2 | |
} else { | |
if mode == MODE_ONE { | |
for _, r := range key2 { | |
k := string(r) | |
// set bit to 1 at specific position | |
codes[k].Code |= (1 << codes[k].CodeLen) | |
codes[k].CodeLen += 1 | |
} | |
} else { | |
var prevSym string | |
for _, r := range key2 { | |
if prevSym != "" { | |
combination := prevSym + string(r) | |
// set bit to 1 at specific position | |
codes[combination].Code |= (1 << codes[combination].CodeLen) | |
codes[combination].CodeLen += 1 | |
prevSym = "" | |
} else { | |
prevSym = string(r) | |
} | |
} | |
} | |
} | |
huffmanTreeVertices[key1 + key2] = value1 + value2 | |
} | |
enc := gob.NewEncoder(outputf) | |
err := enc.Encode(codes) | |
check(err) | |
err = enc.Encode(countBitsForAllText) | |
check(err) | |
for r, code := range codes { | |
tableOfCodes.Append([]string{string(r), fmt.Sprint(code.asString())}) | |
} | |
inputf.Seek(0, 0) | |
scanner2 := bufio.NewScanner(inputf) | |
scanner2.Split(bufio.ScanRunes) | |
var tmpByte byte | |
var bitsInTmpByte uint8 = 0 | |
var prevSym string | |
for scanner2.Scan() { | |
s := scanner2.Text() | |
if mode == MODE_TWO { | |
if prevSym == "" { | |
prevSym = s | |
continue | |
} | |
s = prevSym + s | |
prevSym = "" | |
} | |
hcode := *codes[s] | |
for hcode.CodeLen > 0 { | |
shift := int(hcode.CodeLen) - (8 - int(bitsInTmpByte)) | |
var bitsMustBeWritenToTmpByte uint8 = 0 | |
if shift >= 0 { | |
tmpByte += byte((hcode.Code >> uint(shift)) & 0xff) | |
bitsMustBeWritenToTmpByte = 8 - bitsInTmpByte | |
bitsInTmpByte = 8 | |
} else if shift < 0 { | |
tmpByte += byte((hcode.Code << uint(shift * -1)) & 0xff) | |
bitsMustBeWritenToTmpByte = hcode.CodeLen | |
bitsInTmpByte += hcode.CodeLen | |
} | |
if shift <= 0 { | |
hcode.CodeLen = 0 | |
} else { | |
hcode.CodeLen -= bitsMustBeWritenToTmpByte | |
} | |
if bitsInTmpByte == 8 { | |
_, err = outputf.Write([]byte{tmpByte}) | |
check(err) | |
bitsInTmpByte = 0 | |
tmpByte = 0 | |
} | |
} | |
} | |
if bitsInTmpByte > 0 { | |
_, err = outputf.Write([]byte{tmpByte}) | |
check(err) | |
bitsInTmpByte = 0 | |
tmpByte = 0 | |
} | |
outputf.Sync() | |
tableOfStats := tablewriter.NewWriter(os.Stdout) | |
inputfStat, istatErr := inputf.Stat() | |
check(istatErr) | |
outputfStat, ostatErr := outputf.Stat() | |
check(ostatErr) | |
inputfSize := inputfStat.Size() | |
outputfSize := outputfStat.Size() | |
tableOfStats.Append([]string{"Input file size", humanize.Bytes(uint64(inputfSize))}) | |
tableOfStats.Append([]string{"Output file size", humanize.Bytes(uint64(outputfSize))}) | |
tableOfStats.Append([]string{"Bits per rune in input file", fmt.Sprint(float64(inputfSize * 8) / float64(countRunes))}) | |
tableOfStats.Append([]string{"Bits per rune in output file", fmt.Sprint(float64(outputfSize * 8) / float64(countRunes))}) | |
table.Render() | |
tableOfCodes.Render() | |
tableOfStats.Render() | |
} | |
func decode(inputf *os.File, outputf *os.File) { | |
dec := gob.NewDecoder(inputf) | |
var codes = new(huffmanCodes) | |
var countBitsForAllText int | |
err := dec.Decode(codes) | |
check(err) | |
err = dec.Decode(&countBitsForAllText) | |
check(err) | |
fmt.Println(codes) | |
fmt.Println(countBitsForAllText) | |
} | |
func main() { | |
flag.Usage = usage | |
flag.Parse() | |
args := flag.Args() | |
if len(args) < 1 { | |
fmt.Println("Command is missing.") | |
os.Exit(1) | |
} | |
if len(args) < 2 { | |
fmt.Println("Input file is missing.") | |
os.Exit(1) | |
} | |
if len(args) < 3 { | |
fmt.Println("Output file is missing.") | |
os.Exit(1) | |
} | |
inputPath := os.Args[2] | |
outputPath := os.Args[3] | |
inputf, inputerr := os.Open(inputPath) | |
check(inputerr) | |
outputf, outputerr := os.Create(outputPath) | |
check(outputerr) | |
defer inputf.Close() | |
defer outputf.Close() | |
if os.Args[1] == "encode" { | |
encode(MODE_ONE, inputf, outputf) | |
} else if os.Args[1] == "encode2" { | |
encode(MODE_TWO, inputf, outputf) | |
} else if os.Args[1] == "decode" { | |
decode(inputf, outputf) | |
} else { | |
fmt.Fprintf(os.Stderr, "%s: '%s' is not a %s command.\n", os.Args[0], os.Args[1], os.Args[0]) | |
os.Exit(1) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment