Last active
August 10, 2023 07:49
-
-
Save klauspost/e18f2db63db60b82c4f4d6c7d2d1d856 to your computer and use it in GitHub Desktop.
Simple dict builder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/binary" | |
"flag" | |
"fmt" | |
"io/ioutil" | |
"log" | |
"os" | |
"path/filepath" | |
"sort" | |
) | |
type match struct { | |
hash uint32 | |
n uint32 | |
offset int64 | |
} | |
type matchValue struct { | |
value []byte | |
followBy map[uint32]uint32 | |
preceededBy map[uint32]uint32 | |
} | |
var wantLenFlag = flag.Int("len", 128<<10, "Specify custom output size") | |
var wantHashBytes = flag.Int("hash", 4, "Hash bytes match length") | |
var wantMaxBytes = flag.Int("max", 16<<10, "Max input length to index") | |
var wantOutput = flag.String("o", "dict-out.txt", "Output name") | |
func main() { | |
flag.Parse() | |
matches := make(map[uint32]uint32) | |
offsets := make(map[uint32]int64) | |
var total uint64 | |
base := flag.Arg(0) | |
if base == "" { | |
log.Fatal("no path with files specified") | |
} | |
wantLen := *wantLenFlag | |
hashBytes := *wantHashBytes | |
if hashBytes < 4 || hashBytes > 8 { | |
log.Fatal("-bytes must be >= 4 and <= 8") | |
} | |
maxBytes := *wantMaxBytes + 8 | |
found := make(map[uint32]struct{}) | |
// Index ALL hashes in all files. | |
filepath.Walk(base, func(path string, info os.FileInfo, err error) error { | |
if info.IsDir() { | |
return nil | |
} | |
b, err := ioutil.ReadFile(filepath.Join(base, info.Name())) | |
if err != nil { | |
log.Print(err) | |
return nil | |
} | |
if len(b) < 8 { | |
return nil | |
} | |
if len(b) > maxBytes { | |
b = b[:maxBytes] | |
} | |
for k := range found { | |
delete(found, k) | |
} | |
for i := range b { | |
rem := b[i:] | |
if len(rem) < 8 { | |
break | |
} | |
h := hashLen(binary.LittleEndian.Uint64(rem), 32, uint8(hashBytes)) | |
if _, ok := found[h]; ok { | |
// Only count first occurrence | |
continue | |
} | |
matches[h]++ | |
offsets[h] += int64(i) | |
total++ | |
found[h] = struct{}{} | |
} | |
fmt.Print("\r"+info.Name(), " Indexed...") | |
return nil | |
}) | |
threshold := uint32(total / uint64(len(matches))) | |
fmt.Println("total", total, "match", len(matches), "avg", threshold) | |
sorted := make([]match, 0, len(matches)/2) | |
for k, v := range matches { | |
if v <= threshold { | |
continue | |
} | |
sorted = append(sorted, match{hash: k, n: v, offset: offsets[k]}) | |
} | |
sort.Slice(sorted, func(i, j int) bool { | |
if sorted[i].n == sorted[j].n { | |
return sorted[i].offset < sorted[j].offset | |
} | |
return sorted[i].n > sorted[j].n | |
}) | |
fmt.Println("Sorted len:", len(sorted)) | |
if len(sorted) > wantLen { | |
sorted = sorted[:wantLen] | |
} | |
fmt.Println("Cropped len:", len(sorted), "Lowest occurrence:", sorted[len(sorted)-1].n) | |
wantMatches := make(map[uint32]uint32, len(sorted)) | |
for _, v := range sorted { | |
wantMatches[v.hash] = v.n | |
} | |
output := make(map[uint32]matchValue, len(sorted)) | |
filepath.Walk(base, func(path string, info os.FileInfo, err error) error { | |
if info.IsDir() { | |
return nil | |
} | |
b, err := ioutil.ReadFile(filepath.Join(base, info.Name())) | |
if err != nil { | |
log.Print(err) | |
return nil | |
} | |
if len(b) < 8 { | |
return nil | |
} | |
if len(b) > maxBytes { | |
b = b[:maxBytes] | |
} | |
for i := range b { | |
rem := b[i:] | |
if len(rem) < 8 { | |
break | |
} | |
var prev []byte | |
if i > hashBytes { | |
prev = b[i-hashBytes:] | |
} | |
h := hashLen(binary.LittleEndian.Uint64(rem), 32, uint8(hashBytes)) | |
if _, ok := wantMatches[h]; !ok { | |
continue | |
} | |
mv := output[h] | |
if len(mv.value) == 0 { | |
var tmp = make([]byte, hashBytes) | |
copy(tmp[:], rem) | |
mv.value = tmp[:] | |
} | |
if mv.followBy == nil { | |
mv.followBy = make(map[uint32]uint32, 4) | |
mv.preceededBy = make(map[uint32]uint32, 4) | |
} | |
if len(rem) > hashBytes+8 { | |
// Check if we should add next as well. | |
hNext := hashLen(binary.LittleEndian.Uint64(rem[hashBytes:]), 32, uint8(hashBytes)) | |
if _, ok := wantMatches[hNext]; ok { | |
mv.followBy[hNext]++ | |
} | |
} | |
if len(prev) >= 8 { | |
// Check if we should prev next as well. | |
hPrev := hashLen(binary.LittleEndian.Uint64(prev), 32, uint8(hashBytes)) | |
if _, ok := wantMatches[hPrev]; ok { | |
mv.preceededBy[hPrev]++ | |
} | |
} | |
output[h] = mv | |
} | |
fmt.Print("\r"+info.Name(), " Re-read...") | |
return nil | |
}) | |
dst := make([][]byte, 0, wantLen/hashBytes) | |
for i, e := range sorted { | |
m, ok := output[e.hash] | |
if !ok { | |
// Already added | |
continue | |
} | |
var tmp = make([]byte, 0, hashBytes*2) | |
{ | |
sortedPrev := make([]match, 0, len(m.followBy)) | |
for k, v := range m.preceededBy { | |
if _, ok := output[k]; !ok { | |
continue | |
} | |
sortedPrev = append(sortedPrev, match{ | |
hash: k, | |
n: v, | |
}) | |
} | |
if len(sortedPrev) > 0 { | |
sort.Slice(sortedPrev, func(i, j int) bool { | |
return sortedPrev[i].n > sortedPrev[j].n | |
}) | |
bestPrev := output[sortedPrev[0].hash] | |
tmp = append(tmp, bestPrev.value...) | |
} | |
} | |
tmp = append(tmp, m.value...) | |
delete(output, e.hash) | |
wantLen := e.n / uint32(hashBytes) / 2 | |
for { | |
sortedFollow := make([]match, 0, len(m.followBy)) | |
for k, v := range m.followBy { | |
if _, ok := output[k]; !ok { | |
continue | |
} | |
if v < wantLen { | |
// Not significant enough. | |
continue | |
} | |
sortedFollow = append(sortedFollow, match{ | |
hash: k, | |
n: v, | |
}) | |
} | |
if len(sortedFollow) == 0 { | |
break | |
} | |
sort.Slice(sortedFollow, func(i, j int) bool { | |
return sortedFollow[i].n > sortedFollow[j].n | |
}) | |
nh := sortedFollow[0].hash | |
m, ok = output[nh] | |
if !ok { | |
break | |
} | |
tmp = append(tmp, m.value...) | |
delete(output, nh) | |
} | |
if i < 100 { | |
fmt.Println("") | |
fmt.Printf("ENTRY %d: %q (%d occurrences)", i, string(tmp), e.n) | |
} | |
// Delete substrings already added. | |
if len(tmp) > 8 { | |
for j := range tmp[:len(tmp)-hashBytes] { | |
var t8 [8]byte | |
copy(t8[:], tmp[j:]) | |
if i < 100 { | |
fmt.Println("") | |
fmt.Printf("DELETE %q", string(t8[:hashBytes])) | |
} | |
delete(output, hashLen(binary.LittleEndian.Uint64(t8[:]), 32, uint8(hashBytes))) | |
} | |
} | |
dst = append(dst, tmp) | |
} | |
o, err := os.Create(*wantOutput) | |
if err != nil { | |
panic(err) | |
} | |
defer o.Close() | |
written := 0 | |
for i, toWrite := range dst { | |
if len(toWrite)+written > wantLen { | |
toWrite = toWrite[:wantLen-written] | |
} | |
dst[i] = toWrite | |
written += len(toWrite) | |
if written >= wantLen { | |
dst = dst[:i+1] | |
break | |
} | |
} | |
// Write in reverse order. | |
for i := range dst { | |
toWrite := dst[len(dst)-i-1] | |
o.Write(toWrite) | |
} | |
} | |
const ( | |
prime3bytes = 506832829 | |
prime4bytes = 2654435761 | |
prime5bytes = 889523592379 | |
prime6bytes = 227718039650203 | |
prime7bytes = 58295818150454627 | |
prime8bytes = 0xcf1bbcdcb7a56463 | |
) | |
// hashLen returns a hash of the lowest l bytes of u for a size size of h bytes. | |
// l must be >=4 and <=8. Any other value will return hash for 4 bytes. | |
// h should always be <32. | |
// Preferably h and l should be a constant. | |
// LENGTH 4 is passed straight through | |
func hashLen(u uint64, hashLog, mls uint8) uint32 { | |
switch mls { | |
case 5: | |
return hash5(u, hashLog) | |
case 6: | |
return hash6(u, hashLog) | |
case 7: | |
return hash7(u, hashLog) | |
case 8: | |
return hash8(u, hashLog) | |
default: | |
return uint32(u) | |
} | |
} | |
// hash3 returns the hash of the lower 3 bytes of u to fit in a hash table with h bits. | |
// Preferably h should be a constant and should always be <32. | |
func hash3(u uint32, h uint8) uint32 { | |
return ((u << (32 - 24)) * prime3bytes) >> ((32 - h) & 31) | |
} | |
// hash4 returns the hash of u to fit in a hash table with h bits. | |
// Preferably h should be a constant and should always be <32. | |
func hash4(u uint32, h uint8) uint32 { | |
return (u * prime4bytes) >> ((32 - h) & 31) | |
} | |
// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits. | |
// Preferably h should be a constant and should always be <32. | |
func hash4x64(u uint64, h uint8) uint32 { | |
return (uint32(u) * prime4bytes) >> ((32 - h) & 31) | |
} | |
// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits. | |
// Preferably h should be a constant and should always be <64. | |
func hash5(u uint64, h uint8) uint32 { | |
return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63)) | |
} | |
// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. | |
// Preferably h should be a constant and should always be <64. | |
func hash6(u uint64, h uint8) uint32 { | |
return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63)) | |
} | |
// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. | |
// Preferably h should be a constant and should always be <64. | |
func hash7(u uint64, h uint8) uint32 { | |
return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63)) | |
} | |
// hash8 returns the hash of u to fit in a hash table with h bits. | |
// Preferably h should be a constant and should always be <64. | |
func hash8(u uint64, h uint8) uint32 { | |
return uint32((u * prime8bytes) >> ((64 - h) & 63)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment