klauspost · August 10, 2023 07:49
diff --git a/main.go b/main.go
 package main

 import (
 	"encoding/binary"
 	"flag"
 	"fmt"
 	"io/ioutil"
 	"log"
 	"os"
 	"path/filepath"
 	"sort"
 )

 // match is a ranking record: a hash together with the counters used to
 // order candidates.
 type match struct {
 	// hash is the value produced by hashLen for the matched bytes.
 	hash   uint32
 	// n is the occurrence count; higher counts rank first.
 	n      uint32
 	// offset is the accumulated byte offset of the counted occurrences;
 	// it is used as a tie-breaker when two records have the same n.
 	offset int64
 }

 // matchValue holds what is known about one selected hash: the bytes it
 // stands for and how often other selected hashes were seen next to it.
 type matchValue struct {
 	// value is a copy of the input bytes taken where the hash was first seen.
 	value       []byte
 	// followBy counts, per hash, how often that hash was found directly
 	// after this one.
 	followBy    map[uint32]uint32
 	// preceededBy counts, per hash, how often that hash was found directly
 	// before this one.
 	preceededBy map[uint32]uint32
 }

 // Command-line flags; they are read by main after flag.Parse.
 var (
 	// wantLenFlag (-len) is the size of the generated output in bytes.
 	wantLenFlag = flag.Int("len", 128<<10, "Specify custom output size")

 	// wantHashBytes (-hash) is the number of input bytes hashed per position.
 	wantHashBytes = flag.Int("hash", 4, "Hash bytes match length")

 	// wantMaxBytes (-max) limits how much of each input file is indexed.
 	wantMaxBytes = flag.Int("max", 16<<10, "Max input length to index")

 	// wantOutput (-o) is the name of the file the result is written to.
 	wantOutput = flag.String("o", "dict-out.txt", "Output name")
 )

 // main builds a dictionary from the files found below the directory given
 // as the first non-flag argument and writes it to the -o file.
 //
 // It works in three stages:
 //  1. Index: hash every position of every file (limited by -max) and count
 //     in how many files each hash shows up.
 //  2. Re-read: for the most frequent hashes, record the bytes they stand
 //     for and which other frequent hashes directly precede/follow them.
 //  3. Emit: chain frequent sequences together, most frequent first, trim
 //     to -len bytes and write the entries in reverse order, so the most
 //     frequent data ends up at the end of the output.
 //
 // Invalid flags, unusable input and output errors terminate the program
 // through log.Fatal.
 func main() {
 	flag.Parse()
 	base := flag.Arg(0)
 	if base == "" {
 		log.Fatal("no path with files specified")
 	}

 	wantLen := *wantLenFlag
 	if wantLen <= 0 {
 		// A negative length would be used as a slice bound further down.
 		log.Fatal("-len must be > 0")
 	}
 	hashBytes := *wantHashBytes
 	if hashBytes < 4 || hashBytes > 8 {
 		log.Fatal("-hash must be >= 4 and <= 8")
 	}
 	if *wantMaxBytes < 0 {
 		log.Fatal("-max must be >= 0")
 	}
 	// Every hashed position reads 8 bytes, so keep 8 bytes beyond the limit.
 	maxBytes := *wantMaxBytes + 8

 	// hashAt hashes the first hashBytes bytes of p; p must hold at least 8 bytes.
 	hashAt := func(p []byte) uint32 {
 		return hashLen(binary.LittleEndian.Uint64(p), 32, uint8(hashBytes))
 	}

 	// walkSamples hands the first maxBytes bytes of every file below base to
 	// fn. Unreadable entries are logged and skipped; files shorter than 8
 	// bytes are skipped silently. stage is appended to the file name in the
 	// progress output.
 	walkSamples := func(stage string, fn func(b []byte)) {
 		err := filepath.Walk(base, func(path string, info os.FileInfo, err error) error {
 			if err != nil {
 				// info may be nil when err is set, so it must not be used.
 				log.Print(err)
 				return nil
 			}
 			if info.IsDir() {
 				return nil
 			}
 			// Read the path supplied by Walk; joining base with the bare
 			// file name would miss every file inside a subdirectory.
 			b, err := ioutil.ReadFile(path)
 			if err != nil {
 				log.Print(err)
 				return nil
 			}
 			if len(b) < 8 {
 				return nil
 			}
 			if len(b) > maxBytes {
 				b = b[:maxBytes]
 			}
 			fn(b)
 			fmt.Print("\r"+info.Name(), stage)
 			return nil
 		})
 		if err != nil {
 			log.Fatal(err)
 		}
 	}

 	// Stage 1: index ALL hashes in all files.
 	matches := make(map[uint32]uint32)
 	offsets := make(map[uint32]int64)
 	var total uint64
 	found := make(map[uint32]struct{})
 	walkSamples(" Indexed...", func(b []byte) {
 		// found tracks the hashes of the current file only.
 		for k := range found {
 			delete(found, k)
 		}
 		for i := range b {
 			rem := b[i:]
 			if len(rem) < 8 {
 				break
 			}
 			h := hashAt(rem)
 			if _, ok := found[h]; ok {
 				// Only count first occurrence
 				continue
 			}
 			matches[h]++
 			offsets[h] += int64(i)
 			total++
 			found[h] = struct{}{}
 		}
 	})
 	if len(matches) == 0 {
 		// Without this guard the average below divides by zero.
 		log.Fatal("no input files of at least 8 bytes found in ", base)
 	}
 	threshold := uint32(total / uint64(len(matches)))
 	fmt.Println("total", total, "match", len(matches), "avg", threshold)

 	// Keep only hashes seen more often than average.
 	sorted := make([]match, 0, len(matches)/2)
 	for k, v := range matches {
 		if v <= threshold {
 			continue
 		}
 		sorted = append(sorted, match{hash: k, n: v, offset: offsets[k]})
 	}
 	if len(sorted) == 0 {
 		// Happens e.g. when every hash was seen in exactly one file.
 		log.Fatal("no hash occurs more often than average, cannot build a dictionary")
 	}
 	// Most frequent first, then lowest accumulated offset. The hash is the
 	// final tie-breaker so the order does not depend on map iteration.
 	sort.Slice(sorted, func(i, j int) bool {
 		a, b := sorted[i], sorted[j]
 		if a.n != b.n {
 			return a.n > b.n
 		}
 		if a.offset != b.offset {
 			return a.offset < b.offset
 		}
 		return a.hash < b.hash
 	})
 	fmt.Println("Sorted len:", len(sorted))
 	if len(sorted) > wantLen {
 		sorted = sorted[:wantLen]
 	}
 	fmt.Println("Cropped len:", len(sorted), "Lowest occurrence:", sorted[len(sorted)-1].n)

 	wantMatches := make(map[uint32]uint32, len(sorted))
 	for _, v := range sorted {
 		wantMatches[v.hash] = v.n
 	}

 	// Stage 2: re-read the input and collect value and neighbours of every
 	// selected hash.
 	output := make(map[uint32]matchValue, len(sorted))
 	walkSamples(" Re-read...", func(b []byte) {
 		for i := range b {
 			rem := b[i:]
 			if len(rem) < 8 {
 				break
 			}
 			h := hashAt(rem)
 			if _, ok := wantMatches[h]; !ok {
 				continue
 			}
 			mv := output[h]
 			if len(mv.value) == 0 {
 				mv.value = make([]byte, hashBytes)
 				copy(mv.value, rem)
 			}
 			if mv.followBy == nil {
 				mv.followBy = make(map[uint32]uint32, 4)
 				mv.preceededBy = make(map[uint32]uint32, 4)
 			}
 			if len(rem) > hashBytes+8 {
 				// Check if we should add next as well.
 				hNext := hashAt(rem[hashBytes:])
 				if _, ok := wantMatches[hNext]; ok {
 					mv.followBy[hNext]++
 				}
 			}
 			if i > hashBytes {
 				// Check if we should add prev as well. b[i-hashBytes:] is
 				// longer than rem, so it always holds 8 readable bytes.
 				hPrev := hashAt(b[i-hashBytes:])
 				if _, ok := wantMatches[hPrev]; ok {
 					mv.preceededBy[hPrev]++
 				}
 			}
 			output[h] = mv
 		}
 	})

 	// bestNeighbour returns the hash from counts that is still present in
 	// output and has the highest count of at least minCount. Equal counts
 	// are resolved in favour of the lower hash to keep the result
 	// independent of map iteration order.
 	bestNeighbour := func(counts map[uint32]uint32, minCount uint32) (uint32, bool) {
 		var bestHash, bestN uint32
 		ok := false
 		for k, v := range counts {
 			if v < minCount {
 				// Not significant enough.
 				continue
 			}
 			if _, live := output[k]; !live {
 				continue
 			}
 			if !ok || v > bestN || (v == bestN && k < bestHash) {
 				bestHash, bestN, ok = k, v, true
 			}
 		}
 		return bestHash, ok
 	}

 	// Stage 3: build the entries, most frequent first.
 	dst := make([][]byte, 0, wantLen/hashBytes)
 	for i, e := range sorted {
 		m, ok := output[e.hash]
 		if !ok {
 			// Already added
 			continue
 		}
 		tmp := make([]byte, 0, hashBytes*2)
 		// Start with the most common predecessor, if any.
 		if ph, ok := bestNeighbour(m.preceededBy, 0); ok {
 			tmp = append(tmp, output[ph].value...)
 		}
 		tmp = append(tmp, m.value...)
 		delete(output, e.hash)

 		// Keep appending the most common follower while it is frequent
 		// enough relative to this entry. Every round removes one element
 		// from output, so the loop terminates.
 		minFollow := e.n / uint32(hashBytes) / 2
 		for {
 			nh, ok := bestNeighbour(m.followBy, minFollow)
 			if !ok {
 				break
 			}
 			m = output[nh]
 			tmp = append(tmp, m.value...)
 			delete(output, nh)
 		}
 		if i < 100 {
 			fmt.Println("")
 			fmt.Printf("ENTRY %d: %q (%d occurrences)", i, string(tmp), e.n)
 		}
 		// Delete substrings already added.
 		if len(tmp) > 8 {
 			for j := range tmp[:len(tmp)-hashBytes] {
 				// Zero-padded copy so 8 bytes can be read near the end of tmp.
 				var t8 [8]byte
 				copy(t8[:], tmp[j:])
 				if i < 100 {
 					fmt.Println("")
 					fmt.Printf("DELETE %q", string(t8[:hashBytes]))
 				}
 				delete(output, hashAt(t8[:]))
 			}
 		}
 		dst = append(dst, tmp)
 	}

 	// Trim the entry list to wantLen bytes in total.
 	written := 0
 	for i, toWrite := range dst {
 		if len(toWrite)+written > wantLen {
 			toWrite = toWrite[:wantLen-written]
 		}
 		dst[i] = toWrite
 		written += len(toWrite)
 		if written >= wantLen {
 			dst = dst[:i+1]
 			break
 		}
 	}

 	o, err := os.Create(*wantOutput)
 	if err != nil {
 		log.Fatal(err)
 	}
 	// Write in reverse order.
 	for i := len(dst) - 1; i >= 0; i-- {
 		if _, err := o.Write(dst[i]); err != nil {
 			o.Close()
 			log.Fatal(err)
 		}
 	}
 	// Close explicitly: a failed close can mean the data never hit the disk.
 	if err := o.Close(); err != nil {
 		log.Fatal(err)
 	}
 }

 // Multipliers used by the hashN functions below; primeNbytes is the
 // constant that hashN multiplies its input with.
 const (
 	prime3bytes = 506832829
 	prime4bytes = 2654435761
 	prime5bytes = 889523592379
 	prime6bytes = 227718039650203
 	prime7bytes = 58295818150454627
 	prime8bytes = 0xcf1bbcdcb7a56463
 )

 // hashLen hashes the lowest mls bytes of u down to hashLog bits.
 //
 // For mls 5, 6, 7 and 8 the work is delegated to hash5, hash6, hash7 and
 // hash8. Every other mls value - including 4 - is passed straight
 // through: the low 32 bits of u are returned unchanged and hashLog is
 // ignored.
 func hashLen(u uint64, hashLog, mls uint8) uint32 {
 	if mls == 5 {
 		return hash5(u, hashLog)
 	}
 	if mls == 6 {
 		return hash6(u, hashLog)
 	}
 	if mls == 7 {
 		return hash7(u, hashLog)
 	}
 	if mls == 8 {
 		return hash8(u, hashLog)
 	}
 	return uint32(u)
 }

 // hash3 hashes the lowest 3 bytes of u into an h-bit value.
 // The top byte of u is shifted out before the multiplication, so it has
 // no influence on the result. The final shift is (32-h)&31, which means
 // h=32 (like h=0) keeps the complete 32-bit product.
 func hash3(u uint32, h uint8) uint32 {
 	const usedBits = 24
 	drop := (32 - h) & 31
 	product := (u << (32 - usedBits)) * prime3bytes
 	return product >> drop
 }

 // hash4 hashes all 4 bytes of u into an h-bit value.
 // The final shift is (32-h)&31, which means h=32 (like h=0) keeps the
 // complete 32-bit product.
 func hash4(u uint32, h uint8) uint32 {
 	drop := (32 - h) & 31
 	product := u * prime4bytes
 	return product >> drop
 }

 // hash4x64 hashes the lowest 4 bytes of u into an h-bit value; the upper
 // 4 bytes of u are discarded first. The final shift is (32-h)&31, which
 // means h=32 (like h=0) keeps the complete 32-bit product.
 func hash4x64(u uint64, h uint8) uint32 {
 	low := uint32(u)
 	drop := (32 - h) & 31
 	return (low * prime4bytes) >> drop
 }

 // hash5 hashes the lowest 5 bytes of u into an h-bit value.
 // The upper 3 bytes of u are shifted out before the multiplication. The
 // product is shifted right by (64-h)&63 and then truncated to 32 bits.
 func hash5(u uint64, h uint8) uint32 {
 	const usedBits = 40
 	drop := (64 - h) & 63
 	product := (u << (64 - usedBits)) * prime5bytes
 	return uint32(product >> drop)
 }

 // hash6 hashes the lowest 6 bytes of u into an h-bit value.
 // The upper 2 bytes of u are shifted out before the multiplication. The
 // product is shifted right by (64-h)&63 and then truncated to 32 bits.
 func hash6(u uint64, h uint8) uint32 {
 	const usedBits = 48
 	drop := (64 - h) & 63
 	product := (u << (64 - usedBits)) * prime6bytes
 	return uint32(product >> drop)
 }

 // hash7 hashes the lowest 7 bytes of u into an h-bit value.
 // The top byte of u is shifted out before the multiplication. The
 // product is shifted right by (64-h)&63 and then truncated to 32 bits.
 func hash7(u uint64, h uint8) uint32 {
 	const usedBits = 56
 	drop := (64 - h) & 63
 	product := (u << (64 - usedBits)) * prime7bytes
 	return uint32(product >> drop)
 }

 // hash8 hashes all 8 bytes of u into an h-bit value.
 // The product is shifted right by (64-h)&63 and then truncated to 32 bits.
 func hash8(u uint64, h uint8) uint32 {
 	drop := (64 - h) & 63
 	product := u * prime8bytes
 	return uint32(product >> drop)
 }
	package main

	import (
	"encoding/binary"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
	"sort"
	)

	// match is a ranking record: a hash together with the counters used to
	// order candidates.
	type match struct {
	// hash is the value produced by hashLen for the matched bytes.
	hash uint32
	// n is the occurrence count; higher counts rank first.
	n uint32
	// offset is the accumulated byte offset of the counted occurrences;
	// it is used as a tie-breaker when two records have the same n.
	offset int64
	}

	// matchValue holds what is known about one selected hash: the bytes it
	// stands for and how often other selected hashes were seen next to it.
	type matchValue struct {
	// value is a copy of the input bytes taken where the hash was first seen.
	value []byte
	// followBy counts, per hash, how often that hash was found directly
	// after this one.
	followBy map[uint32]uint32
	// preceededBy counts, per hash, how often that hash was found directly
	// before this one.
	preceededBy map[uint32]uint32
	}

	// Command-line flags; they are read by main after flag.Parse.
	var (
		// wantLenFlag (-len) is the size of the generated output in bytes.
		wantLenFlag = flag.Int("len", 128<<10, "Specify custom output size")

		// wantHashBytes (-hash) is the number of input bytes hashed per position.
		wantHashBytes = flag.Int("hash", 4, "Hash bytes match length")

		// wantMaxBytes (-max) limits how much of each input file is indexed.
		wantMaxBytes = flag.Int("max", 16<<10, "Max input length to index")

		// wantOutput (-o) is the name of the file the result is written to.
		wantOutput = flag.String("o", "dict-out.txt", "Output name")
	)

	func main() {

	flag.Parse()
	matches := make(map[uint32]uint32)
	offsets := make(map[uint32]int64)
	var total uint64
	base := flag.Arg(0)
	if base == "" {
	log.Fatal("no path with files specified")
	}

	wantLen := *wantLenFlag
	hashBytes := *wantHashBytes
	if hashBytes < 4 \|\| hashBytes > 8 {
	log.Fatal("-bytes must be >= 4 and <= 8")
	}
	maxBytes := *wantMaxBytes + 8
	found := make(map[uint32]struct{})
	// Index ALL hashes in all files.
	filepath.Walk(base, func(path string, info os.FileInfo, err error) error {
	if info.IsDir() {
	return nil
	}

	b, err := ioutil.ReadFile(filepath.Join(base, info.Name()))
	if err != nil {
	log.Print(err)
	return nil
	}
	if len(b) < 8 {
	return nil
	}
	if len(b) > maxBytes {
	b = b[:maxBytes]
	}
	for k := range found {
	delete(found, k)
	}
	for i := range b {
	rem := b[i:]
	if len(rem) < 8 {
	break
	}
	h := hashLen(binary.LittleEndian.Uint64(rem), 32, uint8(hashBytes))
	if _, ok := found[h]; ok {
	// Only count first occurrence
	continue
	}
	matches[h]++
	offsets[h] += int64(i)
	total++
	found[h] = struct{}{}
	}
	fmt.Print("\r"+info.Name(), " Indexed...")
	return nil
	})
	threshold := uint32(total / uint64(len(matches)))
	fmt.Println("total", total, "match", len(matches), "avg", threshold)
	sorted := make([]match, 0, len(matches)/2)
	for k, v := range matches {
	if v <= threshold {
	continue
	}
	sorted = append(sorted, match{hash: k, n: v, offset: offsets[k]})
	}
	sort.Slice(sorted, func(i, j int) bool {
	if sorted[i].n == sorted[j].n {
	return sorted[i].offset < sorted[j].offset
	}
	return sorted[i].n > sorted[j].n
	})
	fmt.Println("Sorted len:", len(sorted))
	if len(sorted) > wantLen {
	sorted = sorted[:wantLen]
	}
	fmt.Println("Cropped len:", len(sorted), "Lowest occurrence:", sorted[len(sorted)-1].n)

	wantMatches := make(map[uint32]uint32, len(sorted))
	for _, v := range sorted {
	wantMatches[v.hash] = v.n
	}

	output := make(map[uint32]matchValue, len(sorted))
	filepath.Walk(base, func(path string, info os.FileInfo, err error) error {
	if info.IsDir() {
	return nil
	}
	b, err := ioutil.ReadFile(filepath.Join(base, info.Name()))
	if err != nil {
	log.Print(err)
	return nil
	}

	if len(b) < 8 {
	return nil
	}
	if len(b) > maxBytes {
	b = b[:maxBytes]
	}

	for i := range b {
	rem := b[i:]
	if len(rem) < 8 {
	break
	}
	var prev []byte
	if i > hashBytes {
	prev = b[i-hashBytes:]
	}

	h := hashLen(binary.LittleEndian.Uint64(rem), 32, uint8(hashBytes))
	if _, ok := wantMatches[h]; !ok {
	continue
	}
	mv := output[h]
	if len(mv.value) == 0 {
	var tmp = make([]byte, hashBytes)
	copy(tmp[:], rem)
	mv.value = tmp[:]
	}
	if mv.followBy == nil {
	mv.followBy = make(map[uint32]uint32, 4)
	mv.preceededBy = make(map[uint32]uint32, 4)
	}
	if len(rem) > hashBytes+8 {
	// Check if we should add next as well.
	hNext := hashLen(binary.LittleEndian.Uint64(rem[hashBytes:]), 32, uint8(hashBytes))
	if _, ok := wantMatches[hNext]; ok {
	mv.followBy[hNext]++
	}
	}
	if len(prev) >= 8 {
	// Check if we should prev next as well.
	hPrev := hashLen(binary.LittleEndian.Uint64(prev), 32, uint8(hashBytes))
	if _, ok := wantMatches[hPrev]; ok {
	mv.preceededBy[hPrev]++
	}
	}
	output[h] = mv
	}
	fmt.Print("\r"+info.Name(), " Re-read...")
	return nil
	})

	dst := make([][]byte, 0, wantLen/hashBytes)
	for i, e := range sorted {
	m, ok := output[e.hash]
	if !ok {
	// Already added
	continue
	}
	var tmp = make([]byte, 0, hashBytes*2)
	{
	sortedPrev := make([]match, 0, len(m.followBy))
	for k, v := range m.preceededBy {
	if _, ok := output[k]; !ok {
	continue
	}
	sortedPrev = append(sortedPrev, match{
	hash: k,
	n: v,
	})
	}
	if len(sortedPrev) > 0 {
	sort.Slice(sortedPrev, func(i, j int) bool {
	return sortedPrev[i].n > sortedPrev[j].n
	})
	bestPrev := output[sortedPrev[0].hash]
	tmp = append(tmp, bestPrev.value...)
	}
	}
	tmp = append(tmp, m.value...)
	delete(output, e.hash)
	wantLen := e.n / uint32(hashBytes) / 2
	for {
	sortedFollow := make([]match, 0, len(m.followBy))
	for k, v := range m.followBy {
	if _, ok := output[k]; !ok {
	continue
	}
	if v < wantLen {
	// Not significant enough.
	continue
	}
	sortedFollow = append(sortedFollow, match{
	hash: k,
	n: v,
	})
	}
	if len(sortedFollow) == 0 {
	break
	}
	sort.Slice(sortedFollow, func(i, j int) bool {
	return sortedFollow[i].n > sortedFollow[j].n
	})
	nh := sortedFollow[0].hash
	m, ok = output[nh]
	if !ok {
	break
	}
	tmp = append(tmp, m.value...)
	delete(output, nh)
	}
	if i < 100 {
	fmt.Println("")
	fmt.Printf("ENTRY %d: %q (%d occurrences)", i, string(tmp), e.n)
	}
	// Delete substrings already added.
	if len(tmp) > 8 {
	for j := range tmp[:len(tmp)-hashBytes] {
	var t8 [8]byte
	copy(t8[:], tmp[j:])
	if i < 100 {
	fmt.Println("")
	fmt.Printf("DELETE %q", string(t8[:hashBytes]))
	}
	delete(output, hashLen(binary.LittleEndian.Uint64(t8[:]), 32, uint8(hashBytes)))
	}
	}
	dst = append(dst, tmp)
	}
	o, err := os.Create(*wantOutput)
	if err != nil {
	panic(err)
	}
	defer o.Close()
	written := 0
	for i, toWrite := range dst {
	if len(toWrite)+written > wantLen {
	toWrite = toWrite[:wantLen-written]
	}
	dst[i] = toWrite
	written += len(toWrite)
	if written >= wantLen {
	dst = dst[:i+1]
	break
	}
	}
	// Write in reverse order.
	for i := range dst {
	toWrite := dst[len(dst)-i-1]
	o.Write(toWrite)
	}
	}

	// Multipliers used by the hashN functions below; primeNbytes is the
	// constant that hashN multiplies its input with.
	const (
	prime3bytes = 506832829
	prime4bytes = 2654435761
	prime5bytes = 889523592379
	prime6bytes = 227718039650203
	prime7bytes = 58295818150454627
	prime8bytes = 0xcf1bbcdcb7a56463
	)

	// hashLen hashes the lowest mls bytes of u down to hashLog bits.
	//
	// For mls 5, 6, 7 and 8 the work is delegated to hash5, hash6, hash7 and
	// hash8. Every other mls value - including 4 - is passed straight
	// through: the low 32 bits of u are returned unchanged and hashLog is
	// ignored.
	func hashLen(u uint64, hashLog, mls uint8) uint32 {
		if mls == 5 {
			return hash5(u, hashLog)
		}
		if mls == 6 {
			return hash6(u, hashLog)
		}
		if mls == 7 {
			return hash7(u, hashLog)
		}
		if mls == 8 {
			return hash8(u, hashLog)
		}
		return uint32(u)
	}

	// hash3 hashes the lowest 3 bytes of u into an h-bit value.
	// The top byte of u is shifted out before the multiplication, so it has
	// no influence on the result. The final shift is (32-h)&31, which means
	// h=32 (like h=0) keeps the complete 32-bit product.
	func hash3(u uint32, h uint8) uint32 {
		const usedBits = 24
		drop := (32 - h) & 31
		product := (u << (32 - usedBits)) * prime3bytes
		return product >> drop
	}

	// hash4 hashes all 4 bytes of u into an h-bit value.
	// The final shift is (32-h)&31, which means h=32 (like h=0) keeps the
	// complete 32-bit product.
	func hash4(u uint32, h uint8) uint32 {
		drop := (32 - h) & 31
		product := u * prime4bytes
		return product >> drop
	}

	// hash4x64 hashes the lowest 4 bytes of u into an h-bit value; the upper
	// 4 bytes of u are discarded first. The final shift is (32-h)&31, which
	// means h=32 (like h=0) keeps the complete 32-bit product.
	func hash4x64(u uint64, h uint8) uint32 {
		low := uint32(u)
		drop := (32 - h) & 31
		return (low * prime4bytes) >> drop
	}

	// hash5 hashes the lowest 5 bytes of u into an h-bit value.
	// The upper 3 bytes of u are shifted out before the multiplication. The
	// product is shifted right by (64-h)&63 and then truncated to 32 bits.
	func hash5(u uint64, h uint8) uint32 {
		const usedBits = 40
		drop := (64 - h) & 63
		product := (u << (64 - usedBits)) * prime5bytes
		return uint32(product >> drop)
	}

	// hash6 hashes the lowest 6 bytes of u into an h-bit value.
	// The upper 2 bytes of u are shifted out before the multiplication. The
	// product is shifted right by (64-h)&63 and then truncated to 32 bits.
	func hash6(u uint64, h uint8) uint32 {
		const usedBits = 48
		drop := (64 - h) & 63
		product := (u << (64 - usedBits)) * prime6bytes
		return uint32(product >> drop)
	}

	// hash7 hashes the lowest 7 bytes of u into an h-bit value.
	// The top byte of u is shifted out before the multiplication. The
	// product is shifted right by (64-h)&63 and then truncated to 32 bits.
	func hash7(u uint64, h uint8) uint32 {
		const usedBits = 56
		drop := (64 - h) & 63
		product := (u << (64 - usedBits)) * prime7bytes
		return uint32(product >> drop)
	}

	// hash8 hashes all 8 bytes of u into an h-bit value.
	// The product is shifted right by (64-h)&63 and then truncated to 32 bits.
	func hash8(u uint64, h uint8) uint32 {
		drop := (64 - h) & 63
		product := u * prime8bytes
		return uint32(product >> drop)
	}
No results found