|
#!/usr/bin/env python3 |
|
import os |
|
import os.path |
|
import sys |
|
import json |
|
import math |
|
|
|
# Phase 0: set up the fingerprint tables used to locate unpacked files
# inside the packed image. A "fingerprint" is the first 1 KiB of a file.
print("Collecting files...")

files = {}            # fingerprint bytes -> relative path, or '__multiple__' sentinel
files_multiple = {}   # fingerprint bytes -> set of relative paths sharing it
path = sys.argv[1]
fpath = f"{path}.unpack"
fpath_len = len(fpath) + 1   # length of "<fpath>/" prefix to strip from walked paths
empty_key = b"\x00" * 1024   # an all-zero block is too common to be a usable fingerprint
|
|
|
# Walk the unpacked tree and fingerprint every file by its first 1 KiB block.
for root, _, subpaths in os.walk(fpath, topdown=False):
    for file in subpaths:
        full_path = os.path.join(root, file)
        fkey = full_path[fpath_len:]  # path relative to the unpack root
        with open(full_path, 'rb') as fh:
            fdata = fh.read(1024)
        # Files shorter than one block, or starting with all zeros, make
        # useless fingerprints — skip them.
        if len(fdata) < 1024:
            continue
        if fdata == empty_key:
            continue
        if fdata in files:
            if files[fdata] == '__multiple__':
                files_multiple[fdata].add(fkey)
            else:
                # Promote the entry to a multi-match set. BUG FIX: the
                # original code assigned `existing` but then built the set
                # as {fkey} only, silently dropping the first file that
                # carried this fingerprint.
                existing = files[fdata]
                files_multiple[fdata] = {existing, fkey}
                files[fdata] = '__multiple__'
        else:
            files[fdata] = fkey
|
|
|
# State for the block-by-block scan over the packed image.
i = 0                      # current block number within the image
index = []                 # (block, relative path) candidate matches
matching_index = []        # candidates that survived byte-level verification
prevent_messages = set()   # paths already reported once, to keep the log quiet
|
|
|
def copy_blocks(source, dest, count, block_size=1024):
    """Copy *count* fixed-size blocks from *source* to *dest*.

    Reads sequentially from the current position of *source*. A short
    final read (end of file) is written through unchanged, so the last
    block copied may be partial.

    The block size was hard-coded to 1024 in the original; it is now a
    keyword parameter with the same default, keeping callers unchanged.
    """
    for _ in range(count):
        dest.write(source.read(block_size))
|
|
|
with open(path, 'rb') as fh:
    # Pass 1: scan the packed image one 1 KiB block at a time and record
    # every block whose bytes match the fingerprint of some unpacked file.
    print("Building index...")
    while True:
        fdata = fh.read(1024)
        if len(fdata) < 1024:
            break
        data_match = files.get(fdata)
        if data_match:
            if data_match == '__multiple__':
                # Several files share this fingerprint; record them all and
                # let verification decide which (if any) really matches.
                for match in files_multiple[fdata]:
                    index.append((i, match))
                    if match not in prevent_messages:
                        prevent_messages.add(match)
                        print(f"Found possible match at {i} for {match} (multi)")
            else:
                index.append((i, data_match))
                if data_match not in prevent_messages:
                    prevent_messages.add(data_match)
                    print(f"Found possible match at {i} for {data_match}")
        i += 1
    del files  # fingerprint table no longer needed; release the memory

    # Pass 2: verify each candidate by comparing the whole unpacked file
    # against the image bytes at the candidate offset.
    print("Verifying...")
    for pos, file in index:
        fh.seek(pos*1024)
        with open(os.path.join(fpath, file), 'rb') as match:
            match_ok = True
            while True:
                fdata1 = fh.read(1024)
                fdata2 = match.read(1024)
                if len(fdata2) < 1024:
                    if len(fdata2) == 0:
                        break
                    # Final partial block of the file: the rest of the image
                    # block must be zero padding, otherwise it holds other data
                    # and removing the whole block would lose it.
                    leftovers = fdata1[len(fdata2):]
                    for c in leftovers:
                        if c != 0:
                            print(f"Found junk data at {pos} for {file} (ignoring)")
                            match_ok = False
                            break
                    fdata1 = fdata1[:len(fdata2)]
                if fdata1 != fdata2:
                    match_ok = False
                    break
            if match_ok:
                print(f"Verified match at {pos} for {file}")
                matching_index.append((pos, file))
    del index

    # Pass 3: drop entries overlapping an earlier accepted match and total
    # up how many blocks the surviving matches cover.
    print("Verifying index...")
    good_index = []
    index_with_sizes = []
    blocks = 0            # first block not yet claimed by an accepted match
    blocks_saved = 0
    total_blocks = math.ceil(os.path.getsize(path)/1024)
    for pos, file in matching_index:
        if blocks > pos:
            print(f"Ignoring entry {file} due to overlap.")
            continue
        skip = math.ceil(os.path.getsize(os.path.join(fpath, file))/1024)
        index_with_sizes.append((pos, file, skip))
        good_index.append((pos, file))
        blocks = pos + skip
        blocks_saved += skip
    # BUG FIX: guard the percentage against an empty image, which previously
    # raised ZeroDivisionError before the empty-index check below could run.
    savings = math.floor(blocks_saved/total_blocks*100) if total_blocks else 0
    del matching_index
    print(f"Will save {savings} percent of filesize!")

    if len(good_index) == 0:
        print("FATAL: Index is empty!")
        sys.exit(1)

    # Persist the (block, path) index so the image can be reconstructed from
    # the stripped binary plus the unpacked files.
    print("Writing index...")
    with open(f"{path}.idx", 'w') as idx:
        json.dump(good_index, idx)

    # Write the stripped image: copy every region except the verified file
    # spans, seeking past each matched file.
    print("Writing binary...")
    with open(f"{path}.bin", 'wb') as out:  # renamed from `bin`: shadowed the builtin
        blocks = 0
        fh.seek(0)
        for pos, file, skip in index_with_sizes:
            head_copy = pos - blocks
            copy_blocks(fh, out, head_copy)
            blocks = pos + skip
            fh.seek(blocks*1024)
        tail_copy = total_blocks - blocks
        copy_blocks(fh, out, tail_copy)