|
#!/usr/bin/env python3 |
|
import os |
|
import os.path |
|
import sys |
|
import json |
|
import math |
|
|
|
# Phase 0: set up the fingerprint tables used to locate unpacked files
# inside the packed image. A "fingerprint" is the first 1 KiB of a file.
print("Collecting files...")

files = {}            # fingerprint bytes -> relative path, or '__multiple__' sentinel
files_multiple = {}   # fingerprint bytes -> set of relative paths sharing it
path = sys.argv[1]
fpath = f"{path}.unpack"
fpath_len = len(fpath) + 1   # length of "<fpath>/" prefix to strip from walked paths
empty_key = b"\x00" * 1024   # an all-zero block is too common to be a usable fingerprint
|
|
|
# Walk the unpacked tree and fingerprint every file by its first 1 KiB block.
for root, _, subpaths in os.walk(fpath, topdown=False):
    for file in subpaths:
        full_path = os.path.join(root, file)
        fkey = full_path[fpath_len:]  # path relative to the unpack root
        with open(full_path, 'rb') as fh:
            fdata = fh.read(1024)
        # Files shorter than one block, or starting with all zeros, make
        # useless fingerprints — skip them.
        if len(fdata) < 1024:
            continue
        if fdata == empty_key:
            continue
        if fdata in files:
            if files[fdata] == '__multiple__':
                files_multiple[fdata].add(fkey)
            else:
                # Promote the entry to a multi-match set. BUG FIX: the
                # original code assigned `existing` but then built the set
                # as {fkey} only, silently dropping the first file that
                # carried this fingerprint.
                existing = files[fdata]
                files_multiple[fdata] = {existing, fkey}
                files[fdata] = '__multiple__'
        else:
            files[fdata] = fkey
|
|
|
# State for the block-by-block scan over the packed image.
i = 0                      # current block number within the image
index = []                 # (block, relative path) candidate matches
matching_index = []        # candidates that survived byte-level verification
prevent_messages = set()   # paths already reported once, to keep the log quiet
|
|
|
def copy_blocks(source, dest, count, block_size=1024):
    """Copy *count* fixed-size blocks from *source* to *dest*.

    Reads sequentially from the current position of *source*. A short
    final read (end of file) is written through unchanged, so the last
    block copied may be partial.

    The block size was hard-coded to 1024 in the original; it is now a
    keyword parameter with the same default, keeping callers unchanged.
    """
    for _ in range(count):
        dest.write(source.read(block_size))
|
|
|
with open(path, 'rb') as fh:
    # Pass 1: scan the packed image one 1 KiB block at a time and record
    # every block whose bytes match the fingerprint of some unpacked file.
    print("Building index...")
    while True:
        fdata = fh.read(1024)
        if len(fdata) < 1024:
            break
        data_match = files.get(fdata)
        if data_match:
            if data_match == '__multiple__':
                # Several files share this fingerprint; record them all and
                # let verification decide which (if any) really matches.
                for match in files_multiple[fdata]:
                    index.append((i, match))
                    if match not in prevent_messages:
                        prevent_messages.add(match)
                        print(f"Found possible match at {i} for {match} (multi)")
            else:
                index.append((i, data_match))
                if data_match not in prevent_messages:
                    prevent_messages.add(data_match)
                    print(f"Found possible match at {i} for {data_match}")
        i += 1
    del files  # fingerprint table no longer needed; release the memory

    # Pass 2: verify each candidate by comparing the whole unpacked file
    # against the image bytes at the candidate offset.
    print("Verifying...")
    for pos, file in index:
        fh.seek(pos*1024)
        with open(os.path.join(fpath, file), 'rb') as match:
            match_ok = True
            while True:
                fdata1 = fh.read(1024)
                fdata2 = match.read(1024)
                if len(fdata2) < 1024:
                    if len(fdata2) == 0:
                        break
                    # Final partial block of the file: the rest of the image
                    # block must be zero padding, otherwise it holds other data
                    # and removing the whole block would lose it.
                    leftovers = fdata1[len(fdata2):]
                    for c in leftovers:
                        if c != 0:
                            print(f"Found junk data at {pos} for {file} (ignoring)")
                            match_ok = False
                            break
                    fdata1 = fdata1[:len(fdata2)]
                if fdata1 != fdata2:
                    match_ok = False
                    break
            if match_ok:
                print(f"Verified match at {pos} for {file}")
                matching_index.append((pos, file))
    del index

    # Pass 3: drop entries overlapping an earlier accepted match and total
    # up how many blocks the surviving matches cover.
    print("Verifying index...")
    good_index = []
    index_with_sizes = []
    blocks = 0            # first block not yet claimed by an accepted match
    blocks_saved = 0
    total_blocks = math.ceil(os.path.getsize(path)/1024)
    for pos, file in matching_index:
        if blocks > pos:
            print(f"Ignoring entry {file} due to overlap.")
            continue
        skip = math.ceil(os.path.getsize(os.path.join(fpath, file))/1024)
        index_with_sizes.append((pos, file, skip))
        good_index.append((pos, file))
        blocks = pos + skip
        blocks_saved += skip
    # BUG FIX: guard the percentage against an empty image, which previously
    # raised ZeroDivisionError before the empty-index check below could run.
    savings = math.floor(blocks_saved/total_blocks*100) if total_blocks else 0
    del matching_index
    print(f"Will save {savings} percent of filesize!")

    if len(good_index) == 0:
        print("FATAL: Index is empty!")
        sys.exit(1)

    # Persist the (block, path) index so the image can be reconstructed from
    # the stripped binary plus the unpacked files.
    print("Writing index...")
    with open(f"{path}.idx", 'w') as idx:
        json.dump(good_index, idx)

    # Write the stripped image: copy every region except the verified file
    # spans, seeking past each matched file.
    print("Writing binary...")
    with open(f"{path}.bin", 'wb') as out:  # renamed from `bin`: shadowed the builtin
        blocks = 0
        fh.seek(0)
        for pos, file, skip in index_with_sizes:
            head_copy = pos - blocks
            copy_blocks(fh, out, head_copy)
            blocks = pos + skip
            fh.seek(blocks*1024)
        tail_copy = total_blocks - blocks
        copy_blocks(fh, out, tail_copy)