Created
August 1, 2025 07:52
-
-
Save creshal/1cfb8d7342956068116c87ec2ef9b888 to your computer and use it in GitHub Desktop.
Recover corrupted xz files with missing end-of-stream markers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3
"""Recover corrupted xz files with missing end-of-stream markers.

Works around unxz throwing "Unexpected end of input" errors and refusing to
decompress anything. This can't recover everything, but it'll try, byte by
byte if necessary.

Usage: pass one or more file names on the command line. ``name.xz`` is
written to ``name``; any other ``name`` is written to ``name.raw``. Existing
output files are never overwritten.
"""
import lzma
import pathlib
import sys

# log2 of the initial read size: reads start at 1 << INITIAL_BLOCK_SIZE bytes
# (1 MiB) and are halved after every failed read. The same number caps how
# many consecutive failed reads are tolerated.
# Adjust this to your workload: seeks are more expensive than small buffer
# copies, so setting this too high does not help performance.
INITIAL_BLOCK_SIZE = 20


def output_path_for(infile):
    """Return the output path for *infile*.

    A trailing ".xz" is stripped; any other name gets ".raw" appended.
    """
    if infile.endswith(".xz"):
        return pathlib.Path(infile[:-3])
    return pathlib.Path(infile + ".raw")


def recover(infile, outfile):
    """Decompress as much of *infile* as possible into *outfile*.

    Data is read in blocks. When a read fails with EOFError (the compressed
    data ended before the end-of-stream marker), the reader is moved back to
    the last position that was read successfully and the read is retried with
    half the block size, down to single bytes.

    Args:
        infile: Path of the (possibly truncated) xz file.
        outfile: Path of the file to create; it is overwritten if it exists.

    Returns:
        The number of decompressed bytes written to *outfile*.

    Raises:
        OSError: If either file cannot be opened.
        lzma.LZMAError: If the input is not valid xz data; only truncation
            (EOFError) is worked around here.
    """
    # Position after the last successful read, so we can retry from it with
    # smaller block sizes when errors occur.
    last_seek = 0
    # Optimistically start at 1 MiB reads.
    block_size = 1 << INITIAL_BLOCK_SIZE
    errors = 0
    # Open the input first so a missing input does not leave an empty output
    # file behind; the with statement closes both files on every exit path.
    with lzma.open(infile) as in_fd, pathlib.Path(outfile).open("wb") as out_fd:
        while True:
            try:
                chunk = in_fd.read(block_size)
            except EOFError as e:
                errors += 1
                block_size //= 2
                if errors > INITIAL_BLOCK_SIZE or block_size < 1:
                    print("\tError count exceeded or minimum block size reached, giving up")
                    break
                # The failed read may have consumed data that was then thrown
                # away, so go back to the last known-good position.
                in_fd.seek(last_seek)
                print(f"\tIgnoring exception {type(e)} ({e}) and re-trying from {last_seek} with block size {block_size}")
                continue
            if not chunk:
                # Clean end of stream: the file (or what is left of it) was
                # intact, so there is nothing more to read.
                break
            out_fd.write(chunk)
            # Count what was actually read; the final read may be short.
            last_seek += len(chunk)
            errors = 0
            print("\tRead", len(chunk), "bytes, reached position", last_seek)
    return last_seek


def main(argv):
    """Recover every file named in *argv*, skipping existing outputs."""
    for infile in argv:
        outfile = output_path_for(infile)
        print(f"Decompressing as much as possible of {infile} to {outfile}")
        if outfile.exists():
            print(f"Skipping {infile}, decompress file {outfile} already exists")
            continue
        written = recover(infile, outfile)
        print(f"{infile}: {written} bytes written to {outfile}")


if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment