@creshal
Created August 1, 2025 07:52
Recover corrupted xz files with missing end-of-stream markers
#! /usr/bin/env python3
# Work around unxz throwing "Unexpected end of input" errors and refusing to decompress anything
# This can't recover everything, but it'll try, byte by byte if necessary
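# Usage sketch (assuming the script is saved as recover_xz.py and marked executable;
# the file names below are placeholders):
#   ./recover_xz.py backup1.xz backup2.xz
# Recovered data is written next to each input, with ".xz" stripped from the name
# (or ".raw" appended when the input has no ".xz" suffix).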
import lzma
import pathlib
import sys
# Adjust this to your workload: seeks are more expensive than small buffer copies,
# but setting this too high does not help performance either.
INITIAL_BLOCK_SIZE = 20
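# The same constant also bounds the retry loop below: each failed read halves the
# block size, and once the consecutive error count exceeds INITIAL_BLOCK_SIZE
# (or the block size drops below one byte) the script gives up on that file.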
for infile in sys.argv[1:]:
    outfile = pathlib.Path(infile[:-3] if infile.endswith(".xz") else infile + ".raw")
    print(f"Decompressing as much as possible of {infile} to {outfile}")
    if outfile.exists():
        print(f"Skipping {infile}, decompressed file {outfile} already exists")
        continue
    out_fd = outfile.open("wb")
    # Record the last successful read position, so we can retry from it with smaller block sizes when errors occur
    last_seek = 0
    # Optimistically start at 1 MiB reads
    block_size = 1 << INITIAL_BLOCK_SIZE
    errors = 0
    with lzma.open(infile) as in_fd:
        while True:
            try:
                data = in_fd.read(block_size)
                if not data:
                    # Clean end of stream: nothing left to recover
                    break
                out_fd.write(data)
                last_seek += len(data)
                errors = 0
                print("\tRead", len(data), "bytes, reached position", last_seek)
            except EOFError as e:
                errors += 1
                block_size = block_size // 2
                in_fd.seek(last_seek)
                if errors > INITIAL_BLOCK_SIZE or block_size < 1:
                    print("\tError count exceeded or minimum block size reached, giving up")
                    break
                print(f"\tIgnoring exception {type(e)} ({e}) and re-trying from {last_seek} with block size {block_size}")
    out_fd.close()
    print(f"{infile}: {last_seek} bytes written to {outfile}")