A gist describing a simple way to recover lost files from the contents of your disk with standard unix tools, when all else fails.

NOTE: The script and some information in this tutorial may be inaccurate; I'm under the impression my files were found because they were cached by vscode. Nonetheless, parts of the tutorial will work regardless, specifically if you know some of the contents of the file (strings, grep, dd).

Recovering important lost files from disk with unix tools

This is a last resort after trying testdisk's un-delete feature, which I find works quite well. In this situation, however, testdisk was unable to identify the files in the directory I was looking for. It helps to know a unique string in the file, but that is not strictly necessary.

If the file is really important, stop writing to that disk. If it's only kind of important, then depending on how full your disk is it may not matter if you're super lazy (I recovered this file a few days after the fact while still using my disk as normal, but you should not do this). You should also probably at least work from a different drive during the process (saving these dumps and such to a different drive).

Step 1: Locating an identifiable string in your file dump or disk

strings -t d your_device_or_raw_dump | grep -i "your_hopefully_rare_identifiable_string_or_file_uri" > saved_strings

It will take a while for strings to run, as it finds every printable string on the device and outputs it along with its decimal byte offset (-t d), line by line, something like:

12345 your_contiguous_string
...

Meaning you can then just grep this output to find your string (-i for case-insensitive). Your saved strings will be a subset of the strings output. The offset is what's important: it is the exact number of bytes from the start of your device or dump.

Dealing with file names / directory names

Note that if you remember only the filename or directory, that might work too in certain cases. Specifically, I was able to find files I was working on with vscode on ext4, where the file path was stored right before the file contents (probably thanks to vscode's cache, per the note above). Here is an example from when I was recovering with file:///home/u8sand/Programs/work/covid19/

13562414972             "file:///home/u8sand/Programs/work/covid19/2020-05-20",
13562416593             "file:///home/u8sand/Programs/work/covid19/2020-05-06",
13562421577             "file:///home/u8sand/Programs/work/covid19/2020-04-23/figure_4.py", # <- file i was looking for
13562421657             "file:///home/u8sand/Programs/work/covid19/2020-05-21/drugs.json",
13562421736             "file:///home/u8sand/Programs/work/covid19/2020-04-23/figure_5_2.png",
13562421819             "file:///home/u8sand/Programs/work/covid19/2020-04-23/data.tsv",
13562424093             "file:///home/u8sand/Programs/work/covid19/2020-05-06/03_signature_unification.py",
13562426081             "file:///home/u8sand/Programs/work/covid19/figure_5.py", # <- file i was looking for
13563069911             "file:///home/u8sand/Programs/work/covid19/2020-05-22",

In these strings dumps you can subsequently filter by the filename followed by a "{" (or just search interactively with less).
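
For example, a quick filter over the saved_strings file from step 1 (this assumes, as in my case, that the path is followed by a space and a "{"):

grep 'figure_[45]\.py {' saved_strings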

75631951872 file:///home/u8sand/Programs/work/covid19/figure_5.py {"mtime":1587662271099,"ctime":1587567723331,"size":2397,"etag":"3513od2ee2fa","orphaned":false}
75632504832 file:///home/u8sand/Programs/work/covid19/figure_4.py {"mtime":1587667729883,"ctime":1587567584887,"size":4554,"etag":"351434q9c4ms","orphaned":false}
75632582656 file:///home/u8sand/Programs/work/covid19/figure_4.py {"mtime":1587667729883,"ctime":1587567584887,"size":4554,"etag":"351434q9c4ms","orphaned":false}

These were the files I was looking for; these strings occurred directly before the entire file contents.

Step 2: Dumping a small contiguous part of your disk

Because most sane file-systems write files contiguously (when they aren't too big), you should be able to find your file in the contiguous region near the offset you got. It helps to have a rough idea of how large your file is: if it's, say, 1KB, then we can grab, say, 4KB of padded data just to be sure we get all of it. If you're searching by the filename technique, you know you're at the start, so you could pad just 100 bytes in front and more on the end.

WARNING: always be careful with dd; if you mix up the if and of directions you will likely be very sad, because you'll be writing over what you wanted to be reading. Recovering data that has been overwritten is ridiculously difficult, if possible at all; you won't find a tutorial on that: it involves physical techniques.

dd if=your_device_or_raw_dump of=filtered_dump.dd skip=$(expr your_strings_offset - 2000) count=4000 bs=1

dd is perfect for this; it lets you dump just the part you choose very quickly. With your dump file, you then need to make sure it has what you're looking for and trim the padding.
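
If bs=1 gets slow for a bigger window, a GNU coreutils dd can address exact byte offsets while still reading in larger blocks; a sketch of the same command with those flags (assuming GNU dd, where skip_bytes/count_bytes make skip and count byte counts rather than block counts):

dd if=your_device_or_raw_dump of=filtered_dump.dd iflag=skip_bytes,count_bytes skip=$(expr your_strings_offset - 2000) count=4000 bs=64K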

Step 3: Viewing your dump in Hex

Now that it's a workable size, we can just hex dump it and check it ourselves. xxd is a very simple hex dump utility, but any hex editor will do.

xxd filtered_dump.dd | less 
00000030: 0000 0000 0000 0000 0000 0000 0000 0000  ................
00000040: 0000 0000 0000 0000 0000 0000 0000 0000  ................
00000050: 0000 0000 0000 0000 0000 0000 0000 0000  ................
00000060: 0000 0000 6669 6c65 3a2f 2f2f 686f 6d65  ....file:///home
00000070: 2f75 3873 616e 642f 5072 6f67 7261 6d73  /u8sand/Programs
00000080: 2f77 6f72 6b2f 636f 7669 6431 392f 6669  /work/covid19/fi
00000090: 6775 7265 5f35 2e70 7920 7b22 6d74 696d  gure_5.py {"mtim
000000a0: 6522 3a31 3538 3736 3632 3237 3130 3939  e":1587662271099
000000b0: 2c22 6374 696d 6522 3a31 3538 3735 3637  ,"ctime":1587567
000000c0: 3732 3333 3331 2c22 7369 7a65 223a 3233  723331,"size":23
000000d0: 3937 2c22 6574 6167 223a 2233 3531 336f  97,"etag":"3513o
000000e0: 6432 6565 3266 6122 2c22 6f72 7068 616e  d2ee2fa","orphan
000000f0: 6564 223a 6661 6c73 657d 0a23 2525 0a69  ed":false}.#%%.i
00000100: 6d70 6f72 7420 7265 0a69 6d70 6f72 7420  mport re.import 
00000110: 7061 6e64 6173 2061 7320 7064 0a69 6d70  pandas as pd.imp
00000120: 6f72 7420 6e75 6d70 7920 6173 206e 700a  ort numpy as np.
00000130: 6672 6f6d 206d 6174 706c 6f74 6c69 6220  from matplotlib 
...
00000a40: 6727 2c20 6470 693d 3330 3029 0a70 6c74  g', dpi=300).plt
00000a50: 2e73 6176 6566 6967 2827 6669 6775 7265  .savefig('figure
00000a60: 5f35 2e70 6e67 272c 2064 7069 3d33 3030  _5.png', dpi=300
00000a70: 290a 706c 742e 7368 6f77 2829 0a00 0000  ).plt.show()....
00000a80: 0000 0000 0000 0000 0000 0000 0000 0000  ................
00000a90: 0000 0000 0000 0000 0000 0000 0000 0000  ................

Step 4: Restoring your file

There is my complete file. To grab it and throw away the rest of the junk, just take the hex start and end offsets; you can also remove any leftover junk afterwards, like the extra zeros. Alternatively you can try to use the "size" value in the metadata just before the file; I may make a script for parsing a file once you've found the offset of the metadata. span: 00000060 00000a80

Now we use dd again on the file dump

dd if=filtered_dump.dd of=fig5.py skip=$((16#00000060)) count=$(expr $((16#00000a80)) - $((16#00000060))) bs=1

The $((16#...)) is a bash conversion from hex to decimal, so we can do the calculations inline.
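
You can sanity-check the conversion by hand first:

echo $((16#00000060)) $((16#00000a80))   # prints 96 2688, so count is 2592 bytes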

We can now open it in vim or another decent text editor, remove the extra characters at the top and bottom, and save it. Done. Our file is restored!

Using hexedit

The curses-based 'hexedit' package can literally open a hard disk in hex mode and search for ASCII strings (note: you have to press Tab to switch the search mode). You can try that as well.
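
For example, pointing it straight at the device from earlier:

hexedit /dev/sdc1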

Using the lostfiles.py script

I wrote this script after making this tutorial because it seemed pretty simple and useful to do; in retrospect it probably only works for the unique vscode case. It's in this gist as well and can be used for the last step, or in fact the entire process (albeit a bit more slowly). Perhaps it could be rewritten in rust for speed. It works very well on the filtered_dump but could technically work on the original disk as well.

python lostfiles.py --file=filtered_dump.dd --search=file:///home/u8sand/Programs/work/covid19/figure_5\.py
python3 lostfiles.py --file=dump_fig5 --search=file:///home/u8sand/Programs/work/covid19/figure_5\.py
Found file:///home/u8sand/Programs/work/covid19/figure_5.py {"mtime":1587662271099,"ctime":1587567723331,"size":2397,"etag":"3513od2ee2fa","orphaned":false} at 100 parse (to view/save)? [Y/n] 
File size meta: 2397 (2.397kb), eof: 2434 (2.434kb), view? [Y/n] 
#%%
import re
import pandas as pd
...
plt.savefig('figure_5.png', dpi=300)
plt.show()

Save to file (leave blank to not save)? fig5.py
Done. avg 0.000b per second

On the whole disk

python3 lostfiles.py --file=/dev/sdc1 --search=file:///home/u8sand/Programs/work/covid19/figure_5\.py --bs=1g --mpr=25g
Currently searching at 25000000000 (25.000gb) avg 123.181mb per second...
Currently searching at 50000000000 (50.000gb) avg 116.401mb per second...
Currently searching at 75000000000 (75.000gb) avg 115.780mb per second...
Currently searching at 100000000000 (100.000gb) avg 118.150mb per second...
...
Currently searching at 275000000000 (275.000gb) avg 111.233mb per second...
Currently searching at 300000000000 (300.000gb) avg 110.508mb per second...
Found file:///home/u8sand/Programs/work/covid19/2020-05-06/compare.py {"mtime":1588776710489,"ctime":1588770431049,"size":4571,"etag":"353053ces4ne","orphaned":false} at 303790122240 parse (to view/save)? [Y/n] n
Found file:///home/u8sand/Programs/work/covid19/2020-05-22/00_process.py {"mtime":1590162721521,"ctime":1590157118618,"size":7648,"etag":"3559llmjo7tm","orphaned":false} at 304167609600 parse (to view/save)? [Y/n] n
Found file:///home/u8sand/Programs/work/covid19/figure_5.py {"mtime":1587667607872,"ctime":1587567723331,"size":2476,"etag":"35142sq742hr","orphaned":false} at 331504145664 parse (to view/save)? [Y/n]  
File size meta: 2476 (2.476kb), eof: 2537 (2.537kb), view? [Y/n] 
#%%
import re
import pandas as pd
...
plt.savefig('figure_5.png', dpi=300)
plt.show()

Save to file (leave blank to not save)? fig5.py
...

If it keeps up that rate it'll take around 2.5 hours to search a 1 TB disk; I guess that's not too bad.

''' lostfiles.py
This is part of a last-resort method to recover files on a disk. It has only been tested with ext4.
Though I use this myself to *recover* files, who knows if this script can cause data loss; I will not be held responsible, you've been warned.
The idea is:
1. you can open data dumps or disks in binary read mode
2. you can seek through that huge file searching for a file by file name; ext4 (or, more likely, vscode's backup cache) seems to store files along with their names like so:
   file:///your_path/your_file.txt {"mtime":1587662271099,"ctime":1587567723331,"size":2397,"etag":"3513od2ee2fa","orphaned":false}
3. if you provide `file:///your_path/your_file.txt`, we can attempt to locate this section, then parse the metadata and the file right after it,
   based on the reported 'size' and the EOF (the first null byte at least as far away as the reported file size; in practice I've found the overshoot to be small).
Because of this, it works best for raw text files. I never tried it on things like images, though it *might* work (just don't "view" it).
Python isn't the fastest language for seeking through your hard drive, so you may want to use `strings` + `grep` beforehand to find a close location on your hard disk.
That being said, I've not benchmarked it; it does report its current speed. If you do choose to use this on your entire hard disk, make sure you make
your buffer size pretty high (1MB ~ 1GB).
See `python3 lostfiles.py --help` or just read the script to see what parameters it accepts.
Byte values can be written in human form (base 10), e.g. --bs=64m --skip=100g
'''
import re
import sys
import time
import json
import click

def file_chunk_generator(fh, buf_size=4096):
    ''' Read a `file` in chunks of `buf_size`
    '''
    with fh as fr:
        while True:
            buf = fr.read(buf_size)
            if not buf:
                break
            yield buf

def human_bytes(b):
    if b >= int(1e15):
        return f"{b/1e15:.3f}pb"
    elif b >= int(1e12):
        return f"{b/1e12:.3f}tb"
    elif b >= int(1e9):
        return f"{b/1e9:.3f}gb"
    elif b >= int(1e6):
        return f"{b/1e6:.3f}mb"
    elif b >= int(1e3):
        return f"{b/1e3:.3f}kb"
    else:
        return f"{b:.3f}b"

human_byte_matcher = re.compile(r'^(\d+(\.\d+)?)(\w+)?$')

def from_human_bytes(s):
    m = human_byte_matcher.match(s)
    num = float(m.group(1))
    spec = m.group(3) or ''
    if spec.lower() in {'pb', 'p'}:
        mul = 1e15
    elif spec.lower() in {'tb', 't'}:
        mul = 1e12
    elif spec.lower() in {'gb', 'g'}:
        mul = 1e9
    elif spec.lower() in {'mb', 'm'}:
        mul = 1e6
    elif spec.lower() in {'kb', 'k'}:
        mul = 1e3
    elif spec.lower() in {'b', ''}:
        mul = 1
    else:
        raise Exception(f"Unrecognized {spec}")
    return int(num * mul)

@click.command()
@click.option('--file', help='File to search for lost files')
@click.option('--search', help='Unique file/directory file:// path to search for')
@click.option('--skip', default='0', help='Skip ahead number of bytes')
@click.option('--bs', default='4096', help='Buffer size to use when searching')
@click.option('--mpr', default='1g', help='Multiple of buffer size to report progress, default to every GB (0 to disable)')
def search(file, search, skip='0', bs='4096', mpr='1g'):
    # setup search: the file:// path followed by a json metadata blob
    matcher = re.compile(f"(?P<search>{search}) (?P<meta>{{.+}})".encode())
    # prepare args
    skip = from_human_bytes(skip) if type(skip) == str else skip
    bs = from_human_bytes(bs) if type(bs) == str else bs
    mpr = from_human_bytes(mpr) if type(mpr) == str else mpr
    # prepare file
    fr = open(file, 'rb', buffering=0)
    if skip:
        fr.seek(skip)
        offset = skip
    else:
        offset = 0
    # read in half-buffers and search last+current so matches spanning two reads are found
    it = iter(file_chunk_generator(fr, buf_size=bs // 2))
    # start searching
    last_chunk = b''
    input_time_wasted = 0
    start = time.time()
    try:
        while True:
            chunk = next(it)
            offset += len(chunk)
            full_chunk = last_chunk + chunk
            if mpr and (offset - skip) % int(mpr) == 0:
                print(f"Currently searching at {offset} ({human_bytes(offset)}) avg {human_bytes(float(offset - skip) / (time.time() - start - input_time_wasted))} per second...")
            m = matcher.search(full_chunk)
            if m is None:
                last_chunk = chunk
                continue
            else:
                start_input = time.time()
                # use separate names so we don't clobber the `start` timer above
                match_start, match_stop = m.span()
                # `offset` points just past `full_chunk`, so rewind to get the absolute position
                match_offset = offset - len(full_chunk) + match_start
                print(f"Found {m.group('search').decode()} {m.group('meta').decode()} at {match_offset} parse (to view/save)? [Y/n] ", end='')
                yn = input().strip()
                if yn != 'n':
                    meta = json.loads(m.group('meta'))
                    size = meta['size']
                    # the file contents follow the metadata; EOF is the first null byte
                    # at least `size` bytes in
                    remaining_chunk = full_chunk[match_stop+1:]
                    eof = None
                    while eof is None or eof < size:
                        try:
                            eof = remaining_chunk.index(b'\0', 0 if eof is None else eof + 1)
                        except ValueError:
                            # no null byte yet, keep reading
                            chunk = next(it)
                            offset += len(chunk)
                            remaining_chunk += chunk
                    file_contents = remaining_chunk[:eof]
                    print(f"File size meta: {size} ({human_bytes(size)}), eof: {eof} ({human_bytes(eof)}), view? [Y/n] ", end='')
                    yn = input().strip()
                    if yn != 'n':
                        try:
                            print(file_contents.decode())
                        except UnicodeDecodeError:
                            print(file_contents)
                    #
                    print('Save to file (leave blank to not save)? ', end='')
                    fn = input().strip()
                    if fn:
                        open(fn, 'wb').write(file_contents)
                    last_chunk = remaining_chunk[eof:]
                else:
                    # skipped this match, keep scanning just past it
                    last_chunk = full_chunk[match_stop:]
                # exclude time spent waiting on prompts from the reported throughput
                input_time_wasted += time.time() - start_input
    except StopIteration:
        pass
    except KeyboardInterrupt:
        print('Stopping...')
    #
    print(f"Done. avg {human_bytes(float(offset - skip) / (time.time() - start - input_time_wasted))} per second")

if __name__ == '__main__':
    search()