After you run GNU ddrescue on a drive with an NTFS filesystem, this script uses your ddrescue logfile and the ntfs-3g tools to list the files affected by unrecoverable bad chunks.
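A minimal sketch of the workflow (the device names and script filename here are illustrative placeholders, not part of the gist): first rescue the failing drive to a good one while recording a logfile, e.g. ddrescue -d /dev/sdd /dev/sde work.log, then fill in the device, log, and log_adj variables at the top of the script and run it under Python 2, e.g. python2 list_bad_files.py.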
#!/usr/bin/env python
import sys
import os
import subprocess
import re
# usage:
#
# first run GNU ddrescue with a logfile,
# then update the following variables

# block device of your NTFS filesystem (you should probably use the target, on your good drive)
device = "/dev/sde2"

# logfile produced by ddrescue
log = "work.log"

# difference between your ddrescue logfile offsets and your partition start, in bytes
#
# 1. if you ran ddrescue on your NTFS filesystem partition, put 0 here
#
# 2. if you ran ddrescue on the whole drive, get your filesystem offset from fdisk:
#
#    fdisk /dev/sde
#    u (change display units; repeat until you're in sectors)
#    p (take the "Start" sector for your filesystem's partition and multiply it
#       by the sector size shown at the top of the output; eg. 718848 * 512)
log_adj = 368050176
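# worked example: a partition starting at sector 718848 on a drive with
# 512-byte sectors gives log_adj = 718848 * 512 = 368050176 (the value above)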
# that's it!
def ph(message):
    # print a section header inside a banner of asterisks
    stars = '*' * (len(message) + 2)
    print stars
    print " {} ".format(message)
    print stars
ph("Read log") | |
with open(log, 'rb') as lf: | |
lines = list(lf) | |
lines = [l for l in lines if not l.startswith('#')] | |
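# the remaining lines are the current-position line followed by data lines of
# the form "pos size status"; a status of '-' marks an unrecoverable block, eg.
#   0x15F00000  0x00010000  -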
bad_lines = [l for l in lines[1:] if l.strip().endswith('-')]
matches = [re.match(r'0x([0-9a-fA-F]+)[\t ]+0x([0-9a-fA-F]+)', l.strip()) for l in bad_lines]
# translate each bad chunk to (offset within the filesystem, size), in bytes
bad_chunks = [(int(m.group(1), 16) - log_adj, int(m.group(2), 16)) for m in matches]

print "Bad chunks:"
for bc in bad_chunks:
    print " offs {} size {}".format(bc[0], bc[1])
ph("Read filesystem metadata") | |
ntfsinfo_output = subprocess.check_output(['ntfsinfo', '-m', device]) | |
matches = re.search(r'^[\t ]*Cluster Size: ([0-9]+)$', ntfsinfo_output, re.MULTILINE) | |
cluster_size = int(matches.group(1)) | |
print "Cluster size is {}".format(cluster_size) | |
ph("Read directory tree") | |
def read_path(path): | |
ls_output = subprocess.check_output(['ntfsls', '-F', '-p', path.encode('utf-8'), device]).decode('utf-8').strip().split(u'\n') | |
files = {} | |
for filename in ls_output: | |
if filename in [u'./', u'../']: | |
continue | |
if any(filename.endswith(c) for c in u'*@=|'): | |
print("Ignoring unrecognized file type {}".format(filename)) | |
continue | |
sys.stdout.write('\033[2K' + os.path.join(path, filename).encode('utf-8') + '\r') | |
sys.stdout.flush() | |
if filename.endswith(u'/'): | |
files[filename] = read_path(os.path.join(path, filename[:-1])) | |
else: | |
files[filename] = os.path.join(path, filename) | |
return files | |
fs_tree = read_path(u'/') | |
sys.stdout.write('\033[2K') | |
sys.stdout.flush() | |
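# fs_tree is a nested dict: names ending in '/' map to a sub-dict for that
# directory; all other names map to the file's full path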
all_file_paths = []

def extract_file_paths(tree):
    # flatten the nested directory dict into a list of file paths
    for filename, path_or_contents in tree.iteritems():
        if filename.endswith(u'/'):
            extract_file_paths(path_or_contents)
        else:
            all_file_paths.append(path_or_contents)

extract_file_paths(fs_tree)
print '.'
ph("Get run lists") | |
affected_paths = {} | |
all_file_paths_len = len(all_file_paths) | |
for i, path in enumerate(all_file_paths): | |
sys.stdout.write('\r\033[2K{}% {}'.format(int(100 * i / all_file_paths_len), path.encode('utf-8'))) | |
sys.stdout.flush() | |
lines = subprocess.check_output(['ntfsinfo', '-v', '-F', path.encode('utf-8'), device]).decode('utf-8').strip().split('\n') | |
runlist_lines = [i for i, l in enumerate(lines) if u'Runlist:' in l] | |
overlaps = [] | |
    for lineno in runlist_lines:
        next_line = lineno + 1
        while True:
            if next_line >= len(lines):
                break
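            # a runlist entry prints as three hex columns; the regex below
            # captures the second (starting cluster on disk) and third (run
            # length in clusters), and we stop at the first non-matching line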
            runlist_def = re.match(r'^[\t ]+0x[0-9a-f]+[\t ]+0x([0-9a-f]+)[\t ]+0x([0-9a-f]+)[\t ]*$', lines[next_line])
            if runlist_def is None:
                break
            start_cluster = int(runlist_def.group(1), 16)
            len_clusters = int(runlist_def.group(2), 16)
            # convert clusters to byte offsets within the filesystem
            start_offs = start_cluster * cluster_size
            length = len_clusters * cluster_size
            end_offs = start_offs + length
            for bc in bad_chunks:
                bc_start = bc[0]
                bc_end = bc[0] + bc[1]
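                # half-open intervals [a, b) and [c, d) overlap iff
                # max(a, c) < min(b, d)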
                if max(start_offs, bc_start) < min(end_offs, bc_end):
                    affected_paths.setdefault(path, [])
                    affected_paths[path].append((start_offs, length, bc[0], bc[1]))
                    print "Bad chunk [file: offs {} size {}; bc: offs {} size {}]".format(start_offs, length, bc[0], bc[1])
            next_line += 1

sys.stdout.write('\r\033[2K')
sys.stdout.flush()
ph('Done')

if affected_paths:
    print "Files with problems:"
    for path in sorted(affected_paths.keys()):
        print u" {}".format(path)
else:
    print "No files affected"