Skip to content

Instantly share code, notes, and snippets.

@rolicot
Last active March 6, 2026 14:01
Show Gist options
  • Select an option

  • Save rolicot/85fa0c7558844fca2023 to your computer and use it in GitHub Desktop.

Select an option

Save rolicot/85fa0c7558844fca2023 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''Detects renamed and/or moved files by tracking inodes.
Creates a shell script to replay similar changes. Make sure to use relative
paths if you want to replay changes in a different absolute location. Does not
follow symbolic links. Inode numbers must be identical (do not cross
filesystems)!
'''
__author__ = 'Pavel Krc'
__email__ = '[email protected]'
__version__ = '1.1'
__copyright__ = 'Copyright (C) 2015 Pavel Krc'
__license__ = 'GPLv2+'
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import sys
import os
import re
# When True the generated replay script is a Python script instead of a
# shell script (more reliable).
generate_python_scripts = True

if generate_python_scripts:
    # Prologue of the generated script: defines the ren()/mkd() helpers that
    # the emitted mv_cmd / mkdir_cmd lines call.
    script_header = '''#!/usr/bin/env python
# -*- coding: utf-8 -*-
import errno
import os

def ren(a, b):
    print(b)
    # lexists() also reports dangling symlinks, which os.rename() would
    # otherwise silently replace; an explicit raise survives "python -O".
    if os.path.lexists(b):
        raise OSError(errno.EEXIST, 'destination already exists', b)
    os.rename(a, b)

def mkd(d):
    print(d)
    os.mkdir(d)

'''
    mv_cmd = 'ren("{0}", "{1}")\n'
    mkdir_cmd = 'mkd("{0}")\n'
    # Paths are emitted inside double-quoted Python literals, so the double
    # quote and the backslash are the characters escaped here.
    escaped_chars = re.compile(r'(["\\])')
else:
    script_header = '#!/bin/sh\nset -e -v\n\n'
    mv_cmd = 'mv -n -T -- "{0}" "{1}"\n'
    mkdir_cmd = 'mkdir -- "{0}"\n'
    # Since single quotes cannot be escaped in sh, double quotes are used,
    # which requires a little more escaping (backtick, quote, dollar sign and
    # backslash). A single regex substitution handles all of them.
    escaped_chars = re.compile(r'([`"\$\\])')


def esc(s):
    """Return *s* with each character matched by ``escaped_chars`` prefixed
    by a backslash, ready to be placed between the double quotes of
    ``mv_cmd`` / ``mkdir_cmd``.
    """
    return escaped_chars.sub(r'\\\1', s)
def dump_inodes(root, log_path):
    """Record the inode number of everything below *root* in *log_path*.

    The log holds one line per entry, ``<D|F> <inode> <path>``: first the
    root directory itself, then, for every directory visited by os.walk(),
    its sub-directories (``D``) followed by its other entries (``F``).
    Inodes are read with os.lstat(), i.e. symbolic links are not followed.
    """
    with open(log_path, 'w') as log:
        log.write('D {0:d} {1}\n'.format(os.lstat(root).st_ino, root))
        # os.walk() is top-down by default; the reader needs every parent
        # directory to be listed before its children.
        for parent, subdirs, others in os.walk(root):
            prefix = parent + '/'
            for record, names in (('D {0:d} {1}\n', subdirs),
                                  ('F {0:d} {1}\n', others)):
                for name in names:
                    full = prefix + name
                    log.write(record.format(os.lstat(full).st_ino, full))
class DirEntry(object):
    """A recorded directory.

    path   -- the directory's recorded path
    parent -- value identifying the parent directory (None for the root)
    dirs   -- set collecting the child directories, initially empty
    files  -- set collecting the child files, initially empty
    """

    __slots__ = ('path', 'parent', 'dirs', 'files')

    def __init__(self, path, parent):
        self.dirs, self.files = set(), set()
        self.parent = parent
        self.path = path
class FileEntry(object):
    """A recorded file.

    path   -- the file's recorded path
    parent -- value identifying the directory that contains the file
    """

    __slots__ = ('path', 'parent')

    def __init__(self, path, parent):
        self.parent = parent
        self.path = path
class MovingTree(object):
    """Recorded directory tree, indexed by inode number, that is compared
    with the current state of the filesystem.

    Attributes:
        root:  path of the recorded root directory, as written in the log.
        dirs:  maps inode number -> DirEntry for every recorded directory
               that detect_changes() has not matched yet.
        files: maps inode number -> FileEntry, likewise for files.

    NOTE: entries are keyed by inode number alone, so when several recorded
    paths share one inode (hard links) only the last one listed is kept.
    """

    def __init__(self, log_path):
        """Load the inode list at *log_path* (``<D|F> <inode> <path>`` lines,
        root directory first, parents before children).

        Raises:
            RuntimeError: the list is empty, does not start with a directory
                record, or contains an unknown record type.
            KeyError: an entry's parent directory was not listed before it.
            ValueError: a line cannot be split into three fields or its
                inode is not an integer.
        """
        self.dirs = {}
        self.files = {}
        revdirs = {}  # path -> inode, used to resolve each entry's parent
        with open(log_path) as log:
            # root entry
            first = next(log, None)
            if first is None:
                raise RuntimeError(
                    'inode list {0!r} is empty'.format(log_path))
            kind, ino, path = self._parse_record(first)
            if kind != 'D':
                raise RuntimeError(
                    'first record must be the root directory, got {0!r}'
                    .format(first))
            self.root = path
            self.dirs[ino] = DirEntry(path, None)
            revdirs[path] = ino

            for line in log:
                kind, ino, path = self._parse_record(line)
                parent_ino = revdirs[path.rsplit('/', 1)[0]]
                if kind == 'D':
                    self.dirs[ino] = DirEntry(path, parent_ino)
                    revdirs[path] = ino
                    self.dirs[parent_ino].dirs.add(ino)
                elif kind == 'F':
                    previous = self.files.get(ino)
                    if previous is not None:
                        # Hard link: this inode was already recorded under
                        # another path. Only the latest path is kept, so
                        # drop the stale membership in the earlier parent;
                        # otherwise update_children() would later meet an
                        # entry whose path is outside that parent.
                        self.dirs[previous.parent].files.discard(ino)
                    self.files[ino] = FileEntry(path, parent_ino)
                    self.dirs[parent_ino].files.add(ino)
                else:
                    raise RuntimeError(
                        'unknown record type {0!r} in line {1!r}'
                        .format(kind, line))

    @staticmethod
    def _parse_record(line):
        """Split one log line into (record type, inode as int, path)."""
        kind, ino, path = line.rstrip('\n').split(' ', 2)
        return kind, int(ino), path

    def create_script(self, script_path):
        """Write the replay script for all detected changes to *script_path*.

        The file is created through os.open() with mode 0o777 so that the
        script is executable - still, read it first!
        """
        fd = os.open(script_path, os.O_CREAT | os.O_WRONLY | os.O_TRUNC,
                     0o777)
        try:
            out = os.fdopen(fd, 'w')
        except Exception:
            # fdopen() failed, so no file object owns the descriptor yet.
            os.close(fd)
            raise
        with out:
            out.write(script_header)
            self.detect_changes(out)

    def update_children(self, entry, orig_p, new_p):
        """Rewrite the recorded paths below a moved directory.

        Every still-unmatched descendant of *entry* gets its leading
        *orig_p* replaced by *new_p* (callers pass both with a trailing
        '/'). Descends recursively through child directories.
        """
        cut = len(orig_p)
        for ino in entry.dirs:
            child = self.dirs.get(ino)
            if child is None:
                # Already matched in place earlier in the walk (such
                # entries are popped without being removed from their
                # parent's set); nothing left to update.
                continue
            assert child.path.startswith(orig_p)
            child.path = new_p + child.path[cut:]
            self.update_children(child, orig_p, new_p)
        for ino in entry.files:
            child = self.files.get(ino)
            if child is None:
                continue  # already matched in place, see above
            assert child.path.startswith(orig_p)
            child.path = new_p + child.path[cut:]

    def _detach(self, entry, ino, kind):
        """Remove *ino* from the ``dirs``/``files`` set (*kind*) of the
        recorded parent of *entry*, unless that parent was already matched.
        """
        parent_entry = self.dirs.get(entry.parent)
        if parent_entry is not None:
            getattr(parent_entry, kind).remove(ino)

    def detect_changes(self, script):
        """Walk the current tree under ``self.root`` and write to *script*
        (a writable text file object) the commands that replay the detected
        moves and directory creations, followed by commented lists of
        deleted directories, deleted files, new files and a summary line.

        Matched entries are removed from ``self.dirs`` / ``self.files``;
        whatever is left at the end is reported as deleted.
        """
        # The order of detecting changes is important. The safest order I
        # could think of was to start top-bottom according to destination
        # (i.e. safely constructing new state with guaranteed existing
        # parents), updating source data structures where necessary.
        newfiles = []
        ok_dirs = ok_files = 0
        for dpath, dnames, fnames in os.walk(self.root):
            dpath += '/'
            for name in dnames:
                p = dpath + name
                ino = os.lstat(p).st_ino
                orig_entry = self.dirs.pop(ino, None)
                if orig_entry is None:
                    # new directory
                    script.write(mkdir_cmd.format(esc(p)))
                elif orig_entry.path == p:
                    # existing directory, still in place
                    ok_dirs += 1
                else:
                    # moved
                    script.write(mv_cmd.format(esc(orig_entry.path), esc(p)))
                    # Disparent self. Moving under either a freshly created
                    # or an already processed dir, so no need to register
                    # under the new parent.
                    self._detach(orig_entry, ino, 'dirs')
                    # update all children in the source tree
                    self.update_children(orig_entry, orig_entry.path + '/',
                                         p + '/')
            for name in fnames:
                p = dpath + name
                ino = os.lstat(p).st_ino
                orig_entry = self.files.pop(ino, None)
                if orig_entry is None:
                    # new file - just log
                    newfiles.append(p)
                elif orig_entry.path == p:
                    # existing file, still in place
                    ok_files += 1
                else:
                    # moved
                    script.write(mv_cmd.format(esc(orig_entry.path), esc(p)))
                    self._detach(orig_entry, ino, 'files')

        # list remaining unprocessed
        script.write('\n### Deleted directories ###\n')
        for p in sorted(e.path for e in self.dirs.values()
                        if e.path != self.root):
            script.write('#{0}\n'.format(p))
        script.write('\n### Deleted files ###\n')
        for p in sorted(e.path for e in self.files.values()):
            script.write('#{0}\n'.format(p))
        script.write('\n### Newly created files ###\n')
        for p in newfiles:
            script.write('#{0}\n'.format(p))
        script.write('\n### {0:d} dirs and {1:d} files have remained in place. ###\n'
                     .format(ok_dirs, ok_files))
if __name__ == '__main__':
    usage = '''Usage:
    {0} dump {{root_path}} {{inode_list_path}}
        Dumps inode numbers inside {{root_path}} to a new
        file {{inode_list_path}}, thus recording current state.
    {0} detect {{inode_list_path}} {{script_path}}
        Compares recorded state within {{inode_list_path}} with current
        state and creates a script to reconstruct detected changes.
'''.format(sys.argv[0])

    # Both actions need two path arguments; print the usage text rather
    # than failing with an IndexError when they are missing. Any further
    # arguments are ignored.
    if len(sys.argv) < 4:
        sys.exit(usage)
    action, first_path, second_path = sys.argv[1:4]

    if action == 'dump':
        dump_inodes(first_path, second_path)
    elif action == 'detect':
        tr = MovingTree(first_path)
        tr.create_script(second_path)
    else:
        sys.exit(usage)
@Pegsneggle
Copy link

Many thanks for producing this code. I have tried it and it works very well. It did cause me some concern, however, until I better understood what was happening.

First of all it works perfectly for file renames and moves.

On the other hand, if an existing file is opened, modified and saved, you might not get the expected result. Suppose the directory under analysis is a Samba share: there's every chance the files therein were created in Microsoft Office. Word and Excel don't save changes directly back to the original file. Instead, a temporary file is created; if that save is successful, the original is deleted and the temporary file is renamed to take its place. This means the file ends up with a different inode from the one it started with.

The original inode is now available for re-use. Another unrelated file save could then use the discarded inode. This leads to mistaken rename activity by this program. However, if an rsync backup follows, the mistakes are then corrected. The benefits bestowed on renames and moves far outweigh the mistakes made during open, modify and save activity.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment