Created
December 2, 2013 07:22
-
-
Save bjorndown/7746200 to your computer and use it in GitHub Desktop.
Recursively scans a directory for text files and finds those files not mentioned by name in any file within this tree.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import sys | |
class ReferenceCollector:
    """Find files in a directory tree that no file in the tree mentions by name.

    Every file found while walking the tree is registered under its base
    name, and every file's content is searched with ``pattern`` for names of
    other files.  Files whose reference count stays at zero are reported by
    :meth:`get_unreferenced_files`.

    Limitation: files are keyed by base name only, so two files with the same
    name in different directories share a single entry.
    """

    def __init__(self, pattern, ignored_files):
        """Create a collector.

        pattern -- regular expression used to find file references in file
            content.  If it contains capture groups, the first group is taken
            as the referenced path; otherwise the whole match is used.
        ignored_files -- collection of base names that are never registered
            and never counted as references.
        """
        # Maps base name -> {"references": int, "path": str}.
        self.files = {}
        self.ignored_files = ignored_files
        self.PATTERN = pattern

    def run_scan(self, dir):
        """Scan given directory for unreferenced files."""
        for root, _dirs, files in os.walk(os.path.normpath(dir)):
            for filename in files:
                self.register(root, filename)
                self.scan_content_for_references(root, filename)

    def register(self, path, filename):
        """Initially register file if unknown (and not ignored)."""
        if filename not in self.ignored_files and filename not in self.files:
            self.files[filename] = {"references": 0, "path": path}

    def scan_content_for_references(self, path, filename):
        """Scan content of given file for references to other files.

        Each referenced name has its reference count incremented.  A name
        that has not been seen yet is registered first, using the directory
        of the referencing file as its path.
        """
        content = self._read_file(path, filename)
        for referenced_file in self._extract_file_references(content):
            # Ignored names are never registered, so they must be skipped
            # here as well; indexing self.files for them would raise KeyError.
            if referenced_file in self.ignored_files:
                continue
            self.register(path, referenced_file)
            self.files[referenced_file]["references"] += 1

    def _extract_file_references(self, content):
        """Return the base names of all file references found in content."""
        matches = re.findall(self.PATTERN, content)
        # re.findall yields plain strings for patterns with at most one group
        # and tuples of groups otherwise; normalise both shapes to one string
        # per match (the first group when there are several).
        referenced_files = [
            match if isinstance(match, str) else match[0] for match in matches
        ]
        return self._strip_path(referenced_files)

    def _read_file(self, path, filename):
        """Return the text content of the file ``filename`` inside ``path``."""
        # errors="replace" keeps a single undecodable (e.g. binary) file from
        # aborting the scan of the whole tree.
        with open(os.path.join(path, filename), "r", errors="replace") as file:
            return file.read()

    def _strip_path(self, files):
        """Reduce each path in files to its base name."""
        return [os.path.basename(file) for file in files]

    def get_unreferenced_files(self):
        """Return unreferenced files with fully qualified path, sorted."""
        return sorted(
            os.path.join(data["path"], filename)
            for filename, data in self.files.items()
            if data["references"] == 0
        )

    def get_number_of_scanned_files(self):
        """Return the number of registered file names.

        This includes names that were only seen as references inside other
        files, because those are registered as well.
        """
        return len(self.files)
if __name__ == "__main__":
    # Command-line entry point: scan the directory named by the first
    # argument and report the files that nothing in the tree refers to.
    if len(sys.argv) < 2:
        sys.exit("usage: {} DIRECTORY".format(sys.argv[0]))

    # Raw string so that the regex escapes \w and \. are not treated as
    # (invalid) string escape sequences.
    reference_collector = ReferenceCollector(
        pattern=r"([/\w-]+\.(xml|sql))",
        ignored_files=["ignore.xml"],
    )
    reference_collector.run_scan(sys.argv[1])

    # Compute the result once; it is used for both the count and the listing.
    unreferenced_files = reference_collector.get_unreferenced_files()
    print("Number of scanned files: {}".format(reference_collector.get_number_of_scanned_files()))
    print("Number of unreferenced file(s): {}".format(len(unreferenced_files)))
    print("Unreferenced file(s):")
    for file in unreferenced_files:
        print(file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment