Last active
March 15, 2022 11:52
-
-
Save AntumDeluge/fbf943541e6f6be50c7c0c155dd07ab0 to your computer and use it in GitHub Desktop.
Script to clean leading & trailing whitespace in text files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
## The MIT License (MIT) | |
# | |
# Copyright © 2022 Jordan Irwin (AntumDeluge) | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy of | |
# this software and associated documentation files (the "Software"), to deal in | |
# the Software without restriction, including without limitation the rights to | |
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |
# of the Software, and to permit persons to whom the Software is furnished to do | |
# so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in | |
# all copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
## SOFTWARE. | |
import os, sys, errno, platform, codecs, traceback | |
from enum import Enum | |
## Converts a value to a string.
#
# Tuples & lists are rendered element by element; any other value is passed
# through <code>str()</code> so the result can always be concatenated.
#
# @param t
#     Value to convert.
# @param[opt] delim
#     If set, sequence elements are joined with this string & no enclosing
#     brackets are added. If <code>None</code>, elements are joined with ","
#     & wrapped in "()" (tuple) or "[]" (list).
# @return
#     String representation of <code>t</code>.
def toString(t, delim=None):
    if not isinstance(t, (tuple, list)):
        # non-sequence values must still come back as text
        return str(t)

    items = [str(v) for v in t]
    if delim is not None:
        return delim.join(items)

    opener, closer = "[]" if isinstance(t, list) else "()"
    # join() keeps the separator even when an element renders as ""
    return opener + ",".join(items) + closer
## Debugging output level.
#
# Members are numbered 0 (SILENT) through 5 (VERBOSE) in declaration order.
class Level(Enum):
    SILENT, ERROR, WARN, INFO, DEBUG, VERBOSE = range(6)
## Message printing/logging.
#
# ERROR messages are written to stderr with an "ERROR: " prefix, WARN messages
# to stdout with a "WARNING: " prefix & everything else to stdout unprefixed.
# A message is dropped when its level is SILENT, when the configured level
# (options["level"]) is SILENT, or when its level is greater than the
# configured level.
#
# @param msg
#     Message to be printed to console. <code>None</code> prints an empty
#     line. Non-string values are converted to text first.
# @param lvl
#     Debugging level (default: INFO), as a Level member or its integer value.
def log(msg=None, lvl=Level.INFO):
    if not isinstance(lvl, int):
        lvl = lvl.value
    if lvl == Level.SILENT.value:
        return

    threshold = options["level"]
    if not isinstance(threshold, int):
        threshold = threshold.value
    if threshold == Level.SILENT.value or lvl > threshold:
        return

    # write newline for empty messages
    if msg is None:
        sys.stdout.write("\n")
        return

    if not isinstance(msg, str):
        # toString() formats tuples & lists; the outer str() guarantees that
        # any other value (int, exception, ...) can be concatenated below
        msg = str(toString(msg))

    if lvl == Level.ERROR.value:
        stream, prefix = sys.stderr, "ERROR: "
    elif lvl == Level.WARN.value:
        stream, prefix = sys.stdout, "WARNING: "
    else:
        stream, prefix = sys.stdout, ""

    stream.write(prefix + msg + "\n")
# minimum supported interpreter version
ver_py_min = (3, 0, 0)

# sys.version_info always holds integers for major/minor/micro, whereas
# platform.python_version_tuple() returns strings whose patch level may be
# non-numeric (e.g. "0rc1") & would make int() fail
ver_py = tuple(sys.version_info[:3])

if ver_py < ver_py_min:
    # written straight to stderr: log() reads "options", which is not defined
    # until later in the script
    sys.stderr.write("\nERROR: incompatible Python version " + toString(ver_py, ".")
            + "\n  requires version " + toString(ver_py_min, ".") + " or later\n")
    sys.exit(1)
# name of this script & the directory that contains it
file_exe = os.path.basename(__file__)
dir_tools = os.path.normpath(os.path.dirname(__file__))
# directory the script was started from
dir_root = os.getcwd()

scount_default = 4
level_default = Level.INFO

# flag type options (all off by default)
flags = dict.fromkeys(("help", "fake", "convert-le", "notrail", "nolead"), False)

# all available parameters ("None" denotes parameter is required);
# the flags are appended after the value-carrying parameters
options = {
    "dirs": None,
    "filetypes": None,
    "scount": scount_default,
    "level": level_default.value,
    **flags,
}

# alternative shorthands for parameters
short_options = dict(
    h="help",
    d="dirs",
    f="filetypes",
    s="scount",
    x="fake",
    l="level",
)
## Kinds of values a command line parameter can carry.
#
# Each member's value is an (identifier, description) pair. The identifier is
# stored as <code>__name__</code> so that a member can be compared by name
# against a built-in type (e.g. <code>int.__name__ == "int"</code>) as well as
# against another member.
class OptType(Enum):
    NONE = ("none", "")
    FLAG = ("flag", "")
    BOOL = ("bool", "")
    STRING = ("string", "")
    LIST = ("list", "")
    INT = ("int", "")

    def __init__(self, identifier, description):
        self.__name__ = identifier
        self.desc = description

    ## Allows <code>==</code> to be used in place of equals().
    #
    # Only other members & types are compared; anything else is left to
    # Python's default handling.
    def __eq__(self, other):
        if isinstance(other, (OptType, type)):
            return self.equals(other)
        return NotImplemented

    # defining __eq__ removes the inherited hash, so restore the hash by
    # member name to keep members usable in sets & as dict keys
    # NOTE: a member that compares equal to a built-in type does not share
    #       its hash, so don't mix members & types in one set/dict
    def __hash__(self):
        return hash(self._name_)

    ## Checks whether a type or member denotes the same kind as this member.
    #
    # @param other
    #     A built-in type (e.g. <code>type(value)</code>) or an OptType member.
    # @return
    #     <code>True</code> if the identifiers match.
    def equals(self, other):
        if other is tuple or other is list:
            # treat tuples & lists the same
            other_id = "list"
        elif other is str:
            # the built-in is named "str" but the member identifier is "string"
            other_id = "string"
        else:
            other_id = getattr(other, "__name__", None)
        return other_id == self.__name__

    def getId(self):
        return self.__name__

    def getDescription(self):
        return self.desc

    ## Looks up the kind of value a parameter expects.
    #
    # @param opt
    #     Long parameter name.
    # @return
    #     OptType.NONE for unknown parameters, OptType.FLAG for flags,
    #     OptType.LIST for parameters without a default, otherwise the member
    #     matching the type of the parameter's current value.
    @staticmethod
    def getOptionType(opt):
        if opt not in options:
            return OptType.NONE
        if opt in flags:
            return OptType.FLAG

        val = options[opt]
        # default to list
        if val is None:
            return OptType.LIST

        val_type = type(val)
        for t in OptType:
            if t.equals(val_type):
                return t
        return OptType.NONE

    ## Interprets "y", "yes" & "true" (any case) as <code>True</code>.
    @staticmethod
    def toBoolean(val):
        if isinstance(val, bool):
            return val
        return val.lower() in ("y", "yes", "true")

    ## Converts a value to an integer (raises ValueError if not numeric).
    @staticmethod
    def toInt(val):
        return int(val)
# names of the parameters that have no default (a "None" default denotes a
# required parameter); captured here because the entries of "options" are
# overwritten with the parsed values later on
req_options = tuple(name for name, default in options.items() if default is None)

## Checks whether a parameter has to be supplied on the command line.
#
# @param opt
#     Long parameter name.
# @return
#     <code>True</code> if the parameter has no default value.
def optionIsRequired(opt):
    return opt in req_options
## Stores a value in the global parameter table.
#
# @param opt
#     Parameter name (key of "options").
# @param value
#     Value to store for the parameter.
def setOption(opt, value):
    options.update({opt: value})
## Displays usage help text.
#
# The text is emitted through log() at the default (INFO) level.
def showUsage():
    # NOTE(review): "-d" is shown as optional here, but "dirs" has no default
    #               in "options" & is reported as missing when omitted — confirm
    #               which of the two is intended
    usage = (
        "\nUsage:"
        "\n {exe} -f <filetypes>[ -d <dirs>][ <flags>]"
        "\n {exe} -h"
        "\n\nOptions:"
        "\n -f|--filetypes:\tComma-separated list of filename extensions to parse."
        "\n -d|--dirs:\t\tComma-separated list of directories to search."
        "\n -s|--scount:\t\tNumber of leading spaces to replace with tab"
        " (default: {scount})."
        "\n -l|--level:\t\tLogging level (default: {level})."
        "\n\nFlags:"
        "\n -h|--help:\t\tShow usage information."
        "\n -x|--fake:\t\tSimulate (don't apply changes)."
        "\n --convert-le:\t\tConvert CR/CRLF line endings to LF."
        "\n --nolead:\t\tDon't replace leading spaces with tabs."
        "\n --notrail:\t\tDon't clean trailing whitespace."
    ).format(exe=file_exe, scount=scount_default, level=level_default.value)

    log(usage)
## Displays an error message & exits the process.
#
# An empty line is logged first, then the message at ERROR level.
#
# @param code
#     Exit code to use.
# @param msg
#     Message to display.
# @param[opt] usage
#     If <code>True</code>, shows usage information before exiting.
def exitWithError(code, msg, usage=True):
    log()
    log(msg, Level.ERROR)

    if usage:
        showUsage()

    # same effect as sys.exit(code)
    raise SystemExit(code)
## Compatibility function for case matching in different Python versions.
#
# @param match
#     The statement to be compared.
# @param cases
#     Cases to be checked (mapping of statement to result).
# @param[opt] default
#     Result to use when no case matches.
# @return
#     The matching case's result, else <code>default</code> if it is not
#     <code>None</code>, else <code>match</code> itself.
def switch(match, cases, default=None):
    if match in cases:
        return cases[match]
    # identity check: a default that overrides comparison operators must not
    # be mistaken for "no default"
    return match if default is None else default
## Parses command line arguments & sets up file & directory options.
#
# Every argument must be a short ("-x") or long ("--name") switch. Several
# short flags may be grouped ("-xh"). Non-flag parameters take their value
# from the following argument. Parsed values are stored in "options"; any
# problem ends the process via exitWithError().
#
# @param args
#     List of args to parse.
# @param[opt] flags
#     <code>True</code> while parsing an expanded group of short switches, in
#     which case only flag type parameters are accepted.
def parseArgs(args, flags=False):
    idx = 0
    while idx < len(args):
        raw_arg = args[idx]
        lead_dashes = len(raw_arg) - len(raw_arg.lstrip("-"))
        # short: a single leading dash & no other dash anywhere in the argument
        s_arg = lead_dashes == 1 and raw_arg.count("-") == 1
        # long: exactly two leading dashes
        l_arg = lead_dashes == 2

        # all accepted arguments use a switch ("-"); this also rejects ""
        if not (s_arg or l_arg):
            exitWithError(errno.EINVAL, "malformatted argument: {}".format(raw_arg))

        cur_arg = raw_arg.lstrip("-")
        if s_arg:
            if len(cur_arg) > 1:
                # parse individual short args that are grouped together
                parseArgs(["-" + c for c in cur_arg], True)
                idx += 1
                continue
            cur_arg = switch(cur_arg, short_options)

        if cur_arg not in options:
            exitWithError(errno.EINVAL, "unknown argument: {}".format(cur_arg))

        otype = OptType.getOptionType(cur_arg)
        if otype.equals(OptType.FLAG):
            val = True
        else:
            if flags:
                exitWithError(1,
                        "argument \"{}\" is not a flag type & cannot be grouped".format(cur_arg))

            # arguments must have a parameter; a following switch is not a value
            if idx + 1 >= len(args) or args[idx + 1].startswith("-"):
                exitWithError(1, "argument \"{}\" requires a value".format(cur_arg))

            # value is located at next index
            idx += 1
            val = args[idx]

            if otype.equals(OptType.BOOL):
                val = OptType.toBoolean(val)
            elif otype.equals(OptType.INT):
                try:
                    val = OptType.toInt(val)
                except ValueError:
                    exitWithError(errno.EINVAL,
                            "argument \"{}\" requires an integer value: {}".format(cur_arg, val))
            elif otype.equals(OptType.LIST):
                # split() yields a single element when there is no comma
                val = tuple(val.split(","))

        options[cur_arg] = val
        idx += 1
# running without any arguments is treated as an error
if len(sys.argv) == 1:
    exitWithError(1, "missing parameters")

parseArgs(sys.argv[1:])

# help takes precedence over everything else
if options["help"]:
    showUsage()
    sys.exit(0)

in_paths = options["dirs"]
# a lone path that names an existing regular file selects single-file mode
single_file = bool(in_paths) and len(in_paths) == 1 and os.path.isfile(in_paths[0])

# every parameter still unset at this point was required; file types are not
# needed when a single file was named explicitly
for opt, value in options.items():
    if single_file and opt == "filetypes":
        continue
    if value is None:
        exitWithError(1, "missing required argument: {}".format(opt))
# the actual work

# changes are written unless a simulation run was requested
apply_changes = not options["fake"]
if not apply_changes:
    log("\nsimulation run, changes will not be applied\n")

scount = options["scount"]
# run of spaces that gets replaced with a single tab
spaces_prefix = " " * scount
## Replaces runs of spaces in a line's leading whitespace with tabs.
#
# Only the whitespace before the first non-whitespace character is touched.
#
# @param line
#     Line of text (without line ending).
# @param[opt] prefix
#     Run of spaces that equals one tab (default: the global "spaces_prefix").
#     An empty prefix leaves the line unchanged.
# @return
#     The line with converted indentation.
def replaceLeadingSpaces(line, prefix=None):
    if prefix is None:
        prefix = spaces_prefix

    content = line.lstrip()
    indent = line[:len(line) - len(content)]
    # an empty prefix (space count of 0 or less) can't be searched for
    if prefix:
        indent = indent.replace(prefix, "\t")
    return indent + content
# number of files that were rewritten (or would be, in a simulation run)
cleaned_count = 0

## Cleans the whitespace of a single file.
#
# Reads the file as UTF-8, strips trailing whitespace & converts leading
# spaces to tabs (unless disabled with "notrail"/"nolead") & writes the file
# back if its contents changed. CRLF or CR line endings are written back
# unchanged unless "convert-le" is set, in which case LF is used. In a
# simulation run nothing is written, but the file is still counted & reported.
# Read & write failures end the process via exitWithError().
#
# @param f
#     Path of the file to clean.
def checkFile(f):
    # assigned below, so it has to be declared global to avoid creating a
    # function-local variable
    global cleaned_count

    try:
        # newline="" disables newline translation so CR/CRLF stay detectable
        with open(f, "r", encoding="utf-8", newline="") as buffer:
            st_orig = buffer.read()
    except UnicodeDecodeError:
        exitWithError(1,
                "could not read file {}, please check that it is a text file".format(f),
                False)
    except OSError:
        exitWithError(1, "could not open file for reading: {}".format(f), False)

    text = st_orig
    preserve_le = not options["convert-le"]

    # line endings to be written to output
    le, le_name = "\n", "LF"

    # ensure we are working with LF line endings
    if "\r\n" in text:
        text = text.replace("\r\n", "\n")
        if preserve_le:
            le, le_name = "\r\n", "CRLF"
    # any CR left over at this point is a bare CR line ending
    if "\r" in text:
        text = text.replace("\r", "\n")
        if preserve_le:
            le, le_name = "\r", "CR"

    clean_trail = not options["notrail"]
    clean_lead = not options["nolead"]

    contents_new = []
    for lnum, line_orig in enumerate(text.split("\n"), 1):
        line = line_orig
        if clean_trail:
            # clean trailing whitespace
            line = line.rstrip()
        if clean_lead:
            # replace leading spaces with tabs
            line = replaceLeadingSpaces(line)
        if line != line_orig:
            log("cleaned line {} ({})".format(lnum, f), Level.DEBUG)
        contents_new.append(line)

    if le != "\n":
        log("preserving line endings \"{}\" in file: {}".format(le_name, f), Level.DEBUG)
    st_new = le.join(contents_new)

    if st_new == st_orig:
        return

    if apply_changes:
        try:
            with open(f, "w", encoding="utf-8", newline="") as buffer:
                buffer.write(st_new)
        except (OSError, UnicodeError):
            exitWithError(1, "error while opening file for writing: {}\n{}"
                    .format(f, traceback.format_exc()), False)

    cleaned_count += 1
    log("updated file: {}".format(f))
if single_file:
    checkFile(in_paths[0])
else:
    # check that all directories exist before doing anything
    for path in in_paths:
        if not os.path.isdir(path):
            exitWithError(errno.ENOENT, "file or directory not found: {}".format(path))

    wanted_types = options["filetypes"]
    for path in in_paths:
        for root, _subdirs, names in os.walk(path):
            for name in names:
                f = os.path.join(root, name)
                # text after the last "." of the joined path
                if f.rsplit(".", 1)[-1] in wanted_types:
                    checkFile(f)

# NOTE(review): indentation was lost in this copy of the script; the summary
#               is assumed to be printed in both modes — confirm
log("\ncleaned {} files".format(cleaned_count))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment