Created
November 18, 2014 23:36
-
-
Save 3ki5tj/2ab5ed92e0e8ca67c119 to your computer and use it in GitHub Desktop.
Format leading spaces and tabs in C-like code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| ''' format the leading spaces and tabs | |
| Main functions: | |
| * tab2sp(): change leading tabs to spaces | |
| * reindent(): main function, reformat the leading indents | |
| reindentf(): wrapper for reindent, accepts a file | |
| * guessindent(): guess the indent size | |
| Helper class CParser | |
| * switchpp(): switch the current preprocessor state | |
| * switchcmt(): switch the current comment state | |
| Helper functions: | |
| * cleanline(): remove comments, then strip() | |
| * getindent(): get the leading indent of the line | |
| * endingcmt(): if a code line starts a block comment | |
| ''' | |
| import os, sys, getopt, re | |
| ''' ################## Helper functions begins ##################### ''' | |
def cleanline(ln):
    ''' strip away comments and surrounding spaces from one line of C code

    ln: a single source line (with or without its trailing newline)
    return: what is left after removing, in this order,
        * complete block comments `/* ... */'
        * a line comment `// ...'
        * an unfinished block comment `/* ...' running to the end of line
        * the tail `... */' of a block comment opened on an earlier line
      and stripping the leading/trailing whitespace
    NOTE: comment markers hidden in string literals are not recognized '''
    # complete block comments go first, so that a `//' inside one
    # (e.g. `/* see http://... */') is not taken for a line comment
    ln = re.sub(r"/\*.*?\*/", "", ln).strip()
    ln = re.sub(r"//.*", "", ln).strip()
    # filter out a partial comment `/* .... '
    ln = re.sub(r"/\*.*?$", "", ln).strip()
    # filter out the closing part ` .... */' of an earlier comment
    ln = re.sub(r"^.*?\*/", "", ln).strip()
    return ln
def getindent(ln):
    ''' return the run of whitespace that opens the line `ln'
        (the whole line, if it holds nothing but whitespace) '''
    body = ln.lstrip()
    return ln[:len(ln) - len(body)]
def endingcmt(ln, incmt = False):
    ''' tell whether a block comment is still open at the end of the line
            a=b; /* hahaha */ b=3; /* comment
        ln: the line to scan
        incmt: True if the line starts inside a block comment
        return: True if the next line starts inside a block comment
        NOTE: it will fail if `/*' or '*/' is hidden in a string '''
    if incmt:
        # the comment carried over must be closed before anything else counts
        closing = re.match(r".*?\*/(.*)", ln)
        if not closing:
            return True # the block comment remains
        ln = closing.group(1)

    # peel off one completed `/* ... */' per pass and look at the remainder
    while True:
        closed = re.match(r".*?(/\*.*?\*/)(.*)", ln)
        opener = re.match(r".*?(/\*).*", ln)
        slashes = re.match(r".*?(//).*", ln)

        if slashes is not None:
            # a `//' with no `/*' at all, or in front of the first `/*',
            # turns the rest of the line into a line comment
            if opener is None or slashes.start(1) < opener.start(1):
                return False

        if closed is None:
            # no more completed `/* ... */':
            # a remaining `/*' means the comment runs into the next line
            return opener is not None

        ln = closed.group(2)
| ''' ################## Helper functions ends ##################### ''' | |
| ''' ############## Helper class CParser begins ################### ''' | |
class CParser:
    ''' line-by-line tracker of the preprocessor and block-comment state

    Feed the lines of a source file, in order, to switchpp() and/or
    switchcmt(); each call reports the state of the line it is given
    and updates the state carried over to the next line. '''

    ppn = 0 # preprocessor level, i.e., depth of `#if ... #endif'
    inpp = False # currently in a preprocessor
    incmt = False # currently in a block comment

    def __init__(self, ppn = 0, inpp = False, incmt = False):
        self.ppn = ppn
        self.inpp = inpp
        self.incmt = incmt

    def switchpp(self, ln):
        ''' switch the current state for preprocessors
            according to the input line `ln'
            return if the current line is in a preprocessor block '''
        ln = ln.strip()
        # ignore the preprocessor, if it is in a comment
        if not self.incmt and ln.startswith("#"):
            self.inpp = True
            # C allows spaces between `#' and the keyword, e.g. `#  ifdef'
            directive = ln[1:].lstrip()
            if directive.startswith("if"): # #if, #ifdef, #ifndef
                self.ppn += 1
            elif directive.startswith("endif"):
                self.ppn = max(self.ppn - 1, 0)
        inppthis = self.inpp
        # compute the pp state of the next line
        if self.inpp:
            # end the processor, unless there's a line continuation
            if not ln.endswith("\\"):
                self.inpp = False
        return inppthis

    def switchcmt(self, ln):
        ''' switch the current state for block comments
            according to the input line `ln'
            an embedded comment does not count:
              a = 3; /* assignment */ b = 4;
            return if the current line is in a comment block,
            that is, it starts inside one or begins with `/*' '''
        ln = ln.strip()
        incmtthis = self.incmt
        if not self.incmt and ln.startswith("/*"):
            incmtthis = True
        # the state for the next line: is a block comment left open?
        self.incmt = endingcmt(ln, self.incmt)
        return incmtthis
| """ | |
| def printppcmt(s): | |
| ''' test: print preprocessor or comment lines ''' | |
| cp = CParser() | |
| for i in range(len(s)): | |
| inpp = cp.switchpp(s[i]) | |
| incmt = cp.switchcmt(s[i]) | |
| if inpp or incmt: print "%5d:%d,%5s|%5s|%s" % (i+1, cp.ppn, cp.inpp, cp.incmt, s[i]), | |
| """ | |
| ''' ############## Helper class CParser ends ################### ''' | |
def guessindent(lines, defind, method, verbose = 0):
    ''' guess the basic unit of leading indents

    lines: the source lines
    defind: the default indent string, used when nothing better is found
    method: -1: always return `defind'
             1: trust a `tab-width: N;' tag if the first non-blank line
                starts with `/*' and carries one, otherwise count as below
             other values: count how the code lines are indented
    verbose: nonzero prints the findings, >= 3 also reports odd indents
    return: a tab character, or a string of spaces '''
    # 1. if the user insists
    if method == -1:
        return defind

    # 2. try to read the `tab-width' tag
    if method == 1:
        for ln in lines:
            if ln.strip(): break
        else: # there is no non-blank line at all
            return defind
        if ln.startswith("/*"):
            m = re.search(r"tab-width:\s*([0-9]+);", ln)
            if m:
                if verbose:
                    print("Tab-size according to `tab-width': %s" % m.group(1))
                return " " * int(m.group(1))

    # 3. try to guess the indent size
    lntab = 0 # number of lines that uses tabs
    lnsp = [0] * 100 # lnsp[k] == number of lines that uses k-spaces
    cp = CParser()
    for lnum, line in enumerate(lines, 1):
        ln = line.rstrip()
        # skip preprocessor and comments
        if cp.switchpp(ln): continue
        if cp.switchcmt(ln): continue
        ind = getindent(ln)
        if len(ind) == 0: continue
        # update the # of lines that use Tab as indents
        l = ind.find("\t")
        if l >= 0:
            ind = ind[:l] # only the spaces before the first tab count
            lntab += 1
        # compute the number of spaces in the indent
        ns = len(ind)
        if ns == 0: continue
        if ns % 2 != 0 and verbose >= 3:
            print("strange indents %s : %s : %s" % (ns, lnum, line.strip()))
        if ns < len(lnsp): lnsp[ns] += 1

    nspaced = sum(lnsp) # number of lines indented by spaces
    if lntab == 0 and nspaced == 0:
        # no indented code line to learn from
        return defind

    nsp = lnsp.index( max(lnsp) )
    if nsp % 2 == 0: # use a heuristic formula
        # prefer the smaller unit, if it is used at least
        # 20% as often as the next larger one
        nsp = 8
        if lnsp[4] >= 0.2 * lnsp[8]:
            nsp = 4
            if lnsp[2] >= 0.2 * lnsp[4]:
                nsp = 2
    if verbose:
        print("tabs: %s lines, spaces: %s lines, indent %s"
              % (lntab, lnsp[1:9], nsp))
    if lntab > nspaced: # more lines uses tabs
        return "\t"
    else: # more lines uses spaces
        return " " * nsp
def lntab2sp(s, tab):
    ''' convert tabs into spaces

    s: the string to convert, normally the leading indent of a line
    tab: the distance between two tab stops, in columns
    return: `s' with every tab replaced by the spaces needed to reach
        the next tab stop; all other characters are kept '''
    return s.expandtabs(tab)
def tab2sp(s0, tabsize = 4, verbose = 0):
    ''' change leading tabs in the indents to spaces
        called in `reindent'

    s0: the source lines
    tabsize: kept for the callers; it is not consulted, the tab width
        is chosen line by line among 2, 4 and 8 (see below)
    verbose: >= 2 prints every converted line
    return: a new list of lines without trailing spaces, in which the
        indents of code lines are free of tabs; preprocessor lines
        and comment lines keep their indents '''
    s1 = [ln.rstrip() + '\n' for ln in s0]
    iprev = 0 # the previous line that is neither preprocessor nor comment
    cp = CParser()
    for i in range(len(s0)):
        # skip preprocessor and comments
        if cp.switchpp(s0[i]): continue
        if cp.switchcmt(s0[i]): continue
        ind0 = getindent(s0[i])
        # nothing to do if the indent has no tab; a blank line is
        # left alone as well, its "indent" would include the newline
        if '\t' in ind0 and s0[i].strip():
            # compute the proper tabsize
            indprev0 = getindent(s0[iprev])
            indprev1 = getindent(s1[iprev])
            if indprev0 == ind0 and '\t' not in indprev1:
                # if the indent is the same as the previous line, use the
                # old tab; this requires the previous indent to be
                # converted already, which is not so for the first line,
                # or if `iprev' still points to a skipped line
                ind1 = indprev1
            else:
                # try different tab sizes, and choose the first one that
                # makes the indent at least as long as the previous one
                for tb in [2, 4, 8]:
                    ind1 = lntab2sp(ind0, tb)
                    if len(ind1) >= len(indprev1):
                        break
            if verbose >= 2:
                print("TAB2SP line %s, [%s|%s]%s"
                      % (i+1, indprev1, ind1, s0[i].strip()))
            # apply the new indent
            s1[i] = ind1 + s1[i].lstrip()
        iprev = i
    return s1
def reindent(s0, defind = " ", indmethod = 1, verbose = 0):
    ''' reformat the leading indents

    when `{' is encountered, an indent is added on the next line
    when `}' is encountered, the indent is removed
    for other deliberate indents,
    the algorithm looks for the incremental indent difference
    between the current line and previous line, and tries to
    mimic the difference in the output

    s0: the source lines
    defind: the default indent unit, handed to guessindent()
    indmethod: how the indent unit is chosen, handed to guessindent()
    verbose: >= 2 reports every changed line,
             >= 3 also traces the changes of the indent level
    return: a new list of lines; every line ends with a newline
        and has no trailing spaces '''
    # remove trailing spaces
    s0 = [ln.rstrip() + '\n' for ln in s0]

    unitind = guessindent(s0, defind, indmethod, verbose)

    # change leading tabs to spaces
    if unitind != '\t':
        s1 = tab2sp(s0, len(unitind), verbose)
    else:
        s1 = s0[:]

    level = 0 # the current brace level
    levels = [ level ] * len(s0) # the brace level of each line
    indents = [ "" ] * 100 # indents[k] == output indent of level k
    iprev = 0 # the last good code line
    cp = CParser()

    for i in range(len(s0)):
        ln = s0[i].rstrip()
        lnstrip = ln.strip()
        if len(lnstrip) == 0: # blank line
            continue

        # preprocessor control
        if cp.switchpp(lnstrip): continue

        # control comments, comment lines are formatted as well
        incmt = cp.switchcmt(lnstrip)

        # start formatting
        lnclean = cleanline(ln)
        follow = 0
        indincr = inddecr = ""

        # if this line has no indent, reset
        if not ln[0].isspace():
            # avoid resetting if it is a label line like `EXIT:'
            if re.match(r"\w+:", lnclean):
                s1[i] = s0[i].strip() + '\n'
                continue
            elif level != 0:
                if incmt: # a quick fix for mdrun.c
                    continue
                if verbose >= 3:
                    print("RESET indent level %s at line %s: %s"
                          % (level, i+1, lnstrip))
                # reset the level, nasty trick, should be avoided
                level = 0
        elif i > 0:
            # compute the indent difference from the previous line
            indthis = getindent(s0[i])
            indprev = getindent(s0[iprev])
            # follow this line if it has the same indent as the previous one
            if indthis == indprev:
                follow = 1
            # register the amount of indent in the source
            if indthis.startswith(indprev):
                indincr = indthis[len(indprev):]
            elif indprev.startswith(indthis):
                inddecr = indprev[len(indthis):]

        # adjust the indent level, before the current line
        if lnclean.startswith("}"):
            if verbose >= 3:
                print("UNDENT line %s level %s : %s"
                      % (i+1, level, s1[i].strip()))
            level = max(0, level - 1)
        levels[i] = level

        # compute the indent for the current line
        if follow: # following the previous line's indent
            indent = getindent(s1[iprev])
        else:
            indent = indents[level]
            # if this line shares the same level with the previous one
            # but it appears to have longer/shorter indent,
            # then try to mimic the source
            if i > 0 and level == levels[iprev]:
                indprev = getindent( s1[iprev] )
                if len(indincr) > 0:
                    indent = indprev + indincr
                elif len(inddecr) > 0 and indprev.endswith(inddecr):
                    indent = indprev[:-len(inddecr)]

        # apply the indent for the current line
        s1[i] = indent + s0[i].strip() + '\n'

        # adjust the indent level for the ensuing lines
        iref = -1
        if lnclean.endswith("{"):
            iref = i
            # if we encounter an ending `) {'
            # try to find its starter if/else/for/while
            # s.t. we can reset the indent level
            if re.search(r"\)\s*{$", lnclean):
                while iref >= 0:
                    lnref = cleanline( s0[iref] )
                    if lnref == "": # gone too far
                        iref += 1
                        break
                    if ( lnref.startswith(("if", "for", "while"))
                         or re.search(r"^(})?\s*else", lnref) ):
                        break
                    iref -= 1
                else:
                    # reached the top of the file without finding a
                    # starter; refer to the first line, for an index
                    # of -1 would pick the last line of the file
                    iref = 0
            ss = getindent(s1[iref])
            if verbose >= 3:
                print("INDENT line %s level %s iref %s : %s"
                      % (i+1, level, iref+1, s1[iref].strip()))
            # make room in the table for very deep nesting
            while len(indents) <= level + 1:
                indents.append("")
            # try to update also the indent of this level
            # as long as this is not the ground level,
            if level > 0: indents[ level ] = ss
            indents[ level+1 ] = ss + unitind
            level += 1

        # The difference does not count the trailing spaces
        if s1[i] != s0[i] and verbose >= 2:
            # s1[i] ends with exactly one newline, which print() supplies
            print("DIFF: %s, prev-level %s, level %s, next-level %s, indent %s, [%s], incmt %s, iprev %s, iref %s\nold:%snew:%s" % (
                i+1, levels[iprev], levels[i], level, len(indent), indent, cp.incmt, iprev, iref,
                s0[i], s1[i][:-1]))

        # register the last good code line
        iprev = i
    # end of the main loop
    return s1
def reindentf(fn, fnout, defind, indmethod, verbose = 0):
    ''' reindent a file, wrapper of reindent()

    fn: the input file
    fnout: the output file, `fn' itself is overwritten if it is empty
    defind, indmethod, verbose: handed to reindent()
    With `verbose' >= 1 the user is asked before anything is written,
    and the program exits with status 1 unless the answer starts
    with one of `yYoO'.
    A file that cannot be read is reported and skipped. '''
    try:
        with open(fn) as fp:
            s0 = fp.readlines()
    except (IOError, OSError, UnicodeError):
        print("cannot open %s" % fn)
        return

    s1 = reindent(s0, defind, indmethod, verbose)

    # be careful when rewriting the file
    if ''.join(s0) == ''.join(s1):
        print("keeping %s" % fn)
        return

    if not fnout:
        fnout = fn
    if verbose >= 1:
        try: # Python 2
            ask = raw_input
        except NameError: # Python 3
            ask = input
        yesno = ask("ok?").strip()
        if not yesno or not yesno[0] in "yYoO":
            print("abort")
            sys.exit(1)
    try: # make a backup, only if the optional module `zcom' is available
        import zcom
        zcom.safebackup(fn, ".bak")
    except ImportError: pass
    print("writing %s" % fnout)
    with open(fnout, "w") as fp:
        fp.writelines(s1)
def usage():
    """ print usage and die (exit status 1) """
    print("%s [Options] input" % sys.argv[0])
    print("""
  Reformat leading indents (spaces and tabs)

  OPTIONS:

   -t:                by default, use tab instead of spaces
   -s n, --spaces=n:  the default number `n' of spaces of an indent
   -f, --force:       always use the default indent
   -R, --recursive:   apply to subdirectories, if `input' is
                      a wildcard pattern like `*.c',
                      the pattern must be quoted as '*.c'
   -L, --nolinks:     skip symbolic links
   -o, --output:      output file
   --noemacs:         don't trust Emacs tag for the tab size
   -v:                be verbose
   --verbose=n:       specify the verbosity
  """)
    sys.exit(1)
def doargs():
    ''' Handle common parameters from command line options

    return: the tuple (ls, fnout, defind, indmethod, verbose)
        ls: the input files; expanded by zcom.argsglob() if the
            optional module `zcom' can be imported, otherwise the
            positional arguments as they are
        fnout: the output file (-o), or None
        defind: the default indent (-s, -t)
        indmethod: 1: trust Emacs tag, 0: compute naturally (--noemacs),
            -1: always use `defind' (-f)
        verbose: the verbosity (-v, --verbose)
    An unknown option, a non-integer number or -h prints the usage
    and exits. '''
    try:
        opts, args = getopt.gnu_getopt(sys.argv[1:], "ts:fRLo:hv",
            [ "spaces=", "force", "recursive",
              "nolinks", "output=",
              "noemacs", "verbose=", "help", ] )
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err)) # will print something like "option -a not recognized"
        usage()

    recur = False
    links = True
    fnout = None
    defind = " "
    indmethod = 1 # 1: trust Emacs tag, 0: compute naturally, -1: always use `defind'
    verbose = 0

    for o, a in opts:
        if o in ("-f", "--force",):
            indmethod = -1
        elif o in ("-s", "--spaces",):
            try:
                defind = " " * int(a)
            except ValueError:
                print("option %s needs an integer, got `%s'" % (o, a))
                usage()
        elif o in ("-t", "--tab",):
            defind = "\t"
        elif o in ("-R", "--recursive",):
            recur = True
        elif o in ("-L", "--nolinks",):
            links = False
        elif o in ("-o", "--output",):
            fnout = a
        elif o in ("--noemacs",):
            if indmethod > 0: # no effect if `--force' is set
                indmethod = 0
        elif o in ("-v",):
            verbose += 1
        elif o in ("--verbose",):
            try:
                verbose = int(a)
            except ValueError:
                print("option %s needs an integer, got `%s'" % (o, a))
                usage()
        elif o in ("-h", "--help",):
            usage()

    ls = args
    try: # limit the dependence on argsglob
        import zcom
        ls = zcom.argsglob(args, "*.c *.cpp *.h *.hpp *.java", recur = recur, links = links)
    except ImportError: pass
    if len(ls) <= 0: print("no file for %s" % args)
    return ls, fnout, defind, indmethod, verbose
if __name__ == "__main__":
    # script entry point: read the command line,
    # then reformat the input files one after another
    # NOTE(review): the same `fnout' is handed to every file, so with
    # `-o' and several inputs each result replaces the previous one
    # -- confirm that this is intended
    fns, fnout, defind, indmethod, verbose = doargs()
    for fn in fns:
        reindentf(fn, fnout, defind, indmethod, verbose)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment