Spot single tokens (like the 'var' keyword) in Scala source code files.
#!/usr/bin/env python

"""
List keyword tokens in Scala source code.

Originally intended for spotting lines containing "var" declarations
in Scala source files (hence the name "scala varning"), which are
somewhat considered "harmful" in Scala. The same can be useful for
spotting explicit "return" statements, which might be superfluous,
and, of course, other keywords.

This tool is written in Python and does not run Scala, but the gentle
user/reader might feel inspired to write the same in Scala using some
AST functionality. Good luck!

Since the basic functionality here is fairly simple (thanks to the
excellent Pygments library), this script adds a few bells and whistles
(see features below) wrapped into a "single page app", a.k.a. script.

Run this script with --help to get a list of possible use-cases.

Features:

- show keyword tokens (default: "var") used in Scala code
- read code from local files or stdin
- read code from entire directories (as Unix 'find' can be hard with blanks)
- accept filename shell patterns when scanning directories
- read code after cloning it from remote GitHub repositories
- provide three output modes (raw, tokens, lines)
- highlight specified keywords in "lines" output mode
- show line numbers in output (like grep -n)
- exit with status 1 if any results were found (to make it CI/CD-friendly)
- include a test case (?) (pick a gist of mine by default?)
- silently pip-install missing dependencies
- packaged as a "single page app", a.k.a. stand-alone script
- works on Python 2.7+ and 3.5+

Dependencies:

- termcolor
- pygments
- pytest (optional, only for testing)
- git (optional, only for cloning repos from GitHub)

TODO:

- run test functions also without py.test with some --selftest option (easy)
- use --value multiple times in one go (easy)
- use --basename-pattern multiple times in one go (easy)
- test on a git project hosted on bitbucket.org (harder)
- create a generator yielding a sequence of arbitrary tokens (here those
  for an entire code line) (harder)
"""

from __future__ import print_function

import re
import os
import sys
import shutil
import fnmatch
import argparse
import subprocess
import textwrap

# pip is pre-installed on Python 2.7.9+ and Python 3.4+
try:
    import pip
except ImportError:
    url = "https://pip.pypa.io/en/stable/installing/"
    print("Pip not found, please install it, see " + url)
    sys.exit(1)

# Note: pip.main() was removed in pip 10, so this self-installing trick only
# works with older pip versions; with a newer pip, install the dependencies
# manually (pip install termcolor pygments).
try:
    import termcolor
except ImportError:
    try:
        cmd = ["install", "termcolor"]
        if __name__ == "__main__":
            cmd.insert(1, "--quiet")
        pip.main(cmd)
        import termcolor
    except SystemExit:
        raise

try:
    from pygments import lex
    from pygments.lexers import ScalaLexer
except ImportError:
    try:
        cmd = ["install", "pygments"]
        if __name__ == "__main__":
            cmd.insert(1, "--quiet")
        pip.main(cmd)
        from pygments import lex
        from pygments.lexers import ScalaLexer
    except SystemExit:
        raise

__author__ = "Dinu Gherman"
__version__ = "0.7"
__license__ = "GPL 3"


def generate_files_in_dir(root_path, patterns):
    """
    Generate all files under root_path matching one of the given patterns.
    """
    for dir_name, subdir_list, file_list in os.walk(root_path):
        for fname in file_list:
            for pat in patterns:
                if fnmatch.fnmatchcase(fname, pat):
                    full_path = os.path.join(dir_name, fname)
                    yield full_path
                    # stop after the first matching pattern so each file
                    # is yielded at most once
                    break
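
# Illustrative call of the generator above (paths are made up):
#
#   >>> list(generate_files_in_dir("src/main", ["*.scala"]))
#   ['src/main/scala/Foo.scala', 'src/main/scala/Bar.scala']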


def read_code(file_object_path):
    """
    Read code from a file object or path or stdin.
    """
    if hasattr(file_object_path, "read"):
        return file_object_path.read()
    elif file_object_path == '-':
        return sys.stdin.read()
    elif not os.path.exists(file_object_path) and __name__ == "__main__":
        print("File not found: {}".format(file_object_path))
        sys.exit(1)
    else:
        # raises an exception when used programmatically
        return open(file_object_path).read()


def show_raw_tokens(path):
    """
    Show all tokens, one per output line.
    This is most useful for debugging purposes.
    """
    code = read_code(path)
    for name, val in lex(code, ScalaLexer()):
        out_line = "{} {}".format(name, repr(val))
        print(out_line)


def get_single_tokens(path, value="var"):
    """
    Generate all keyword tokens with specified value in some Scala file.
    """
    line_num = 1
    code = read_code(path)
    for name, val in lex(code, ScalaLexer()):
        name = str(name)
        if name == "Token.Keyword":
            if val == value:
                yield (line_num, name, repr(val))
        if name == "Token.Text":
            if re.match("^\n+$", val):
                line_num += val.count('\n')
        elif name == "Token.Comment.Single" and val.endswith('\n'):
            line_num += 1
        elif name == "Token.Comment.Multiline" and '\n' in val:
            line_num += val.count('\n')
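
# Each item yielded above is a (line_number, token_name, repr(token_value))
# triple, e.g. (4, 'Token.Keyword', "'var'") for a 'var' on line 4 of the
# input; see test_get_single_tokens() below for real expected values.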


def show_single_tokens(path, line_num, name, value,
                       show_path=True, show_token_lines_numbers=False):
    """
    Output token line number and respective token name and value.
    """
    out_line = ''
    if show_path:
        out_line += "{}:".format(path)
    if show_token_lines_numbers:
        out_line += "{}:".format(line_num)
    out_line += "{} {}".format(name, value)
    print(out_line)


def get_token_lines(path, value="var", highlight=False):
    """
    Yield all lines containing the desired tokens in Scala code by name and/or value.
    """
    line_num = 1
    last_line = []
    code = read_code(path)
    for name, val in lex(code, ScalaLexer()):
        name = str(name)
        last_line.append((name, val))
        if '\n' in val:
            if name in ("Token.Text", "Token.Comment.Single", "Token.Comment.Multiline"):
                if last_line:
                    if ("Token.Keyword", value) in last_line:
                        yield (line_num, last_line)
                    last_line = []
            line_num += val.count('\n')
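
# Each item yielded above pairs a line number with the list of
# (token_name, value) tuples making up that source line, roughly like
# this (values are made up for illustration):
#
#   (12, [('Token.Keyword', 'var'), ('Token.Text', ' '),
#         ('Token.Name', 'counter'), ...])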


def show_token_lines(path, value, linenum_tokens,
                     highlight=True, show_path=True, show_token_lines_numbers=False):
    """
    Output one line with all its tokens.
    """
    line_num, tokens = linenum_tokens
    out_line = ''
    if show_path:
        out_line += '{}:'.format(path)
    if show_token_lines_numbers:
        out_line += '{}:'.format(line_num)
    line = ''
    for ca, val in tokens:
        cval = re.sub("\n+$", "\n", val)
        if val == value and highlight:
            cval = termcolor.colored(val, "red", attrs=["bold"])
        line += cval
    out_line += line
    print(out_line.strip() + '\n', end='')


def clone_github_repo(url):
    """
    Clone remote GitHub repository.
    """
    # extract information
    owner, project = None, None
    pat = r"(https?://github\.com)/(\w+)/(\w+)\.git"
    m = re.search(pat, url)
    if m:
        prefix, owner, project = m.groups()
    else:
        pat = r"(gh):(\w+)/(\w+)"
        m = re.search(pat, url)
        if m:
            prefix, owner, project = m.groups()
    if not (owner and project):
        print("Cannot handle {}.".format(url))
        return

    # if we do not have git stop here
    if prefix not in ("gh", "http://github.com", "https://github.com"):
        print("Can handle only GitHub repos right now...")
        return
    else:
        try:
            subprocess.check_call("git --version".split())
        except (OSError, subprocess.CalledProcessError):
            # OSError covers a missing git binary
            print("Could not find git, ignored.")
            return

    # create directories
    hoster = 'github'
    cwd = os.getcwd()
    try:
        os.mkdir(hoster)
    except OSError:
        # OSError, since FileExistsError does not exist on Python 2.7
        pass
    os.chdir(hoster)
    try:
        if os.path.exists(owner):
            shutil.rmtree(owner)
        os.mkdir(owner)
    except OSError:
        pass
    os.chdir(owner)

    # clone github repo
    cmd = "git clone https://github.com/{}/{}.git".format(owner, project)
    subprocess.check_call(cmd.split())
    os.chdir(cwd)
    return os.path.join(hoster, owner, project)
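
# Example: clone_github_repo("gh:sbt/sbt") or
# clone_github_repo("https://github.com/sbt/sbt.git") clones into
# ./github/sbt/sbt and returns the relative path "github/sbt/sbt".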


def process_file(path, opts):
    """
    Process a real source file.
    Return the number of items found (tokens or lines).
    """
    assert os.path.exists(path) and os.path.isfile(path)

    if opts.mode == "raw":
        show_raw_tokens(path)
        return 0
    elif opts.mode == "tokens":
        num_items = 0
        for line_num, name, value in get_single_tokens(path, value=opts.value):
            show_single_tokens(path, line_num, name, value,
                               show_token_lines_numbers=opts.line_numbers)
            num_items += 1
        return num_items
    elif opts.mode == "lines":
        num_items = 0
        for linenum_tokens in get_token_lines(path, value=opts.value):
            show_token_lines(path, opts.value, linenum_tokens,
                             highlight=not opts.no_highlight,
                             show_token_lines_numbers=opts.line_numbers)
            num_items += 1
        return num_items


# Functions named test_* are executed by py.test

def test_get_single_tokens():
    """
    Test getting single tokens from a known source.
    """
    res = list(get_single_tokens("test.scala", value="def"))
    exp = [(4, 'Token.Keyword', "'def'"),
           (13, 'Token.Keyword', "'def'"),
           (20, 'Token.Keyword', "'def'")]
    assert res == exp


def test_get_token_lines():
    """
    Test getting single lines from a known source.
    """
    res = list(get_token_lines("test.scala", value="def"))
    exp = '???'
    assert res == exp


def _main():
    """
    Command-line interface.
    """
    desc = "Spot single tokens (like the 'var' keyword) in Scala source code files."
    epilog = textwrap.dedent("""\
    example usage:

    Show one-line description for this tool:
    \x1b[1m {script_name} --describe \x1b[0m
    Show one-line version of this tool:
    \x1b[1m {script_name} --version \x1b[0m
    Run on one file looking for 'var' keyword values (default):
    \x1b[1m {script_name} test.scala \x1b[0m
    Run on one file looking for 'trait' keyword values:
    \x1b[1m {script_name} --value trait test.scala \x1b[0m
    Run on standard input:
    \x1b[1m cat test.scala | {script_name} - \x1b[0m
    Run on files passed via Unix 'find':
    \x1b[1m {script_name} $(find . -name "*.scala") \x1b[0m
    Run on files taken from a whole directory (recursively):
    \x1b[1m {script_name} path/to/dir \x1b[0m
    Run on files from a whole directory, specifying a shell pattern for their basenames:
    \x1b[1m {script_name} -p "*.scala" path/to/dir \x1b[0m
    Run on a project after cloning it from GitHub:
    \x1b[1m {script_name} https://github.com/sbt/sbt.git \x1b[0m
    Run on a GitHub project with a shortcut:
    \x1b[1m {script_name} gh:sbt/sbt \x1b[0m
    Run on file(s), showing line numbers for output lines:
    \x1b[1m {script_name} -n test1.scala test2.scala \x1b[0m
    Run with a different output mode (raw, tokens, lines) and line numbers:
    \x1b[1m {script_name} --mode lines -n test1.scala \x1b[0m
    Run the self-test (on the real path of this script):
    \x1b[1m py.test -v {script_path} \x1b[0m

    Copyright 2017, {author}""").format(**dict(
        script_name=os.path.basename(os.path.normpath(sys.argv[0])),
        script_path=os.path.normpath(sys.argv[0]),
        author=__author__)
    )

    parser = argparse.ArgumentParser(
        description=desc,
        epilog=epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("path",
        nargs='*',
        help="""Either a path of a local Scala source file (or '-' for stdin),
            or a directory (which will be recursively scanned for files matching
            --name-pattern), or a URL of a GitHub repository, e.g.
            https://github.com/sbt/sbt.git (or gh:sbt/sbt), which will be git-cloned
            locally and then scanned like a folder.""")

    parser.add_argument('--version',
        action='version',
        help="Show program's version number and exit.",
        version='{} {}'.format(os.path.basename(sys.argv[0]), __version__))

    parser.add_argument("--describe",
        action="store_true",
        help="""Describe in one line what this tool does. Ignores everything else.""")

    parser.add_argument("-p", "--name-pattern",
        metavar="PATTERN",
        action='append',
        default=[],
        help="""Filename pattern (shell) to be used when running recursively
            on directories (default: "*.scala"). Mainly helpful when Unix
            'find' fails because of blanks.""")

    parser.add_argument("-v", "--value",
        metavar="NAME",
        default="var",
        help="Value of the token (default: var).")

    parser.add_argument("--no-highlight",
        action="store_true",
        help='Do not highlight token values in output (only in "lines" mode).')

    parser.add_argument("--mode",
        default="lines",
        choices=["raw", "tokens", "lines"],
        help="""Output mode: "raw" to show all tokens as a raw sequence,
            "tokens" to show one token per output line, or "lines" to show
            the entire source code line per output line (default).""")

    parser.add_argument("-n", "--line-numbers",
        action="store_true",
        help="""Precede each output line with its line number in the file,
            starting at line 1. The line number counter is reset after each
            processed file.""")

    opts = parser.parse_args()

    if opts.describe:
        print(desc)
        sys.exit(0)

    # set default filename shell pattern if not specified
    if not opts.name_pattern:
        opts.name_pattern = ["*.scala"]

    # iterate over input paths / dirs / URLs, counting the number of items found
    num_found = 0
    for path in opts.path:
        if os.path.exists(path):
            if os.path.isfile(path):
                num_found += process_file(path, opts)
            elif os.path.isdir(path):
                for p in generate_files_in_dir(path, opts.name_pattern):
                    num_found += process_file(p, opts)
        elif re.match(r"gh:\w+/\w+|https?://github\.com/\w+/\w+\.git", path):
            dest_dir = clone_github_repo(path)
            if dest_dir:
                for p in generate_files_in_dir(dest_dir, opts.name_pattern):
                    num_found += process_file(p, opts)
        else:
            raise IOError("Path argument '{}' not found".format(path))

    sys.exit(1 if num_found > 0 else 0)


if __name__ == "__main__":
    _main()
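
# A minimal sketch of using the exit status in a CI step (the shell line and
# the script name are illustrative, not part of this gist):
#
#   $ python scala_varning.py --value var src/ || echo "found var declarations"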