Skip to content

Instantly share code, notes, and snippets.

@deeplook
Created September 28, 2017 09:22
Show Gist options
  • Select an option

  • Save deeplook/23f666e73ffe9c41ba91ae4fbbf4d094 to your computer and use it in GitHub Desktop.

Select an option

Save deeplook/23f666e73ffe9c41ba91ae4fbbf4d094 to your computer and use it in GitHub Desktop.
Spot single tokens (like the 'var' keyword) in Scala source code files.
#!/usr/bin/env python
"""
List keyword tokens in Scala source code.

Originally intended for spotting lines containing "var" declarations
in Scala source files (hence the name "scala varning") which are
somewhat considered "harmful" in Scala. The same can be useful for
spotting explicit "return" statements which might be superfluous,
and, of course, other keywords.

This tool is written in Python and does not run Scala, but the gentle
user/reader might feel inspired to write the same in Scala using some
AST functionality. Good luck!

Since the basic functionality here is fairly simple (thanks to the
excellent Pygments library) this script adds a few bells and whistles
(see features below) wrapped into a "single page app", a.k.a. script.

Run this script with --help to get a list of possible use-cases.

Features:

- show keyword tokens (default: "var") used in Scala code
- read code from local files or stdin
- read code from entire directories (as Unix 'find' can be hard with blanks)
- accept filename shell patterns for when scanning directories
- read code after cloning it from remote GitHub repositories
- provide three output modes (raw, single, lines)
- highlight specified keywords in lines output mode
- show line numbers in output (like grep -n)
- gives an exit value 1 if any results were found (to make it CI/CD-friendly)
- include a test case (?) (pick a gist of mine by default?)
- silently pip-install missing dependencies
- packaged as a "single page app" a.k.a. stand-alone script
- works on Python 2.7+ and 3.5+

Dependencies:

- termcolor
- pygments
- pytest (optional, only for testing)
- git (optional, only for cloning repos from GitHub)

TODO:

- run test functions also without py.test with some --selftest option (easy)
- use --value multiple times in one go (easy)
- use --basename-pattern multiple times in one go (easy)
- test on a git project hosted on bitbucket.org (harder)
- create a generator yielding a sequence of arbitrary tokens (here those
  for an entire code line) (harder)
"""

from __future__ import print_function

import re
import os
import sys
import shutil
import fnmatch
import argparse
import subprocess
import textwrap

# pip is pre-installed on Python 2.7.9+ and Python 3.4+
try:
    import pip
except ImportError:
    url = "https://pip.pypa.io/en/stable/installing/"
    print("Pip not found, please install it, see " + url)
    sys.exit(1)

# Bootstrap third-party dependencies: try the import; on failure pip-install
# the package (quietly when run as a script, verbosely when imported, e.g.
# under py.test) and import again.
# NOTE(review): pip.main() was removed in pip >= 10; a subprocess call like
# [sys.executable, "-m", "pip", "install", ...] would be more robust.
try:
    import termcolor
except ImportError:
    try:
        cmd = ["install", "termcolor"]
        if __name__ == "__main__":
            cmd.insert(1, "--quiet")
        pip.main(cmd)
        import termcolor
    except SystemExit as e:
        raise

try:
    from pygments import lex
    from pygments.lexers import ScalaLexer
except ImportError:
    try:
        cmd = ["install", "pygments"]
        if __name__ == "__main__":
            cmd.insert(1, "--quiet")
        pip.main(cmd)
        from pygments import lex
        from pygments.lexers import ScalaLexer
    except SystemExit as e:
        raise

__author__ = "Dinu Gherman"
__version__ = "0.7"
__license__ = "GPL 3"
def generate_files_in_dir(root_path, patterns):
    """
    Generate all files under root_path matching one of given patterns.

    :param root_path: directory to walk recursively
    :param patterns: iterable of shell-style filename patterns,
        e.g. ["*.scala"]
    :return: generator of full paths (directory joined with filename)
    """
    for dir_name, subdir_list, file_list in os.walk(root_path):
        for fname in file_list:
            for pat in patterns:
                if fnmatch.fnmatchcase(fname, pat):
                    yield os.path.join(dir_name, fname)
                    # Stop after the first matching pattern: the original
                    # 'continue' kept testing further patterns, so a file
                    # matching several patterns was yielded multiple times.
                    break
def read_code(file_object_path):
    """
    Read code from a file object, a path, or stdin.

    :param file_object_path: an object with a .read() method, the
        string '-' (meaning: read stdin), or a filesystem path
    :return: the content as a string
    """
    if hasattr(file_object_path, "read"):
        return file_object_path.read()
    elif file_object_path == '-':
        return sys.stdin.read()
    elif not os.path.exists(file_object_path) and __name__ == "__main__":
        print("File not found: {}".format(file_object_path))
        sys.exit(1)
    else:
        # Use a context manager so the handle is closed deterministically
        # (the original leaked it until garbage collection). Raises an
        # exception on a missing path when used programmatically.
        with open(file_object_path) as code_file:
            return code_file.read()
def show_raw_tokens(path):
    """
    Print every lexer token of the given Scala source, one per line.

    Most useful for debugging the raw token stream.
    """
    source = read_code(path)
    for token_name, token_value in lex(source, ScalaLexer()):
        print("{} {}".format(token_name, repr(token_value)))
def get_single_tokens(path, value="var"):
    """
    Generate all keyword tokens with specified value in some Scala file.

    Yields (line_num, name, repr(val)) tuples for every Token.Keyword
    whose text equals *value*. Line numbers start at 1.
    """
    line_num = 1
    code = read_code(path)
    for name, val in lex(code, ScalaLexer()):
        name = str(name)
        if name == "Token.Keyword":
            if val == value:
                yield (line_num, name, repr(val))
        # Track line numbers by counting newlines in whitespace and
        # comment tokens.
        # NOTE(review): Token.Text values mixing newlines with other
        # whitespace (e.g. "\n  ") fail the ^\n+$ match and are not
        # counted — presumably the lexer emits pure-newline runs; verify.
        if name == "Token.Text":
            if re.match("^\n+$", val):
                line_num += val.count('\n')
        elif name == "Token.Comment.Single" and val.endswith('\n'):
            line_num += 1
        elif name == "Token.Comment.Multiline" and '\n' in val:
            line_num += val.count('\n')
def show_single_tokens(path, line_num, name, value,
                       show_path=True, show_token_lines_numbers=False):
    """
    Print one token as "path:lineno:name value" (grep -n style).

    The path and line-number prefixes are each optional.
    """
    parts = []
    if show_path:
        parts.append("{}:".format(path))
    if show_token_lines_numbers:
        parts.append("{}:".format(line_num))
    parts.append("{} {}".format(name, value))
    print(''.join(parts))
def get_token_lines(path, value="var", highlight=False):
    """
    Yield all lines containing the desired tokens in Scala code by name and/or value.

    Accumulates (name, val) token pairs for the current source line; when
    a newline arrives inside a whitespace or comment token, the finished
    line is yielded as (line_num, tokens) if it contains a keyword token
    equal to *value*.

    NOTE(review): the *highlight* parameter is unused here — highlighting
    happens in show_token_lines(); verify before removing it.
    """
    line_num = 1
    last_line = []
    code = read_code(path)
    for name, val in lex(code, ScalaLexer()):
        name = str(name)
        last_line.append((name, val))
        if '\n' in val:
            # Only newlines in whitespace/comment tokens terminate a line.
            if name in ("Token.Text", "Token.Comment.Single", "Token.Comment.Multiline"):
                if last_line:
                    if ("Token.Keyword", value) in last_line:
                        yield (line_num, last_line)
                    last_line = []
                line_num += val.count('\n')
def show_token_lines(path, value, linenum_tokens,
                     highlight=True, show_path=True, show_token_lines_numbers=False):
    """
    Print one source line reassembled from its tokens.

    Optionally prefixed with path and line number; token values equal to
    *value* are shown in bold red when *highlight* is true.
    """
    line_num, tokens = linenum_tokens
    pieces = []
    if show_path:
        pieces.append('{}:'.format(path))
    if show_token_lines_numbers:
        pieces.append('{}:'.format(line_num))
    for tok_name, tok_val in tokens:
        # collapse trailing newline runs to a single newline
        rendered = re.sub("\n+$", "\n", tok_val)
        if highlight and tok_val == value:
            rendered = termcolor.colored(tok_val, "red", attrs=["bold"])
        pieces.append(rendered)
    print(''.join(pieces).strip() + '\n', end='')
def clone_github_repo(url):
    """
    Clone a remote GitHub repository into ./github/<owner>/<project>.

    Accepts https://github.com/<owner>/<project>.git URLs or the
    shortcut gh:<owner>/<project>.

    :return: the relative path "github/<owner>/<project>" on success,
        or None when the URL cannot be handled or git is unavailable.
    """
    # extract owner/project information from the URL
    # NOTE(review): the scraped original contained "github.com",
    # which is a link-rewriter artifact of "github.com".
    prefix, owner, project = None, None, None
    m = re.search(r"(https?://github\.com)/(\w+)/(\w+)\.git", url)
    if m:
        prefix, owner, project = m.groups()
    else:
        m = re.search(r"(gh):(\w+)/(\w+)", url)
        if m:
            prefix, owner, project = m.groups()
    # The original tested "not owner and project" which, due to operator
    # precedence, never triggered and then read an undefined 'prefix'.
    if not (owner and project):
        print("Cannot handle {}.".format(url))
        return
    if prefix not in ("gh", "http://github.com", "https://github.com"):
        print("Can handle only GitHub repos right now...")
        return
    # if we do not have git, stop here (OSError: git binary missing)
    try:
        subprocess.check_call("git --version".split())
    except (subprocess.CalledProcessError, OSError):
        print("Could not find git, ignored.")
        return
    # create <hoster>/<owner> directories, replacing an existing clone
    hoster = 'github'
    cwd = os.getcwd()
    # OSError instead of FileExistsError: the latter does not exist on
    # the Python 2.7 this script claims to support.
    try:
        os.mkdir(hoster)
    except OSError:
        pass
    os.chdir(hoster)
    try:
        if os.path.exists(owner):
            shutil.rmtree(owner)
        os.mkdir(owner)
    except OSError:
        pass
    os.chdir(owner)
    # clone the GitHub repo, always restoring the working directory
    cmd = "git clone https://github.com/{}/{}.git".format(owner, project)
    try:
        subprocess.check_call(cmd.split())
    finally:
        os.chdir(cwd)
    return os.path.join(hoster, owner, project)
def process_file(path, opts):
    """
    Process a single real source file according to opts.mode.

    :param path: path of an existing regular file
    :param opts: parsed argparse namespace (mode, value, no_highlight,
        line_numbers)
    :return: the number of items found (tokens or lines); "raw" mode
        always reports 0.
    """
    assert os.path.exists(path) and os.path.isfile(path)
    if opts.mode == "raw":
        show_raw_tokens(path)
        return 0
    elif opts.mode == "tokens":
        # Count explicitly: the original returned "i + 1 if i > 0 else 0",
        # which reported 0 when exactly one item was found.
        count = 0
        for line_num, name, value in get_single_tokens(path, value=opts.value):
            count += 1
            show_single_tokens(path, line_num, name, value,
                               show_token_lines_numbers=opts.line_numbers)
        return count
    elif opts.mode == "lines":
        count = 0
        for linenum_tokens in get_token_lines(path, value=opts.value):
            count += 1
            show_token_lines(path, opts.value, linenum_tokens,
                             highlight=not opts.no_highlight,
                             show_token_lines_numbers=opts.line_numbers)
        return count
    # Unknown mode (argparse choices should prevent this): report nothing
    # found instead of returning None, which broke callers' arithmetic.
    return 0
# Functions named test_* are executed by py.test
def test_get_single_tokens():
    """
    Test get single tokens from known source.

    NOTE(review): depends on a local fixture file "test.scala" with
    'def' keywords on lines 4, 13 and 20 — not included here; verify
    it exists before running py.test.
    """
    res = list(get_single_tokens("test.scala", value="def"))
    exp = [(4, 'Token.Keyword', "'def'"),
           (13, 'Token.Keyword', "'def'"),
           (20, 'Token.Keyword', "'def'")]
    assert res == exp
def test_get_token_lines():
    """
    Test get single lines from known source.

    NOTE(review): the expected value '???' is a placeholder — this test
    always fails until the real expected token-line list is filled in
    (get_token_lines yields (line_num, tokens) tuples, never a string).
    """
    res = list(get_token_lines("test.scala", value="def"))
    exp = '???'
    assert res == exp
def _main():
    """
    Command-line interface.

    Parses options, then scans files, directories or GitHub repositories
    for the desired keyword tokens. Exits with status 1 if any items
    were found (to make it CI/CD-friendly), else 0.
    """
    desc = "Spot single tokens (like the 'var' keyword) in Scala source code files."
    epilog = textwrap.dedent("""\
        example usage:
        Show one-line description for this tool:
        \x1b[1m {script_name} --describe \x1b[0m
        Show one-line version of this tool:
        \x1b[1m {script_name} --version \x1b[0m
        Run on one file looking for 'var' keyword values (default):
        \x1b[1m {script_name} test.scala \x1b[0m
        Run on one file looking for 'trait' keyword values:
        \x1b[1m {script_name} --value trait test.scala \x1b[0m
        Run on standard input:
        \x1b[1m cat test.scala | {script_name} - \x1b[0m
        Run on files passed via Unix 'find':
        \x1b[1m {script_name} $(find . -name "*.scala") \x1b[0m
        Run on files taken from a whole directory (recursively):
        \x1b[1m {script_name} path/to/dir \x1b[0m
        Run on files from a whole directory, specifying a regex for their basenames:
        \x1b[1m {script_name} -p ".*\.scala" path/to/dir \x1b[0m
        Run on project after cloning it from GitHub:
        \x1b[1m {script_name} https://github.com/sbt/sbt.git \x1b[0m
        Run on GitHub project with shortcut:
        \x1b[1m {script_name} gh:sbt/sbt \x1b[0m
        Run on file(s), showing line numbers for output lines:
        \x1b[1m {script_name} -n test1.scala test2.scala \x1b[0m
        Run with different output modes (raw, tokens, lines) and line numbers:
        \x1b[1m {script_name} --mode lines -n test1.scala \x1b[0m
        Run self-test (on real path of this script)
        \x1b[1m py.test -v {script_path} \x1b[0m
        Copyright 2017, {author}""").format(**dict(
            script_name=os.path.basename(os.path.normpath(sys.argv[0])),
            script_path=os.path.normpath(sys.argv[0]),
            author=__author__)
    )
    parser = argparse.ArgumentParser(
        description=desc,
        epilog=epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("path",
        nargs='*',
        help="""Either a path (of a local Scala source file or '-' for stdin
        or a directory (which will be recursively scanned for files matching
        --name-pattern), or a URL of a GitHub repository, e.g.
        https://github.com/sbt/sbt.git (or gh:sbt/sbt), which will be git-cloned
        locally and then scanned like a folder.""")
    parser.add_argument('--version',
        action='version',
        help="Show program's version number and exit.",
        version='{} {}'.format(os.path.basename(sys.argv[0]), __version__))
    parser.add_argument("--describe",
        action="store_true",
        help="""Describe in one line what this tool does. Ignores everything else.""")
    parser.add_argument("-p", "--name-pattern",
        metavar="PATTERN",
        action='append',
        default=[],
        help="""Filename pattern (shell) to be used when running recursively
        on directories (default: "*.scala"). Mainly helpful, when Unix
        'find' fails because of blanks.""")
    parser.add_argument("-v", "--value",
        metavar="NAME",
        default="var",
        help="Value of token (default: var).")
    parser.add_argument("--no-highlight",
        action="store_true",
        help='Do not highlight token values in output (only in "lines" mode).')
    parser.add_argument("--mode",
        default="lines",
        choices=["raw", "tokens", "lines"],
        help="""Output mode ("raw" to show all tokens as a raw sequence,
        "tokens" to show one token per output line).
        Or "lines" for showing entire source code line per output line
        (default).""")
    parser.add_argument("-n", "--line-numbers",
        action="store_true",
        help="""Precede each output line with its line number in the file,
        starting at line 1. The line number counter is reset after each
        processed file.""")
    opts = parser.parse_args()
    if opts.describe:
        print(desc)
        sys.exit(0)
    # set default filename shell pattern if not specified
    if not opts.name_pattern:
        opts.name_pattern = ["*.scala"]
    # Iterate over input paths / dirs / URLs, accumulating the number of
    # items found. The original overwrote the counter per path argument,
    # so hits in all but the last argument were forgotten.
    num_found = 0
    for path in opts.path:
        if os.path.exists(path):
            if os.path.isfile(path):
                num_found += process_file(path, opts)
            elif os.path.isdir(path):
                for p in generate_files_in_dir(path, opts.name_pattern):
                    num_found += process_file(p, opts)
        elif re.match(r"gh:\w+/\w+|https?://github\.com/\w+/\w+\.git", path):
            dest_dir = clone_github_repo(path)
            # clone_github_repo returns None on failure; the original
            # then crashed inside os.walk(None).
            if dest_dir:
                for p in generate_files_in_dir(dest_dir, opts.name_pattern):
                    num_found += process_file(p, opts)
        else:
            raise IOError("Path argument '{}' not found".format(path))
    sys.exit(1 if num_found > 0 else 0)


if __name__ == "__main__":
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment