-
-
Save minrk/1580880 to your computer and use it in GitHub Desktop.
Count all function lengths under a directory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from pandas import DataFrame | |
from pandas.util.testing import set_trace | |
# Parallel accumulators filled by doit(): one entry per counted source file.
dirs, names, lengths = [], [], []

# Root directory to scan: first command-line argument, else the current one.
loc = sys.argv[1] if len(sys.argv) > 1 else '.'

# NOTE: os.walk returns a generator, so this can be iterated only once.
walked = os.walk(loc)
def _should_count_file(path):
    """Return True for Python (.py) and Cython (.pyx) source file names."""
    # str.endswith accepts a tuple of suffixes and returns a plain bool.
    return path.endswith(('.py', '.pyx'))
def _is_def_line(line):
    """Return True if ``line`` opens a def/cdef/cpdef block (not `cdef class`).

    ``line`` is expected to have its trailing whitespace already removed.
    The keyword must be the first thing on the line after any leading
    spaces, so a comment or string that merely contains the word ``def``
    (for example ``# def old():``) is not mistaken for a definition.
    """
    if not line.endswith(':'):
        return False
    # `cdef class Foo:` opens a class, not a function.
    if 'class' in line.split():
        return False
    # Only spaces are stripped: indentation levels elsewhere are measured
    # in spaces, so tab-indented lines stay unrecognised as before.
    body = line.lstrip(' ')
    return body.startswith(('def ', 'cdef ', 'cpdef ', 'async def '))
class LengthCounter(object):
    """Measure the length, in lines, of every function in a source listing.

    ``lines`` is a list of source lines with trailing whitespace already
    stripped.  A function starts on the line after one accepted by
    ``_is_def_line`` and runs until the first non-blank line that is
    neither indented past the ``def`` nor a column-0 ``#`` comment.
    Comment lines, docstring lines and trailing blank lines are not
    counted; blank lines inside the body are.

    Nested functions are reported on their own and are also included in
    the length of the enclosing function.

    TODO: should add option for subtracting nested function lengths??
    """

    def __init__(self, lines):
        self.lines = lines
        self.pos = 0        # index of the next line to examine
        self.counts = []    # one length per function found so far
        self.n = len(lines)

    def get_counts(self):
        """Scan all lines and return the list of function lengths.

        Lengths appear in the order the functions end, so a nested
        function precedes the function that encloses it.
        """
        self.pos = 0
        self.counts = []
        while self.pos < self.n:
            line = self.lines[self.pos]
            self.pos += 1
            if _is_def_line(line):
                self._count_function(indent_level=_get_indent_level(line))
        return self.counts

    def _count_function(self, indent_level=1):
        """Consume the body that starts at ``self.pos`` and record its length.

        ``indent_level`` is the minimum number of leading spaces a line
        needs in order to belong to the body.
        """
        indent = ' ' * indent_level
        start_pos = self.pos
        while self.pos < self.n:
            line = self.lines[self.pos]
            # Blank lines and column-0 comments never terminate a body.
            if line and not line.startswith(indent) and not line.startswith('#'):
                break
            self.pos += 1
            if _is_def_line(line):
                # Measure the nested def's own indentation; assuming
                # "outer + 1" would let the nested body swallow the rest
                # of the enclosing function.
                self._count_function(indent_level=_get_indent_level(line))
        # Reached either the first line past the body or end of file.
        self._push_count(start_pos)

    def _push_count(self, start_pos):
        """Append the length of ``lines[start_pos:self.pos]`` to ``counts``.

        Comment lines and docstring lines are excluded, as are blank
        lines left at the end once those have been removed.
        """
        func_lines = self.lines[start_pos:self.pos]

        clean_lines = []
        closing = None  # delimiter of the docstring currently open, if any
        for line in func_lines:
            stripped = line.strip()
            if closing is not None:
                # Inside a docstring: skip until its delimiter reappears.
                if closing in stripped:
                    closing = None
                continue
            if stripped.startswith('#'):
                continue
            if _is_triplequote(stripped):
                delimiter = stripped[:3]
                # A one-line docstring closes on the line that opens it.
                if delimiter not in stripped[3:]:
                    closing = delimiter
                continue
            clean_lines.append(line)

        # remove blank lines at end
        while clean_lines and clean_lines[-1] == '':
            clean_lines.pop()

        self.counts.append(len(clean_lines))
def _get_indent_level(line):
    """Return the number of leading spaces in ``line`` plus one.

    The result is the smallest space-indentation a following line must
    have to be nested inside ``line``.  Only spaces are counted, not tabs.
    """
    leading_spaces = len(line) - len(line.lstrip(' '))
    return leading_spaces + 1
def _is_triplequote(line):
    """Return True if ``line`` begins with a triple-quote delimiter."""
    return line[:3] in ('"""', "'''")
def _get_file_function_lengths(path):
    """Return the list of function lengths found in the file at ``path``.

    Each line has its trailing whitespace stripped before being handed
    to ``LengthCounter``.
    """
    # Use a context manager so the handle is closed even if reading fails.
    with open(path) as handle:
        lines = [raw.rstrip() for raw in handle]
    return LengthCounter(lines).get_counts()
def test_get_function_lengths():
    """Check LengthCounter on a small fixture with a nested function.

    Expected lengths are listed in the order the functions end:
    ``bar`` (nested), then ``foo`` (which includes ``bar``), then ``x``.
    """
    # Blank lines between statements are deliberate: blank lines inside a
    # body count towards its length, blank lines after it do not.
    text = """
class Foo:

    def foo():
        def bar():
            a = 1

            b = 2

            c = 3

        foo = 'bar'

    def x():
        a = 1

        b = 3

        c = 7

        pass
"""
    expected = [5, 8, 7]

    lines = [x.rstrip() for x in text.splitlines()]
    counter = LengthCounter(lines)
    result = counter.get_counts()
    assert result == expected
def doit():
    """Record the total line count of every .py/.pyx file under ``walked``.

    Prints each directory and each counted file path as it goes, and
    appends to the module-level ``dirs``, ``names`` and ``lengths`` lists.

    Returns a DataFrame with columns 'dirs', 'names' and 'lengths'.

    NOTE: ``walked`` is a generator, so it yields nothing if it has
    already been consumed by an earlier call.
    """
    for directory, _, files in walked:
        # Parenthesised single-argument print works on Python 2 and 3.
        print(directory)
        for path in files:
            if not _should_count_file(path):
                continue

            full_path = os.path.join(directory, path)
            print(full_path)
            # Context manager closes the handle instead of leaking it.
            with open(full_path) as handle:
                line_count = len(handle.readlines())

            dirs.append(directory)
            names.append(path)
            lengths.append(line_count)

    # Return the table instead of building it and throwing it away.
    return DataFrame({'dirs': dirs, 'names': names,
                      'lengths': lengths})
def doit2():
    """Collect per-function lengths for every source file under ``walked``.

    Files whose name starts with ``test_`` are skipped, as are files that
    ``_should_count_file`` rejects.  Each visited directory is printed.

    Returns a dict mapping each file's full path to its list of function
    lengths.

    NOTE: ``walked`` is a generator, so it yields nothing if it has
    already been consumed by an earlier call.
    """
    counts = {}
    for directory, _, files in walked:
        # Parenthesised single-argument print works on Python 2 and 3.
        print(directory)
        for path in files:
            if not _should_count_file(path) or path.startswith('test_'):
                continue

            full_path = os.path.join(directory, path)
            counts[full_path] = _get_file_function_lengths(full_path)

    return counts
counts = doit2()
# To inspect a single file instead:
# counts = _get_file_function_lengths('pandas/tests/test_series.py')

# Flatten the per-file lists into one array of function lengths.
all_counts = []
for file_counts in counts.values():  # .values() exists on Python 2 and 3
    all_counts.extend(file_counts)
all_counts = np.array(all_counts)

fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(111)
ax.hist(all_counts, bins=100)

n = len(all_counts)
nmore = (all_counts > 50).sum()
# Share of functions longer than 50 lines, as a percentage.  Guard the
# division so an empty scan does not raise ZeroDivisionError.
pct_more = 100.0 * nmore / n if n else 0.0

ax.set_title('%s function lengths, n=%d' % (loc, n))
ax.set_ylabel('N functions')
ax.set_xlabel('Function length')
ax.text(100, 300, '%.3f%% with > 50 lines' % pct_more,
        fontsize=18)
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment