Created
January 8, 2012 22:34
-
-
Save wesm/1579958 to your computer and use it in GitHub Desktop.
Count all function lengths under a directory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pandas import DataFrame | |
from pandas.util.testing import set_trace | |
import os | |
import numpy as np | |
import matplotlib.pyplot as plt | |
# Parallel accumulators filled by doit(): entry i of each list describes the
# same file (containing directory, file name, number of lines).
dirs, names, lengths = [], [], []

# Directory walk over the 'pandas' tree (relative to the working directory),
# shared by doit() and doit2().
walked = os.walk('pandas')
def _should_count_file(path): | |
return path.endswith('.py') or path.endswith('.pyx') | |
def _is_def_line(line): | |
return (line.endswith(':') and | |
(line.startswith('def ') or | |
line.startswith('cdef ') or | |
line.startswith('cpdef ') or | |
' def ' in line or ' cdef ' in line or ' cpdef ' in line)) | |
class LengthCounter(object):
    """
    Count the length, in source lines, of every function in a list of lines.

    ``lines`` is a sequence of source lines with trailing whitespace already
    removed.  A function's length is the number of lines after its ``def``
    line up to (not including) the first non-blank, non-``#`` line that is
    indented less than the body, with trailing blank lines dropped.

    Comment and docstring lines are counted, and the lines of a nested
    function also count toward the function that encloses it.

    TODO: add an option for subtracting nested function lengths?
    """

    def __init__(self, lines):
        self.lines = lines
        self.pos = 0        # index of the next line to examine
        self.counts = []    # one length per function, appended as each ends
        self.n = len(lines)

    def get_counts(self):
        """Scan every line and return the list of function lengths.

        A nested function is reported before the function enclosing it,
        because its body ends first.
        """
        self.pos = 0
        self.counts = []

        while self.pos < self.n:
            line = self.lines[self.pos]
            self.pos += 1
            if _is_def_line(line):
                self._count_function(indent_level=_get_indent_level(line))

        return self.counts

    @staticmethod
    def _ends_function(line, indent):
        """Return True when *line* is the first line past a function body.

        Blank lines, lines starting with ``#`` and lines starting with
        *indent* are all treated as still belonging to the function.
        """
        return (line != '' and
                not line.startswith(indent) and
                not line.startswith('#'))

    def _count_function(self, indent_level=1):
        """Consume the body of the function whose ``def`` line was just read.

        ``indent_level`` is the ``_get_indent_level`` value of that ``def``
        line; body lines are those starting with that many spaces.  On
        return ``self.pos`` points at the first line after the body, which
        the caller examines again.
        """
        indent = ' ' * indent_level
        start_pos = self.pos

        while self.pos < self.n:
            line = self.lines[self.pos]
            if self._ends_function(line, indent):
                break
            self.pos += 1
            if _is_def_line(line):
                # Measure the nested def's own indentation instead of
                # assuming "parent + 1": with a one-space indent unit the
                # old value let a nested function run to the end of its
                # parent, over-counting it.
                self._count_function(indent_level=_get_indent_level(line))

        # reached either the end of the body or the end of the file
        self._push_count(start_pos)

    def _push_count(self, start_pos):
        """Record the length of the function spanning lines[start_pos:pos].

        Trailing blank lines are not counted.  Comment and docstring lines
        are: the previous filtering pass here never contributed to the
        result, so the recorded number has always been the raw line count.
        """
        end = self.pos
        while end > start_pos and self.lines[end - 1] == '':
            end -= 1
        self.counts.append(end - start_pos)
def _get_indent_level(line): | |
level = 0 | |
while line.startswith(' ' * level): | |
level += 1 | |
return level | |
def _is_triplequote(line): | |
return line.startswith('"""') or line.startswith("'''") | |
def _get_file_function_lengths(path):
    """Return the list of function lengths found in the file at *path*.

    The file is read in full, trailing whitespace is stripped from every
    line, and the lines are handed to ``LengthCounter``.
    """
    # The context manager closes the handle as soon as the lines are read;
    # a bare open(path).readlines() leaves that to the garbage collector.
    with open(path) as handle:
        lines = [raw.rstrip() for raw in handle]

    return LengthCounter(lines).get_counts()
# Inline self-check, run at module level: feeds a small embedded snippet to
# LengthCounter and asserts on the lengths it reports.  The commented-out
# header below suggests it was once meant to be a test function.
# NOTE(review): the snippet's indentation and blank lines are part of the
# string literal and the expected counts depend on them -- confirm the
# literal against the original file before relying on this check.
# def test_get_function_lengths(): | |
text = """ | |
class Foo: | |
def foo(): | |
def bar(): | |
a = 1 | |
b = 2 | |
c = 3 | |
foo = 'bar' | |
def x(): | |
a = 1 | |
b = 3 | |
c = 7 | |
pass | |
""" | |
# Lengths expected from get_counts(), compared as an ordered list.
expected = [5, 8, 7] | |
# Same preprocessing as _get_file_function_lengths: strip trailing whitespace.
lines = [x.rstrip() for x in text.splitlines()] | |
counter = LengthCounter(lines) | |
result = counter.get_counts() | |
assert(result == expected) | |
def doit():
    """Tabulate the line count of every .py/.pyx file found by the walk.

    Iterates the module-level ``walked`` iterator, printing each directory
    and each counted file path, and appends one entry per file to the
    module-level ``dirs``, ``names`` and ``lengths`` lists.

    Returns
    -------
    DataFrame
        Columns 'dirs', 'names' and 'lengths', built from those lists.
        (Previously the frame was constructed and then discarded.)
    """
    for directory, _, files in walked:
        print(directory)
        for path in files:
            if not _should_count_file(path):
                continue

            full_path = os.path.join(directory, path)
            print(full_path)

            # Count lines while streaming, and close the handle afterwards
            # instead of leaking it.
            with open(full_path) as handle:
                n_lines = sum(1 for line in handle)

            dirs.append(directory)
            names.append(path)
            lengths.append(n_lines)

    result = DataFrame({'dirs': dirs, 'names': names,
                        'lengths': lengths})
    return result
def doit2():
    """Collect function lengths for every non-test source file in the walk.

    Iterates the module-level ``walked`` iterator, printing each directory.
    Files that are not .py/.pyx, or whose name starts with 'test_', are
    skipped.

    Returns a dict mapping each file's joined path to the list returned by
    ``_get_file_function_lengths`` for that file.
    """
    per_file = {}
    for directory, _, files in walked:
        print(directory)
        countable = [fname for fname in files
                     if _should_count_file(fname)
                     and not fname.startswith('test_')]
        for fname in countable:
            full_path = os.path.join(directory, fname)
            per_file[full_path] = _get_file_function_lengths(full_path)
    return per_file
# Gather per-file function lengths for the whole tree.
counts = doit2()
# Single-file alternative kept for debugging; it returns a plain list rather
# than a dict, so the flattening loop below would need adjusting:
# counts = _get_file_function_lengths('pandas/tests/test_series.py')

# Flatten the per-file lists into one array of function lengths.
all_counts = []
for file_counts in counts.values():
    all_counts.extend(file_counts)
all_counts = np.array(all_counts)

fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(111)
ax.hist(all_counts, bins=100)

n = len(all_counts)
nmore = (all_counts > 50).sum()
# Percentage of functions longer than 50 lines.  The label used to show
# (n - nmore) / n, i.e. the share at or below 50 lines, as a bare fraction,
# which contradicted its own text; it also divided by zero when no
# functions were found.
pct_more = 100.0 * nmore / n if n else 0.0

ax.set_title('pandas function lengths, n=%d' % n)
ax.set_ylabel('N functions')
ax.set_xlabel('Function length')
ax.text(100, 300, '%.3f%% with > 50 lines' % pct_more,
        fontsize=18)

plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment