Last active
August 6, 2016 17:45
-
-
Save wmvanvliet/de8d322773cd79848e486d56d6590835 to your computer and use it in GitHub Desktop.
Script that extracts all docstrings from a Python module, saves them to a file, and then compiles a list of possible duplicates based on Levenshtein distance.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function

import inspect
import os.path as op

import distance
import mne
import numpy as np
import progressbar as pb
import progressbar.widgets as pw

try:
    from scipy.special import comb
except ImportError:  # fall back to the historical location
    from scipy.misc import comb
# Extract docstrings from the python files.
# Maps a label (built by the caller) to the docstring text found for it.
docstrings = dict()


def add_docstring(prefix, f):
    """Store the docstring of ``f`` in the module-level ``docstrings`` dict.

    Parameters
    ----------
    prefix : str
        Key under which the docstring is stored. An existing entry with the
        same key is overwritten.
    f : object
        Object whose docstring is retrieved with ``inspect.getdoc``.

    Notes
    -----
    Objects without a docstring are recorded with the placeholder string
    ``'NONE'`` so that they still show up in the output.
    """
    docstring = inspect.getdoc(f)
    docstrings[prefix] = 'NONE' if docstring is None else docstring
def is_interesting(node):
    """Return True if ``node`` is an object worth inspecting for docstrings.

    An object qualifies when it is a function, a generator function, a
    method, a class or a module, as determined by the corresponding
    ``inspect.is*`` predicates.

    Parameters
    ----------
    node : object
        Any Python object.

    Returns
    -------
    bool
        Whether at least one of the predicates accepts ``node``.
    """
    predicates = (
        inspect.isfunction,
        inspect.isgeneratorfunction,
        inspect.ismethod,
        inspect.isclass,
        inspect.ismodule,
    )
    return any(predicate(node) for predicate in predicates)
# Keys of the form "<file>:<name>" for every node parse() has already walked;
# prevents visiting the same node twice.
parsed_nodes = set()


def parse(name, node):
    """Recursively collect docstrings of the members of ``node``.

    Only objects whose source file lies under the directory of the ``mne``
    package are considered. Each accepted member is registered through
    ``add_docstring`` under the label ``"<member> (<file>)"``; methods are
    additionally prefixed with ``"<name>."``. Classes and modules are
    registered and then walked recursively.

    Parameters
    ----------
    name : str
        Name of ``node``; used in the visited-key and as method prefix.
    node : object
        Module or class whose members are inspected.
    """
    package_dir = op.dirname(mne.__file__)

    try:
        members = inspect.getmembers(node, is_interesting)
        node_file = inspect.getfile(node)
    except Exception:
        # Introspection is best effort: objects that cannot be listed or
        # have no source file are skipped. ``Exception`` (rather than a bare
        # ``except``) lets KeyboardInterrupt/SystemExit propagate.
        return

    if not node_file.startswith(package_dir):
        return

    key = node_file + ':' + name
    if key in parsed_nodes:
        return
    parsed_nodes.add(key)

    for member_name, member in members:
        try:
            member_file = inspect.getfile(member)
        except Exception:
            continue
        if not member_file.startswith(package_dir):
            continue

        label = member_name + ' (' + member_file + ')'
        if inspect.isfunction(member) or inspect.isgeneratorfunction(member):
            add_docstring(label, member)
        elif inspect.ismethod(member):
            # NOTE: on Python 3, functions looked up on a class are plain
            # functions, so they are handled by the branch above and only
            # bound methods reach this one.
            add_docstring(name + '.' + label, member)
        elif inspect.isclass(member) or inspect.ismodule(member):
            add_docstring(label, member)
            parse(member_name, member)
parse('mne', mne)

# Make a list of (label, docstring) pairs, sorted by label
docstrings = sorted(docstrings.items(), key=lambda item: item[0])

# Save all the extracted docstrings
with open('docstrings.txt', 'w') as f:
    f.writelines(label + ' ' + doc + '\n' for label, doc in docstrings)

# Remove whitespace from docstrings, so that formatting differences do not
# count as edits
docstrings = [(label, ''.join(doc.split())) for label, doc in docstrings]

# Filter the docstrings by length: only long ones are compared
docstrings = [d for d in docstrings if len(d[1]) > 100]

# Number of unordered pairs to compare ("n choose 2"), computed exactly with
# integer arithmetic so no SciPy helper is needed.
n_docstrings = len(docstrings)
n_pairs = n_docstrings * (n_docstrings - 1) // 2

# Construct a nice progress bar
pbar = pb.ProgressBar(
    maxval=n_pairs,
    widgets=[
        pw.Percentage(),
        '|',
        pw.ETA(),
        '|',
        'current function',  # placeholder, replaced in the loop below
    ],
)

# Compare all long docstrings (takes time...).
# scores[i, j] holds the normalised distance for i < j; every other entry
# keeps the initial value -1, meaning "not compared".
scores = -1 * np.ones((len(docstrings), len(docstrings)), dtype=float)
pbar = pbar.start()
for i, (p1, d1) in enumerate(docstrings[:-1]):
    pbar.widgets[-1] = p1  # Note current function in the progress bar
    for j, (_, d2) in enumerate(docstrings[i + 1:], i + 1):
        scores[i, j] = distance.levenshtein(d1, d2, max_dist=500)
        if scores[i, j] == -1:
            # A result of -1 is treated as "maximally different"
            scores[i, j] = 1.0
        else:
            # Normalise by the longer docstring, capped at the max_dist
            # that was passed to levenshtein()
            scores[i, j] /= min(max(len(d1), len(d2)), 500)
        pbar.update(pbar.currval + 1)
pbar.finish()

# Find candidate duplicate docstrings: compared pairs with a low score
similar_docstrings = list(
    zip(*np.nonzero(np.logical_and(scores >= 0, scores < 0.25)))
)

# Print them
for i1, i2 in similar_docstrings:
    print(scores[i1, i2], docstrings[i1][0], docstrings[i2][0])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Cool script, thanks for sharing :)