wmvanvliet · August 6, 2016 17:45 · jasmainak · Aug 4, 2016
diff --git a/checker.py b/checker.py
 from __future__ import print_function

 import os.path as op
 import inspect
 import distance
 import numpy as np
 from scipy.misc import comb
 import progressbar as pb
 import progressbar.widgets as pw

 import mne

 # Registry of extracted docstrings, keyed by the label given to add_docstring
 docstrings = {}

 def add_docstring(prefix, f):
    """Store the docstring of ``f`` in ``docstrings`` under the key ``prefix``.

    The text is obtained with ``inspect.getdoc``. When ``f`` has no
    docstring, the placeholder string ``'NONE'`` is stored instead, so every
    inspected object ends up with an entry. An existing entry for ``prefix``
    is overwritten.
    """
    text = inspect.getdoc(f)
    docstrings[prefix] = 'NONE' if text is None else text

 def is_interesting(node):
    """Tell whether ``node`` is a kind of object that can carry a docstring.

    Returns True for functions, generator functions, methods, classes and
    modules, and False for everything else. ``parse`` passes this as the
    predicate to ``inspect.getmembers``.
    """
    checks = (
        inspect.isfunction,
        inspect.isgeneratorfunction,
        inspect.ismethod,
        inspect.isclass,
        inspect.ismodule,
    )
    return any(check(node) for check in checks)

 # "file:name" keys of the nodes that parse() has already walked
 parsed_nodes = set()
 def parse(name, node):
    """Recursively collect the docstrings of everything reachable from ``node``.

    Parameters
    ----------
    name : str
        Name under which ``node`` was found; used to build labels and to
        detect nodes that were already visited.
    node : module | class
        Object whose members are inspected. Members for which
        ``is_interesting`` is true are recorded with ``add_docstring``;
        classes and modules are additionally walked recursively.

    Notes
    -----
    Only objects whose source file lives under the directory of
    ``mne.__file__`` are considered. Objects that cannot be introspected
    (``inspect.getmembers`` or ``inspect.getfile`` raising) are skipped
    silently, since this is a best-effort scan.
    """
    package_dir = op.dirname(mne.__file__)

    try:
        members = inspect.getmembers(node, is_interesting)
        source_file = inspect.getfile(node)
    except Exception:
        # Introspection failed (e.g. built-in object); skip this node, but
        # let KeyboardInterrupt / SystemExit propagate.
        return

    if not source_file.startswith(package_dir):
        return
    visit_key = source_file + ':' + name
    if visit_key in parsed_nodes:
        return
    parsed_nodes.add(visit_key)

    # On Python 3 a function looked up on a class is a plain function, not a
    # method, so ``inspect.ismethod`` alone does not identify methods.
    owner_is_class = inspect.isclass(node)

    for name_, member in members:
        try:
            member_file = inspect.getfile(member)
        except Exception:
            continue
        if not member_file.startswith(package_dir):
            continue

        location = ' (' + member_file + ')'
        is_method = inspect.ismethod(member)
        if (inspect.isfunction(member) or
                inspect.isgeneratorfunction(member) or is_method):
            # Qualify members of a class (and bound methods) with the owner
            # name so same-named methods in one file do not overwrite each
            # other in the docstring registry.
            if owner_is_class or is_method:
                label = name + '.' + name_
            else:
                label = name_
            add_docstring(label + location, member)
        elif inspect.isclass(member) or inspect.ismodule(member):
            add_docstring(name_ + location, member)
            parse(name_, member)

 # Walk the whole mne package and fill the docstring registry
 parse('mne', mne)

 # Make a sorted list of (label, docstring) pairs
 docstrings = sorted(docstrings.items(), key=lambda item: item[0])

 # Save all the extracted docstrings, one "label docstring" entry per item
 with open('docstrings.txt', 'w') as f:
    f.writelines(label + ' ' + text + '\n' for label, text in docstrings)

 # Remove all whitespace from the docstrings so that formatting differences
 # do not count towards the edit distance
 docstrings = [(label, ''.join(text.split())) for label, text in docstrings]

 # Keep only the docstrings longer than 100 characters
 docstrings = [d for d in docstrings if len(d[1]) > 100]

 # Construct a nice progress bar. The number of unordered pairs is computed
 # with integer arithmetic: this gives an exact int and does not rely on
 # scipy.misc.comb, which current SciPy releases no longer provide.
 n_docstrings = len(docstrings)
 pbar = pb.ProgressBar(
    maxval=n_docstrings * (n_docstrings - 1) // 2,
    widgets=[
        pw.Percentage(),
        '|',
        pw.ETA(),
        '|',
        'current function',  # placeholder, replaced inside the loop below
    ],
 )

 # Compare all long docstrings (takes time...). Only the upper triangle of
 # ``scores`` is filled; entries that were never compared stay at -1.
 scores = -1 * np.ones((n_docstrings, n_docstrings), dtype=float)
 pbar = pbar.start()
 for i, (p1, d1) in enumerate(docstrings[:-1]):
    pbar.widgets[-1] = p1  # Note current function in the progress bar
    for j, (_, d2) in enumerate(docstrings[i + 1:], i + 1):
        scores[i, j] = distance.levenshtein(d1, d2, max_dist=500)
        if scores[i, j] == -1:
            # -1 is treated as "distance above max_dist": maximally different
            scores[i, j] = 1.0
        else:
            # Normalise by the longer docstring, capped at max_dist
            scores[i, j] /= min(max(len(d1), len(d2)), 500)
        pbar.update(pbar.currval + 1)
 pbar.finish()

 # Find candidate duplicate docstrings: compared pairs (score >= 0) whose
 # normalised distance is below 0.25
 similar_docstrings = list(zip(*np.nonzero(np.logical_and(scores >= 0, scores < 0.25))))

 # Print them
 for i1, i2 in similar_docstrings:
    print(scores[i1, i2], docstrings[i1][0], docstrings[i2][0])
	from __future__ import print_function

	import os.path as op
	import inspect
	import distance
	import numpy as np
	from scipy.misc import comb
	import progressbar as pb
	import progressbar.widgets as pw

	import mne

	# Registry of extracted docstrings, keyed by the label given to add_docstring
	docstrings = {}

	def add_docstring(prefix, f):
	    """Store the docstring of ``f`` in ``docstrings`` under the key ``prefix``.

	    The text is obtained with ``inspect.getdoc``. When ``f`` has no
	    docstring, the placeholder string ``'NONE'`` is stored instead, so every
	    inspected object ends up with an entry. An existing entry for ``prefix``
	    is overwritten.
	    """
	    text = inspect.getdoc(f)
	    docstrings[prefix] = 'NONE' if text is None else text

	def is_interesting(node):
	    """Tell whether ``node`` is a kind of object that can carry a docstring.

	    Returns True for functions, generator functions, methods, classes and
	    modules, and False for everything else. ``parse`` passes this as the
	    predicate to ``inspect.getmembers``.
	    """
	    checks = (
	        inspect.isfunction,
	        inspect.isgeneratorfunction,
	        inspect.ismethod,
	        inspect.isclass,
	        inspect.ismodule,
	    )
	    return any(check(node) for check in checks)

	# "file:name" keys of the nodes that parse() has already walked
	parsed_nodes = set()
	def parse(name, node):
	    """Recursively collect the docstrings of everything reachable from ``node``.

	    Parameters
	    ----------
	    name : str
	        Name under which ``node`` was found; used to build labels and to
	        detect nodes that were already visited.
	    node : module | class
	        Object whose members are inspected. Members for which
	        ``is_interesting`` is true are recorded with ``add_docstring``;
	        classes and modules are additionally walked recursively.

	    Notes
	    -----
	    Only objects whose source file lives under the directory of
	    ``mne.__file__`` are considered. Objects that cannot be introspected
	    (``inspect.getmembers`` or ``inspect.getfile`` raising) are skipped
	    silently, since this is a best-effort scan.
	    """
	    package_dir = op.dirname(mne.__file__)

	    try:
	        members = inspect.getmembers(node, is_interesting)
	        source_file = inspect.getfile(node)
	    except Exception:
	        # Introspection failed (e.g. built-in object); skip this node, but
	        # let KeyboardInterrupt / SystemExit propagate.
	        return

	    if not source_file.startswith(package_dir):
	        return
	    visit_key = source_file + ':' + name
	    if visit_key in parsed_nodes:
	        return
	    parsed_nodes.add(visit_key)

	    # On Python 3 a function looked up on a class is a plain function, not a
	    # method, so ``inspect.ismethod`` alone does not identify methods.
	    owner_is_class = inspect.isclass(node)

	    for name_, member in members:
	        try:
	            member_file = inspect.getfile(member)
	        except Exception:
	            continue
	        if not member_file.startswith(package_dir):
	            continue

	        location = ' (' + member_file + ')'
	        is_method = inspect.ismethod(member)
	        if (inspect.isfunction(member) or
	                inspect.isgeneratorfunction(member) or is_method):
	            # Qualify members of a class (and bound methods) with the owner
	            # name so same-named methods in one file do not overwrite each
	            # other in the docstring registry.
	            if owner_is_class or is_method:
	                label = name + '.' + name_
	            else:
	                label = name_
	            add_docstring(label + location, member)
	        elif inspect.isclass(member) or inspect.ismodule(member):
	            add_docstring(name_ + location, member)
	            parse(name_, member)

	# Walk the whole mne package and fill the docstring registry
	parse('mne', mne)

	# Make a sorted list of (label, docstring) pairs
	docstrings = sorted(docstrings.items(), key=lambda item: item[0])

	# Save all the extracted docstrings, one "label docstring" entry per item
	with open('docstrings.txt', 'w') as f:
	    f.writelines(label + ' ' + text + '\n' for label, text in docstrings)

	# Remove all whitespace from the docstrings so that formatting differences
	# do not count towards the edit distance
	docstrings = [(label, ''.join(text.split())) for label, text in docstrings]

	# Keep only the docstrings longer than 100 characters
	docstrings = [d for d in docstrings if len(d[1]) > 100]

	# Construct a nice progress bar. The number of unordered pairs is computed
	# with integer arithmetic: this gives an exact int and does not rely on
	# scipy.misc.comb, which current SciPy releases no longer provide.
	n_docstrings = len(docstrings)
	pbar = pb.ProgressBar(
	    maxval=n_docstrings * (n_docstrings - 1) // 2,
	    widgets=[
	        pw.Percentage(),
	        '|',
	        pw.ETA(),
	        '|',
	        'current function',  # placeholder, replaced inside the loop below
	    ],
	)

	# Compare all long docstrings (takes time...). Only the upper triangle of
	# ``scores`` is filled; entries that were never compared stay at -1.
	scores = -1 * np.ones((n_docstrings, n_docstrings), dtype=float)
	pbar = pbar.start()
	for i, (p1, d1) in enumerate(docstrings[:-1]):
	    pbar.widgets[-1] = p1  # Note current function in the progress bar
	    for j, (_, d2) in enumerate(docstrings[i + 1:], i + 1):
	        scores[i, j] = distance.levenshtein(d1, d2, max_dist=500)
	        if scores[i, j] == -1:
	            # -1 is treated as "distance above max_dist": maximally different
	            scores[i, j] = 1.0
	        else:
	            # Normalise by the longer docstring, capped at max_dist
	            scores[i, j] /= min(max(len(d1), len(d2)), 500)
	        pbar.update(pbar.currval + 1)
	pbar.finish()

	# Find candidate duplicate docstrings: compared pairs (score >= 0) whose
	# normalised distance is below 0.25
	similar_docstrings = list(zip(*np.nonzero(np.logical_and(scores >= 0, scores < 0.25))))

	# Print them
	for i1, i2 in similar_docstrings:
	    print(scores[i1, i2], docstrings[i1][0], docstrings[i2][0])