Skip to content

Instantly share code, notes, and snippets.

@hamelsmu
Last active May 28, 2018 16:58
Show Gist options
  • Save hamelsmu/7439b131dd9b37d246cdab71b5c898ac to your computer and use it in GitHub Desktop.
Save hamelsmu/7439b131dd9b37d246cdab71b5c898ac to your computer and use it in GitHub Desktop.
Get code and comment pairs - for tutorial
def tokenize_docstring(text):
    """Tokenize a docstring with spaCy and lower-case every token.

    The text is run through ``EN.tokenizer`` (``EN`` is a module-level
    object defined elsewhere in this file -- presumably a loaded spaCy
    English pipeline). Tokens flagged as whitespace are dropped.

    Returns:
        A list of lower-cased token strings, in document order.
    """
    lowered = []
    for tok in EN.tokenizer(text):
        # Whitespace-only tokens carry no content; leave them out.
        if tok.is_space:
            continue
        lowered.append(tok.text.lower())
    return lowered
def tokenize_code(text):
    """Split a code string into word-like tokens (very basic).

    Every maximal run of ``\\w`` characters becomes one token; anything
    else (punctuation, operators, whitespace) only acts as a separator.

    Returns:
        A list of token strings in the order they appear in ``text``.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    return word_tokenizer.tokenize(text)
def get_function_docstring_pairs(blob):
    """Extract (function/method, docstring) pairs from a given code blob.

    Only module-level functions and methods defined directly in
    module-level classes are considered; nested definitions are not
    visited.

    Args:
        blob: Python source code as a string.

    Returns:
        A list of 5-tuples, one per function found:
        ``(name, lineno, source, code_tokens, docstring_tokens)`` where

        * ``source`` is the function re-rendered by ``astor.to_source``,
        * ``code_tokens`` is the space-joined ``tokenize_code`` output for
          the source with the raw docstring text removed,
        * ``docstring_tokens`` is the space-joined ``tokenize_docstring``
          output for the first paragraph of the cleaned docstring
          (empty string when there is no docstring).

        Extraction is best-effort: if the blob cannot be parsed or
        rendered, the pairs collected so far (possibly none) are returned
        and no exception is raised.
    """
    pairs = []
    try:
        module = ast.parse(blob)
        functions = [node for node in module.body
                     if isinstance(node, ast.FunctionDef)]
        # Append methods after the top-level functions, class by class.
        for node in module.body:
            if isinstance(node, ast.ClassDef):
                functions.extend(member for member in node.body
                                 if isinstance(member, ast.FunctionDef))

        for f in functions:
            source = astor.to_source(f)
            # Evaluate the cleaned docstring once; '' means "no docstring".
            docstring = ast.get_docstring(f) or ''
            if docstring:
                # Strip the raw (un-cleaned) docstring text from the code.
                # NOTE(review): astor re-renders the source, so the raw
                # text may not appear verbatim, in which case replace() is
                # a no-op -- confirm this is acceptable.
                function = source.replace(
                    ast.get_docstring(f, clean=False), '')
            else:
                function = source
            pairs.append((
                f.name,
                f.lineno,
                source,
                ' '.join(tokenize_code(function)),
                # Keep only the first paragraph (summary) of the docstring.
                ' '.join(tokenize_docstring(docstring.split('\n\n')[0])),
            ))
    except (AssertionError, MemoryError, RecursionError, SyntaxError,
            ValueError):
        # Deliberate best-effort: unparseable or pathological blobs are
        # skipped. ValueError covers null bytes in the source on older
        # Pythons and includes UnicodeEncodeError; RecursionError covers
        # extremely deep nesting.
        pass
    return pairs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment