Last active
May 28, 2018 16:58
-
-
Save hamelsmu/7439b131dd9b37d246cdab71b5c898ac to your computer and use it in GitHub Desktop.
Get code and comment pairs - for tutorial
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def tokenize_docstring(text):
    """Tokenize a docstring with the spaCy tokenizer and lowercase the result.

    Tokens whose ``is_space`` flag is set are dropped; the text of every
    remaining token is lowercased and returned in its original order.
    """
    lowered = []
    for piece in EN.tokenizer(text):
        if piece.is_space:
            continue
        lowered.append(piece.text.lower())
    return lowered
def tokenize_code(text):
    """Split a code string into tokens using a simple word-character pattern.

    This is deliberately basic: anything the pattern does not match
    (operators, punctuation, whitespace) is discarded.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    return word_tokenizer.tokenize(text)
def get_function_docstring_pairs(blob):
    """Extract (function/method, docstring) pairs from a given code blob.

    Only functions defined at the top level of the module and methods defined
    directly in the body of a top-level class are collected. Top-level
    functions come first, followed by methods in class order.

    Args:
        blob: Python source code as a string.

    Returns:
        A list of 5-tuples ``(name, lineno, source, code_tokens, doc_tokens)``:

        * ``name`` -- the function's name.
        * ``lineno`` -- the line number recorded on its AST node.
        * ``source`` -- the function's source as regenerated by ``astor``.
        * ``code_tokens`` -- space-joined output of ``tokenize_code`` on the
          source, with the raw docstring text removed when a docstring exists.
        * ``doc_tokens`` -- space-joined output of ``tokenize_docstring`` on
          the first paragraph of the cleaned docstring ('' if there is none).

        Extraction is best-effort: if the blob cannot be parsed or processed,
        the pairs gathered before the failure (usually none) are returned
        instead of raising.
    """
    pairs = []
    try:
        module = ast.parse(blob)

        # Top-level functions first, then methods of each top-level class.
        functions = [node for node in module.body
                     if isinstance(node, ast.FunctionDef)]
        for node in module.body:
            if isinstance(node, ast.ClassDef):
                functions.extend(member for member in node.body
                                 if isinstance(member, ast.FunctionDef))

        for f in functions:
            source = astor.to_source(f)
            docstring = ast.get_docstring(f) or ''
            if docstring:
                # Strip the uncleaned docstring text so the code tokens do not
                # leak the words of the comment they are paired with.
                raw_docstring = ast.get_docstring(f, clean=False)
                function = source.replace(raw_docstring, '')
            else:
                function = source
            pairs.append((f.name,
                          f.lineno,
                          source,
                          ' '.join(tokenize_code(function)),
                          # Keep only the first paragraph of the docstring.
                          ' '.join(tokenize_docstring(docstring.split('\n\n')[0])),
                          ))
    except (AssertionError, MemoryError, RecursionError, SyntaxError,
            UnicodeEncodeError, ValueError):
        # Deliberate best-effort: skip blobs that cannot be processed.
        # ValueError covers source containing null bytes on interpreters that
        # report it that way; RecursionError covers extremely nested source.
        pass
    return pairs
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment