Last active
May 31, 2021 12:22
-
-
Save salt-die/8ae39f4914ef1d603f3705a43941557c to your computer and use it in GitHub Desktop.
Measuring AST similarity for plagiarism detection.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from ast import iter_child_nodes, parse, Load, Store, Delete | |
from igraph import Graph | |
class EnumDict(dict): | |
def __init__(self): | |
self.types = [ ] | |
def __missing__(self, key): | |
self[key] = len(self.types) | |
self.types.append(key) | |
return self[key] | |
TYPE_ENUM = EnumDict() # AST node type -> int | |
CONTEXTS = Load, Store, Delete | |
def sources_to_igraph(*sources, skip_contexts=True): | |
for source in sources: | |
graph = Graph(directed=True) | |
tree = parse(source) | |
_ast_visit(graph, tree, skip_contexts) | |
yield graph | |
def _ast_visit(graph, node, skip_contexts): | |
vertex = graph.add_vertex(label=TYPE_ENUM[type(node).__name__]) | |
for child in iter_child_nodes(node): | |
if skip_contexts and isinstance(child, CONTEXTS): | |
continue | |
descendent = _ast_visit(graph, child, skip_contexts) | |
graph.add_edge(vertex, descendent) | |
return vertex | |
if __name__ == '__main__': | |
# Plotting AST | |
from igraph import plot | |
g = next(sources_to_igraph('for i in range(5): print(i)')) | |
plot(g, target='astgraph.png', vertex_label=[TYPE_ENUM.types[i] for i in g.vs['label']], vertex_label_dist=2, margin=40) | |
# Kernels | |
from graphkernels.kernels import CalculateWLKernel as WLKernel | |
entry_1 = 'for i in range(5): print(i)' | |
entry_2 = 'for k in range(20): print(k + "!")' | |
entry_3 = 'for i, k in enumerate(range(20)): print(i, k)' | |
graphs = list( sources_to_igraph(entry_1, entry_2, entry_3) ) | |
k = WLKernel(graphs) | |
print(k) | |
# [[ 88. 50. 56.] | |
# [ 50. 96. 52.] | |
# [ 56. 52. 192.]] |
Author
salt-die
commented
May 31, 2021
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment