Created
May 26, 2014 11:18
-
-
Save math314/8590ec95bfb979067570 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #coding=utf-8 | |
| import sys,os | |
| import re | |
| from collections import defaultdict | |
| from itertools import * | |
| def scc(edges,redges): | |
| def dfs(v): | |
| used.add(v) | |
| if v in edges: | |
| for t in edges[v]: | |
| if t not in used: | |
| dfs(t) | |
| d1.append(v) | |
| def rdfs(v,l): | |
| used.add(v) | |
| l.append(v) | |
| if v in redges: | |
| for t in redges[v]: | |
| if t not in used: | |
| rdfs(t,l) | |
| used = set() | |
| d1 = [] | |
| for v in edges: | |
| if v not in used: | |
| dfs(v) | |
| used = set() | |
| ret = [] | |
| for v in reversed(d1): | |
| if v not in used: | |
| l = [] | |
| rdfs(v,l) | |
| ret.append(l) | |
| return ret | |
| def lexical_order_toporogycal_sort(_nodes,edges): | |
| nodes = list(sorted(_nodes)) | |
| in_rank = {node:0 for node in nodes} | |
| for es in edges: | |
| for t in es: | |
| in_rank[t] += 1 | |
| used = set() | |
| ret = [] | |
| for _ in xrange(len(nodes)): | |
| select = None | |
| for node in nodes: | |
| if node not in used and in_rank[node] == 0: | |
| select = node | |
| break | |
| ret.apped(select) | |
| used.add(select) | |
| for t in edges[select]: | |
| in_rank[t] -= 1 | |
| return ret | |
| class Asm(object): | |
| def __init__(self,address,binary,opecode,operand): | |
| self.address = address | |
| self.binary = binary | |
| self.opecode = opecode | |
| self.operand = operand | |
| self.indent = 0 | |
| self.comment = '' | |
| self.attribute = [] | |
| def next_address(self): | |
| if self.opecode in ('ret','jmp'): return None | |
| else: return self.address + len(self.binary) | |
| def jump_to(self): | |
| if not self.opecode.startswith('j'): return None | |
| return int(self.operand.split()[0],16) | |
| def __str__(self): | |
| indent = ' ' * self.indent | |
| address = "%x" % self.address | |
| binary = ' '.join(['%02x' % i for i in self.binary]) | |
| opecode = self.opecode if self.opecode is not None else '' | |
| operand = self.operand if self.operand is not None else '' | |
| comment = self.comment if self.comment is not None else '' | |
| attribute = str(self.attribute) if len(self.attribute) != 0 else '' | |
| ret = [] | |
| # if 'if' in self.attribute: | |
| # ret.append(' ' * (self.indent - 1) + 'if(state) {') | |
| # elif 'for' in self.attribute: | |
| # ret.append(' ' * (self.indent - 1) + 'for(init; cond; loop-expression) {') | |
| # elif 'while' in self.attribute: | |
| # ret.append(' ' * (self.indent - 1) + 'while(true) {') | |
| # elif 'else' in self.attribute: | |
| # ret.append(' ' * (self.indent - 1) + '} else {') | |
| # end_blacket_indent = self.indent - 1 | |
| # for attr in self.attribute: | |
| # if attr in ('end if','end for','end while'): end_blacket_indent += 1 | |
| # for attr in self.attribute: | |
| # if attr in ('end if','end for','end while'): | |
| # ret.append(' ' * end_blacket_indent + '} //' + attr); | |
| # end_blacket_indent -= 1 | |
| ret.append("%s%s:\t%-22s\t%s\t%s; %s %s" % \ | |
| (indent,address,binary,opecode,operand,comment,attribute) \ | |
| ) | |
| # if self.opecode[0] == 'j' and len(self.attribute) == 0: | |
| # ret[-1] += 'no attr?' | |
| return '\n'.join(ret) | |
| class AsmNode(object): | |
| def __init__(self,nodes): | |
| self._nodes = nodes | |
| self._address_list = list(chain(node.address_list() for node in nodes)) | |
| self._next_to = self.init_next_to() | |
| def init_next_to(self): | |
| next_list = [] | |
| for node in self._nodes: | |
| node_next = node.next_to() | |
| for to in node_next: | |
| if not self.contain_address(to): | |
| next_list.append(to) | |
| return next_list | |
| def head_address(self): | |
| return self._nodes[0].head_address() | |
| def address_list(self): | |
| return self._address_list | |
| def contain_address(self,address): | |
| return address in self._address_list | |
| def next_to(self): | |
| return self._next_to | |
| class AsmLeaf(object): | |
| def __init__(self,asms): | |
| self._asms = asms | |
| def head_address(self): | |
| return self._asms[0].address | |
| def address_list(self): | |
| return [asm.address for asm in self._asms] | |
| def next_to(self): | |
| l = [] | |
| jump_to = self._asms[-1].jump_to() | |
| if jump_to and jump_to not in self.address_list(): l.append(jump_to) | |
| next_address = self._asms[-1].next_address() | |
| if next_address: l.append(next_address) | |
| return l | |
| def __str__(self): | |
| l = [] | |
| l.append('start address = %x' % self.head_address()) | |
| for asm in self._asms: | |
| l.append(' ' + str(asm)) | |
| return '\n'.join(l) | |
| class Function(object): | |
| def __init__(self,asms): | |
| pass | |
| def load_assembler(file_name): | |
| part = re.compile(r"\s(?P<address>\w+):\s+(?P<binary>(\w\w\s)+)\s+((?P<opecode>\w+)(\s+(?P<operand>.*))?)?$") | |
| data = [i.strip('\n') for i in open(file_name)] | |
| ret = [] | |
| i = 0 | |
| text_section = False | |
| while i < len(data): | |
| line = data[i] | |
| if line == "Disassembly of section .text:": | |
| text_section = True | |
| i += 3 | |
| continue | |
| if text_section: | |
| if line == "": | |
| break | |
| m = part.match(line) | |
| address = int(m.group("address"),16) | |
| binary = [int(j,16) for j in m.group("binary").split()] | |
| opecode = m.group("opecode") | |
| operand = m.group("operand") | |
| asm = Asm(address,binary,opecode,operand) | |
| ret.append(asm) | |
| i += 1 | |
| return ret | |
| def divide_to_function(asms): | |
| next_instruction_address = asms[0].address | |
| current_graph = [] | |
| for asm in asms: | |
| if next_instruction_address < asm.address: | |
| yield current_graph | |
| current_graph = [] | |
| next_instruction_address = asm.address | |
| current_graph.append(asm) | |
| if asm.opecode == 'ret': | |
| continue | |
| elif asm.opecode == 'jmp': | |
| jump_to = int(asm.operand.split()[0],16) | |
| next_instruction_address = max(next_instruction_address,jump_to) | |
| elif asm.opecode.startswith('j'): | |
| jump_to = int(asm.operand.split()[0],16) | |
| to = max(jump_to,asm.next_address()) | |
| next_instruction_address = max(next_instruction_address,to) | |
| else: | |
| next_instruction_address = max(next_instruction_address,asm.next_address()) | |
| if current_graph: | |
| yield current_graph | |
| def divide_to_asm_block(asms): | |
| feature_addresses = [] | |
| for asm in asms: | |
| if asm.jump_to() is not None: | |
| feature_addresses.append(asm.jump_to()) | |
| feature_addresses.append(asm.address + len(asm.binary)) | |
| feature_addresses = list(sorted(set(feature_addresses))) | |
| i = 0 | |
| l = [] | |
| for asm in asms: | |
| if i < len(feature_addresses) and feature_addresses[i] == asm.address: | |
| #print i,"%x" % l[-1].address | |
| yield AsmLeaf(l) | |
| l = [] | |
| i += 1 | |
| l.append(asm) | |
| if l: | |
| yield AsmLeaf(l) | |
| def asm_block_to_dot_format(asm_nodes, node_dict): | |
| w = [] | |
| for k,v in node_dict.items(): | |
| w.append('"%x"[label="%s"];' % \ | |
| (k,''.join(i + '\\l' for i in str(v).split('\n')[1:])) \ | |
| ) | |
| w.append('"%s" -> "%x";' % ('start',asm_nodes[0].head_address()) ) | |
| for node in asm_nodes: | |
| for to in node.next_to(): | |
| w.append('"%x" -> "%x";' % (node.head_address(),to) ) | |
| print 'digraph asm {\n node [shape = box];\n%s \n}' % '\n'.join([' ' + i for i in w]) | |
| def build_graph(asm_nodes): | |
| if len(asm_nodes) == 1: | |
| return AsmNode(asm_nodes) | |
| edges = defaultdict(list) | |
| redges = defaultdict(list) | |
| head_address = asm_nodes[0].head_address() | |
| def add_edge(v,t): | |
| if v != head_address: edges[v].append(t) | |
| if t != head_address: edges[t].append(v) | |
| for node in asm_nodes: | |
| for to in node.next_to(): | |
| add_edge(node.head_address(),to) | |
| address_to_node = {node.head_address(): node for node in asm_nodes} | |
| edges = dict(edges) | |
| redges = dict(redges) | |
| ret = [] | |
| for result_nodes_addresses in scc(edges,redges): | |
| result_nodes = map(lambda address: address_to_node[address],result_nodes_addresses) | |
| if len(result_nodes) == 1: | |
| ret.append(result_nodes[0]) | |
| else: | |
| ret.append(analyze(result_nodes)) | |
| ret.sort(key=lambda blocks: blocks.head_address()) | |
| # ret = lexical_order_toporogycal_sort(ret) | |
| return AsmNode(ret) | |
| def analyze_function(asms): | |
| asm_nodes = [asm_block for asm_block in divide_to_asm_block(asms)] | |
| node_dict = {} | |
| for node in asm_nodes: | |
| #print node | |
| node_dict[node.head_address()] = node | |
| asm_block_to_dot_format(asm_nodes, node_dict) | |
| root = build_graph(asm_nodes) | |
| return root | |
| def main(): | |
| asms = load_assembler(sys.argv[1]) | |
| for func in divide_to_function(asms): | |
| #print "-" * 30 | |
| analyzed = analyze_function(func) | |
| #print "-" * 30 | |
| if __name__ == '__main__': | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment