Created
May 26, 2014 15:15
-
-
Save math314/1b8b01e1d90f728cc5af to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf-8 | |
import sys,os | |
import re | |
from collections import defaultdict,namedtuple | |
from itertools import * | |
def scc(edges,redges): | |
def dfs(v): | |
used.add(v) | |
if v in edges: | |
for t in edges[v]: | |
if t not in used: | |
dfs(t) | |
d1.append(v) | |
def rdfs(v,l): | |
used.add(v) | |
l.append(v) | |
if v in redges: | |
for t in redges[v]: | |
if t not in used: | |
rdfs(t,l) | |
used = set() | |
d1 = [] | |
for v in edges: | |
if v not in used: | |
dfs(v) | |
used = set() | |
ret = [] | |
for v in reversed(d1): | |
if v not in used: | |
l = [] | |
rdfs(v,l) | |
ret.append(l) | |
return ret | |
def lexical_order_toporogycal_sort(_nodes,edges): | |
nodes = list(sorted(_nodes)) | |
in_rank = {node:0 for node in nodes} | |
for es in edges: | |
for t in es: | |
in_rank[t] += 1 | |
used = set() | |
ret = [] | |
for _ in xrange(len(nodes)): | |
select = None | |
for node in nodes: | |
if node not in used and in_rank[node] == 0: | |
select = node | |
break | |
ret.apped(select) | |
used.add(select) | |
for t in edges[select]: | |
in_rank[t] -= 1 | |
return ret | |
class Asm(object): | |
def __init__(self,address,binary,opecode,operand): | |
self.address = address | |
self.binary = binary | |
self.opecode = opecode | |
self.operand = operand | |
self.indent = 0 | |
self.comment = '' | |
self.attribute = [] | |
def next_address(self): | |
if self.opecode in ('ret','jmp'): return None | |
else: return self.address + len(self.binary) | |
def jump_to(self): | |
if not self.opecode.startswith('j'): return None | |
return int(self.operand.split()[0],16) | |
def __str__(self): | |
indent = ' ' * self.indent | |
address = "%x" % self.address | |
binary = ' '.join(['%02x' % i for i in self.binary]) | |
opecode = self.opecode if self.opecode is not None else '' | |
operand = self.operand if self.operand is not None else '' | |
comment = self.comment if self.comment is not None else '' | |
attribute = str(self.attribute) if len(self.attribute) != 0 else '' | |
ret = [] | |
# if 'if' in self.attribute: | |
# ret.append(' ' * (self.indent - 1) + 'if(state) {') | |
# elif 'for' in self.attribute: | |
# ret.append(' ' * (self.indent - 1) + 'for(init; cond; loop-expression) {') | |
# elif 'while' in self.attribute: | |
# ret.append(' ' * (self.indent - 1) + 'while(true) {') | |
# elif 'else' in self.attribute: | |
# ret.append(' ' * (self.indent - 1) + '} else {') | |
# end_blacket_indent = self.indent - 1 | |
# for attr in self.attribute: | |
# if attr in ('end if','end for','end while'): end_blacket_indent += 1 | |
# for attr in self.attribute: | |
# if attr in ('end if','end for','end while'): | |
# ret.append(' ' * end_blacket_indent + '} //' + attr); | |
# end_blacket_indent -= 1 | |
ret.append("%s%s:\t%-22s\t%s\t%s; %s %s" % \ | |
(indent,address,binary,opecode,operand,comment,attribute) \ | |
) | |
# if self.opecode[0] == 'j' and len(self.attribute) == 0: | |
# ret[-1] += 'no attr?' | |
return '\n'.join(ret) | |
class BasicBlockNode(object): | |
def __init__(self,nodes): | |
self._nodes = nodes | |
self._address_list = list(chain(node.address_list() for node in nodes)) | |
self._next_to = self.init_next_to() | |
self.depth = None | |
def init_next_to(self): | |
next_list = [] | |
for node in self._nodes: | |
node_next = node.next_to() | |
for to in node_next: | |
if not self.contain_address(to): | |
next_list.append(to) | |
return next_list | |
def head_address(self): | |
return self._nodes[0].head_address() | |
def address_list(self): | |
return self._address_list | |
def contain_address(self,address): | |
return address in self._address_list | |
def get_nodes(self): | |
return self._nodes | |
def next_to(self): | |
return self._next_to | |
class BasicBlockLeaf(object): | |
def __init__(self,asms): | |
self._asms = asms | |
self.depth = None | |
self.attribute = [] | |
def head_address(self): | |
return self._asms[0].address | |
def address_list(self): | |
return [asm.address for asm in self._asms] | |
def contain_address(self,address): | |
return address in self.address_list() | |
def next_to(self): | |
l = [] | |
jump_to = self._asms[-1].jump_to() | |
if jump_to and jump_to not in self.address_list(): l.append(jump_to) | |
next_address = self._asms[-1].next_address() | |
if next_address: l.append(next_address) | |
return l | |
def jump_to(self): | |
return self._asms[-1].jump_to() | |
def __str__(self): | |
l = [] | |
if self.attribute: | |
l.append(str(self.attribute)) | |
l += [str(asm) for asm in self._asms] | |
return '\n'.join( (' ' + i for i in l ) ) | |
class Function(object): | |
def __init__(self,root_node): | |
self._root_node = root_node | |
self._basic_blocks = self._expand_graph(root_node, -1) | |
def get_basic_blocks(self): | |
return self._basic_blocks | |
def _expand_graph(self,root,depth): | |
#print '\n'.join(' ' * depth + i for i in str(root).split('\n')) | |
root.depth = depth | |
if isinstance(root,BasicBlockLeaf): | |
return [root] | |
else: | |
return list(chain.from_iterable(self._expand_graph(node, depth + 1) for node in root.get_nodes())) | |
def _set_loop_attribute(self): | |
def dfs(node): | |
if isinstance(node,BasicBlockLeaf): return | |
children = node.get_nodes() | |
if node.depth >= 0: | |
children[0].attribute.append('loop') | |
children[-1].attribute.append('end loop') | |
for child in children: | |
dfs(child) | |
dfs(self._root_node) | |
def dot_format(self): | |
node_dict = {node.head_address(): node for node in self._basic_blocks} | |
w = [] | |
for k,v in node_dict.items(): | |
w.append('"%x"[label="%s"];' % \ | |
(k,''.join(i + '\\l' for i in str(v).split('\n'))) \ | |
) | |
w.append('"%s" -> "%x";' % ('start',self._basic_blocks[0].head_address()) ) | |
for node in self._basic_blocks: | |
for to in node.next_to(): | |
w.append('"%x" -> "%x";' % (node.head_address(),to) ) | |
return 'digraph asm {\n node [shape = box];\n%s \n}' % '\n'.join([' ' + i for i in w]) | |
def analyze(self): | |
#1. loop検出 | |
self._set_loop_attribute() | |
# #2. loop continue 検出 | |
# set_continue_statement(graph) | |
# #3. break検出 | |
# set_break_statement(graph) | |
# #4-1. begin for,loop -> for検出 | |
# set_for_statement(graph,loop_end_addresses) | |
# #4-2. loop -> while検出 | |
# set_while_statement(graph) | |
# #5 if検出 | |
# set_if_indent(graph) | |
# todo jump to return 検出 | |
print str(self) | |
def __str__(self): | |
l = [] | |
l.append('begin function...') | |
for leaf in self._basic_blocks: | |
indent = ' ' * leaf.depth | |
l.append('\n'.join(indent + i for i in str(leaf).split('\n'))) | |
l.append('end function...') | |
return '\n\n'.join(l) | |
def load_assembler(file_name): | |
part = re.compile(r"\s(?P<address>\w+):\s+(?P<binary>(\w\w\s)+)\s+((?P<opecode>\w+)(\s+(?P<operand>.*))?)?$") | |
data = [i.strip('\n') for i in open(file_name)] | |
ret = [] | |
i = 0 | |
text_section = False | |
while i < len(data): | |
line = data[i] | |
if line == "Disassembly of section .text:": | |
text_section = True | |
i += 3 | |
continue | |
if text_section: | |
if line == "": | |
break | |
m = part.match(line) | |
address = int(m.group("address"),16) | |
binary = [int(j,16) for j in m.group("binary").split()] | |
opecode = m.group("opecode") | |
operand = m.group("operand") | |
asm = Asm(address,binary,opecode,operand) | |
ret.append(asm) | |
i += 1 | |
return ret | |
def divide_to_function(asms): | |
next_instruction_address = asms[0].address | |
current_graph = [] | |
for asm in asms: | |
if next_instruction_address < asm.address: | |
yield current_graph | |
current_graph = [] | |
next_instruction_address = asm.address | |
current_graph.append(asm) | |
if asm.opecode == 'ret': | |
continue | |
elif asm.opecode == 'jmp': | |
jump_to = int(asm.operand.split()[0],16) | |
next_instruction_address = max(next_instruction_address,jump_to) | |
elif asm.opecode.startswith('j'): | |
jump_to = int(asm.operand.split()[0],16) | |
to = max(jump_to,asm.next_address()) | |
next_instruction_address = max(next_instruction_address,to) | |
else: | |
next_instruction_address = max(next_instruction_address,asm.next_address()) | |
if current_graph: | |
yield current_graph | |
def divide_to_asm_block(asms): | |
feature_addresses = [] | |
for asm in asms: | |
if asm.jump_to() is not None: | |
feature_addresses.append(asm.jump_to()) | |
feature_addresses.append(asm.address + len(asm.binary)) | |
feature_addresses = list(sorted(set(feature_addresses))) | |
i = 0 | |
l = [] | |
for asm in asms: | |
if i < len(feature_addresses) and feature_addresses[i] == asm.address: | |
#print i,"%x" % l[-1].address | |
yield BasicBlockLeaf(l) | |
l = [] | |
i += 1 | |
l.append(asm) | |
if l: | |
yield BasicBlockLeaf(l) | |
def build_graph(asm_nodes): | |
if len(asm_nodes) == 1: | |
return BasicBlockNode(asm_nodes) | |
asm_nodes = sorted(asm_nodes,key=lambda node: node.head_address()) | |
edges = defaultdict(list) | |
redges = defaultdict(list) | |
head_address = asm_nodes[0].head_address() | |
def add_edge(v,t): | |
if t != head_address: | |
edges[v].append(t) | |
redges[t].append(v) | |
for node in asm_nodes: | |
for to in node.next_to(): | |
exists = False | |
for _node in asm_nodes: | |
if to == _node.head_address(): | |
exists = True | |
break | |
if exists: | |
add_edge(node.head_address(),to) | |
address_to_node = {node.head_address(): node for node in asm_nodes} | |
edges = dict(edges) | |
redges = dict(redges) | |
ret = [] | |
for result_nodes_addresses in scc(edges,redges): | |
result_nodes = map(lambda address: address_to_node[address],result_nodes_addresses) | |
if len(result_nodes) == 1: | |
ret.append(result_nodes[0]) | |
else: | |
ret.append(build_graph(result_nodes)) | |
ret.sort(key=lambda blocks: blocks.head_address()) | |
# ret = lexical_order_toporogycal_sort(ret) | |
return BasicBlockNode(ret) | |
def analyze_function(asms): | |
asm_nodes = [asm_block for asm_block in divide_to_asm_block(asms)] | |
root_node = build_graph(asm_nodes) | |
function = Function(root_node) | |
# for node in sorted_nodes: | |
# print '--' | |
# print str(node) | |
# print '--' | |
#function.dot_format() | |
function.analyze() | |
return function | |
def main(): | |
asms = load_assembler(sys.argv[1]) | |
for func in divide_to_function(asms): | |
#print "-" * 30 | |
analyzed = analyze_function(func) | |
#print "-" * 30 | |
if __name__ == '__main__': | |
main() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
begin function... | |
8048905: 89 e5 mov ebp,esp; | |
8048907: 83 ec 38 sub esp,0x38; | |
804890a: 8b 45 10 mov eax,DWORD PTR [ebp+0x10]; | |
804890d: 88 45 e4 mov BYTE PTR [ebp-0x1c],al; | |
8048910: c7 45 ec 00 00 00 00 mov DWORD PTR [ebp-0x14],0x0; | |
8048917: eb 43 jmp 804895c <calloc@plt+0x11c>; | |
['loop'] | |
8048919: c7 44 24 08 01 00 00 00 mov DWORD PTR [esp+0x8],0x1; | |
8048921: 8d 45 f7 lea eax,[ebp-0x9]; | |
8048924: 89 44 24 04 mov DWORD PTR [esp+0x4],eax; | |
8048928: c7 04 24 00 00 00 00 mov DWORD PTR [esp],0x0; | |
804892f: e8 6c fd ff ff call 80486a0 <read@plt>; | |
8048934: 89 45 f0 mov DWORD PTR [ebp-0x10],eax; | |
8048937: 0f b6 45 f7 movzx eax,BYTE PTR [ebp-0x9]; | |
804893b: 0f be d0 movsx edx,al; | |
804893e: 0f b6 45 e4 movzx eax,BYTE PTR [ebp-0x1c]; | |
8048942: 39 c2 cmp edx,eax; | |
8048944: 74 1e je 8048964 <calloc@plt+0x124>; | |
8048946: 83 7d f0 00 cmp DWORD PTR [ebp-0x10],0x0; | |
804894a: 74 18 je 8048964 <calloc@plt+0x124>; | |
804894c: 8b 45 ec mov eax,DWORD PTR [ebp-0x14]; | |
804894f: 03 45 08 add eax,DWORD PTR [ebp+0x8]; | |
8048952: 0f b6 55 f7 movzx edx,BYTE PTR [ebp-0x9]; | |
8048956: 88 10 mov BYTE PTR [eax],dl; | |
8048958: 83 45 ec 01 add DWORD PTR [ebp-0x14],0x1; | |
['end loop'] | |
804895c: 8b 45 ec mov eax,DWORD PTR [ebp-0x14]; | |
804895f: 3b 45 0c cmp eax,DWORD PTR [ebp+0xc]; | |
8048962: 72 b5 jb 8048919 <calloc@plt+0xd9>; | |
8048964: 8b 45 f0 mov eax,DWORD PTR [ebp-0x10]; | |
8048967: c9 leave ; | |
8048968: c3 ret ; | |
end function... |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment