Skip to content

Instantly share code, notes, and snippets.

@math314
Created May 26, 2014 15:15
Show Gist options
  • Save math314/1b8b01e1d90f728cc5af to your computer and use it in GitHub Desktop.
Save math314/1b8b01e1d90f728cc5af to your computer and use it in GitHub Desktop.
#coding=utf-8
import sys,os
import re
from collections import defaultdict,namedtuple
from itertools import *
def scc(edges,redges):
def dfs(v):
used.add(v)
if v in edges:
for t in edges[v]:
if t not in used:
dfs(t)
d1.append(v)
def rdfs(v,l):
used.add(v)
l.append(v)
if v in redges:
for t in redges[v]:
if t not in used:
rdfs(t,l)
used = set()
d1 = []
for v in edges:
if v not in used:
dfs(v)
used = set()
ret = []
for v in reversed(d1):
if v not in used:
l = []
rdfs(v,l)
ret.append(l)
return ret
def lexical_order_toporogycal_sort(_nodes,edges):
nodes = list(sorted(_nodes))
in_rank = {node:0 for node in nodes}
for es in edges:
for t in es:
in_rank[t] += 1
used = set()
ret = []
for _ in xrange(len(nodes)):
select = None
for node in nodes:
if node not in used and in_rank[node] == 0:
select = node
break
ret.apped(select)
used.add(select)
for t in edges[select]:
in_rank[t] -= 1
return ret
class Asm(object):
def __init__(self,address,binary,opecode,operand):
self.address = address
self.binary = binary
self.opecode = opecode
self.operand = operand
self.indent = 0
self.comment = ''
self.attribute = []
def next_address(self):
if self.opecode in ('ret','jmp'): return None
else: return self.address + len(self.binary)
def jump_to(self):
if not self.opecode.startswith('j'): return None
return int(self.operand.split()[0],16)
def __str__(self):
indent = ' ' * self.indent
address = "%x" % self.address
binary = ' '.join(['%02x' % i for i in self.binary])
opecode = self.opecode if self.opecode is not None else ''
operand = self.operand if self.operand is not None else ''
comment = self.comment if self.comment is not None else ''
attribute = str(self.attribute) if len(self.attribute) != 0 else ''
ret = []
# if 'if' in self.attribute:
# ret.append(' ' * (self.indent - 1) + 'if(state) {')
# elif 'for' in self.attribute:
# ret.append(' ' * (self.indent - 1) + 'for(init; cond; loop-expression) {')
# elif 'while' in self.attribute:
# ret.append(' ' * (self.indent - 1) + 'while(true) {')
# elif 'else' in self.attribute:
# ret.append(' ' * (self.indent - 1) + '} else {')
# end_blacket_indent = self.indent - 1
# for attr in self.attribute:
# if attr in ('end if','end for','end while'): end_blacket_indent += 1
# for attr in self.attribute:
# if attr in ('end if','end for','end while'):
# ret.append(' ' * end_blacket_indent + '} //' + attr);
# end_blacket_indent -= 1
ret.append("%s%s:\t%-22s\t%s\t%s; %s %s" % \
(indent,address,binary,opecode,operand,comment,attribute) \
)
# if self.opecode[0] == 'j' and len(self.attribute) == 0:
# ret[-1] += 'no attr?'
return '\n'.join(ret)
class BasicBlockNode(object):
def __init__(self,nodes):
self._nodes = nodes
self._address_list = list(chain(node.address_list() for node in nodes))
self._next_to = self.init_next_to()
self.depth = None
def init_next_to(self):
next_list = []
for node in self._nodes:
node_next = node.next_to()
for to in node_next:
if not self.contain_address(to):
next_list.append(to)
return next_list
def head_address(self):
return self._nodes[0].head_address()
def address_list(self):
return self._address_list
def contain_address(self,address):
return address in self._address_list
def get_nodes(self):
return self._nodes
def next_to(self):
return self._next_to
class BasicBlockLeaf(object):
def __init__(self,asms):
self._asms = asms
self.depth = None
self.attribute = []
def head_address(self):
return self._asms[0].address
def address_list(self):
return [asm.address for asm in self._asms]
def contain_address(self,address):
return address in self.address_list()
def next_to(self):
l = []
jump_to = self._asms[-1].jump_to()
if jump_to and jump_to not in self.address_list(): l.append(jump_to)
next_address = self._asms[-1].next_address()
if next_address: l.append(next_address)
return l
def jump_to(self):
return self._asms[-1].jump_to()
def __str__(self):
l = []
if self.attribute:
l.append(str(self.attribute))
l += [str(asm) for asm in self._asms]
return '\n'.join( (' ' + i for i in l ) )
class Function(object):
def __init__(self,root_node):
self._root_node = root_node
self._basic_blocks = self._expand_graph(root_node, -1)
def get_basic_blocks(self):
return self._basic_blocks
def _expand_graph(self,root,depth):
#print '\n'.join(' ' * depth + i for i in str(root).split('\n'))
root.depth = depth
if isinstance(root,BasicBlockLeaf):
return [root]
else:
return list(chain.from_iterable(self._expand_graph(node, depth + 1) for node in root.get_nodes()))
def _set_loop_attribute(self):
def dfs(node):
if isinstance(node,BasicBlockLeaf): return
children = node.get_nodes()
if node.depth >= 0:
children[0].attribute.append('loop')
children[-1].attribute.append('end loop')
for child in children:
dfs(child)
dfs(self._root_node)
def dot_format(self):
node_dict = {node.head_address(): node for node in self._basic_blocks}
w = []
for k,v in node_dict.items():
w.append('"%x"[label="%s"];' % \
(k,''.join(i + '\\l' for i in str(v).split('\n'))) \
)
w.append('"%s" -> "%x";' % ('start',self._basic_blocks[0].head_address()) )
for node in self._basic_blocks:
for to in node.next_to():
w.append('"%x" -> "%x";' % (node.head_address(),to) )
return 'digraph asm {\n node [shape = box];\n%s \n}' % '\n'.join([' ' + i for i in w])
def analyze(self):
#1. loop検出
self._set_loop_attribute()
# #2. loop continue 検出
# set_continue_statement(graph)
# #3. break検出
# set_break_statement(graph)
# #4-1. begin for,loop -> for検出
# set_for_statement(graph,loop_end_addresses)
# #4-2. loop -> while検出
# set_while_statement(graph)
# #5 if検出
# set_if_indent(graph)
# todo jump to return 検出
print str(self)
def __str__(self):
l = []
l.append('begin function...')
for leaf in self._basic_blocks:
indent = ' ' * leaf.depth
l.append('\n'.join(indent + i for i in str(leaf).split('\n')))
l.append('end function...')
return '\n\n'.join(l)
def load_assembler(file_name):
part = re.compile(r"\s(?P<address>\w+):\s+(?P<binary>(\w\w\s)+)\s+((?P<opecode>\w+)(\s+(?P<operand>.*))?)?$")
data = [i.strip('\n') for i in open(file_name)]
ret = []
i = 0
text_section = False
while i < len(data):
line = data[i]
if line == "Disassembly of section .text:":
text_section = True
i += 3
continue
if text_section:
if line == "":
break
m = part.match(line)
address = int(m.group("address"),16)
binary = [int(j,16) for j in m.group("binary").split()]
opecode = m.group("opecode")
operand = m.group("operand")
asm = Asm(address,binary,opecode,operand)
ret.append(asm)
i += 1
return ret
def divide_to_function(asms):
next_instruction_address = asms[0].address
current_graph = []
for asm in asms:
if next_instruction_address < asm.address:
yield current_graph
current_graph = []
next_instruction_address = asm.address
current_graph.append(asm)
if asm.opecode == 'ret':
continue
elif asm.opecode == 'jmp':
jump_to = int(asm.operand.split()[0],16)
next_instruction_address = max(next_instruction_address,jump_to)
elif asm.opecode.startswith('j'):
jump_to = int(asm.operand.split()[0],16)
to = max(jump_to,asm.next_address())
next_instruction_address = max(next_instruction_address,to)
else:
next_instruction_address = max(next_instruction_address,asm.next_address())
if current_graph:
yield current_graph
def divide_to_asm_block(asms):
feature_addresses = []
for asm in asms:
if asm.jump_to() is not None:
feature_addresses.append(asm.jump_to())
feature_addresses.append(asm.address + len(asm.binary))
feature_addresses = list(sorted(set(feature_addresses)))
i = 0
l = []
for asm in asms:
if i < len(feature_addresses) and feature_addresses[i] == asm.address:
#print i,"%x" % l[-1].address
yield BasicBlockLeaf(l)
l = []
i += 1
l.append(asm)
if l:
yield BasicBlockLeaf(l)
def build_graph(asm_nodes):
if len(asm_nodes) == 1:
return BasicBlockNode(asm_nodes)
asm_nodes = sorted(asm_nodes,key=lambda node: node.head_address())
edges = defaultdict(list)
redges = defaultdict(list)
head_address = asm_nodes[0].head_address()
def add_edge(v,t):
if t != head_address:
edges[v].append(t)
redges[t].append(v)
for node in asm_nodes:
for to in node.next_to():
exists = False
for _node in asm_nodes:
if to == _node.head_address():
exists = True
break
if exists:
add_edge(node.head_address(),to)
address_to_node = {node.head_address(): node for node in asm_nodes}
edges = dict(edges)
redges = dict(redges)
ret = []
for result_nodes_addresses in scc(edges,redges):
result_nodes = map(lambda address: address_to_node[address],result_nodes_addresses)
if len(result_nodes) == 1:
ret.append(result_nodes[0])
else:
ret.append(build_graph(result_nodes))
ret.sort(key=lambda blocks: blocks.head_address())
# ret = lexical_order_toporogycal_sort(ret)
return BasicBlockNode(ret)
def analyze_function(asms):
asm_nodes = [asm_block for asm_block in divide_to_asm_block(asms)]
root_node = build_graph(asm_nodes)
function = Function(root_node)
# for node in sorted_nodes:
# print '--'
# print str(node)
# print '--'
#function.dot_format()
function.analyze()
return function
def main():
asms = load_assembler(sys.argv[1])
for func in divide_to_function(asms):
#print "-" * 30
analyzed = analyze_function(func)
#print "-" * 30
if __name__ == '__main__':
main()
begin function...
8048905: 89 e5 mov ebp,esp;
8048907: 83 ec 38 sub esp,0x38;
804890a: 8b 45 10 mov eax,DWORD PTR [ebp+0x10];
804890d: 88 45 e4 mov BYTE PTR [ebp-0x1c],al;
8048910: c7 45 ec 00 00 00 00 mov DWORD PTR [ebp-0x14],0x0;
8048917: eb 43 jmp 804895c <calloc@plt+0x11c>;
['loop']
8048919: c7 44 24 08 01 00 00 00 mov DWORD PTR [esp+0x8],0x1;
8048921: 8d 45 f7 lea eax,[ebp-0x9];
8048924: 89 44 24 04 mov DWORD PTR [esp+0x4],eax;
8048928: c7 04 24 00 00 00 00 mov DWORD PTR [esp],0x0;
804892f: e8 6c fd ff ff call 80486a0 <read@plt>;
8048934: 89 45 f0 mov DWORD PTR [ebp-0x10],eax;
8048937: 0f b6 45 f7 movzx eax,BYTE PTR [ebp-0x9];
804893b: 0f be d0 movsx edx,al;
804893e: 0f b6 45 e4 movzx eax,BYTE PTR [ebp-0x1c];
8048942: 39 c2 cmp edx,eax;
8048944: 74 1e je 8048964 <calloc@plt+0x124>;
8048946: 83 7d f0 00 cmp DWORD PTR [ebp-0x10],0x0;
804894a: 74 18 je 8048964 <calloc@plt+0x124>;
804894c: 8b 45 ec mov eax,DWORD PTR [ebp-0x14];
804894f: 03 45 08 add eax,DWORD PTR [ebp+0x8];
8048952: 0f b6 55 f7 movzx edx,BYTE PTR [ebp-0x9];
8048956: 88 10 mov BYTE PTR [eax],dl;
8048958: 83 45 ec 01 add DWORD PTR [ebp-0x14],0x1;
['end loop']
804895c: 8b 45 ec mov eax,DWORD PTR [ebp-0x14];
804895f: 3b 45 0c cmp eax,DWORD PTR [ebp+0xc];
8048962: 72 b5 jb 8048919 <calloc@plt+0xd9>;
8048964: 8b 45 f0 mov eax,DWORD PTR [ebp-0x10];
8048967: c9 leave ;
8048968: c3 ret ;
end function...
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment