Skip to content

Instantly share code, notes, and snippets.

@alonstern
alonstern / dataset.py
Last active April 14, 2020 11:39
Extract the tags from an ELF file
def _generate_tags(self, binary_elf: ELFFile):
    """Build a per-byte tag array marking function starts inside ``.text``.

    Args:
        binary_elf: Parsed ELF file; assumed to contain a ``.text`` section.

    Returns:
        A one-dimensional ``numpy`` array of ``dtype=int`` with one element
        per byte of the ``.text`` section. An element is 1 when one of the
        addresses reported by ``self._get_function_addresses`` maps to that
        byte, and 0 otherwise.
    """
    text_section = binary_elf.get_section_by_name(".text")
    # text_section["sh_addr"] is the address of the .text section.
    text_start = text_section["sh_addr"]
    text_size = text_section.data_size

    # We need the addresses of the symbols to be relative to the .text
    # section, so we subtract sh_addr from them.
    relative_offsets = (
        function_address - text_start
        for function_address in self._get_function_addresses(binary_elf)
    )
    # Keep only offsets that land inside .text. A negative offset would be
    # treated by numpy as an index counted from the end and silently tag an
    # unrelated byte; an offset past the end would raise IndexError.
    function_offsets = [
        offset for offset in relative_offsets if 0 <= offset < text_size
    ]

    tags = numpy.zeros(text_size, dtype=int)
    tags[function_offsets] = 1
    return tags
@alonstern
alonstern / dataset.py
Last active April 12, 2020 21:14
Extract the code from an ELF file
def _generate_data(self, binary_elf: ELFFile):
    """Return the contents of the ``.text`` section as an integer array.

    Args:
        binary_elf: Parsed ELF file; assumed to contain a ``.text`` section
            whose ``data()`` yields a bytes-like object.

    Returns:
        A one-dimensional ``numpy`` array of ``dtype=int`` holding one
        element per byte of the section.
    """
    text_bytes = binary_elf.get_section_by_name(".text").data()
    # Read the bytes straight from the buffer instead of first building a
    # Python list with one int object per byte. astype() then produces a
    # fresh, writable array with the same dtype as before.
    return numpy.frombuffer(text_bytes, dtype=numpy.uint8).astype(int)
@alonstern
alonstern / dataset.py
Last active April 13, 2020 08:01
Iterates over every binary in the dataset
def _preprocess_data(self, root_directory):
files_data = []
files_tags = []
# Iterates over every binary in the dataset
for binary_path in tqdm.tqdm(glob.glob(os.path.join(root_directory, "*", "binary", "*"))):
with open(binary_path, "rb") as binary_file:
binary_elf = ELFFile(binary_file)
# Extract the code from the binary.
data = self._generate_data(binary_elf)