This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _generate_tags(self, binary_elf: ELFFile):
    """Build a per-byte label vector marking function starts inside ``.text``.

    Args:
        binary_elf: Parsed ELF file whose ``.text`` section is labelled.

    Returns:
        A 1-D integer ``numpy`` array with one entry per byte of ``.text``.
        An entry is 1 where ``self._get_function_addresses`` reports a
        function address that falls inside the section, and 0 elsewhere.

    Raises:
        ValueError: If the binary has no ``.text`` section.
    """
    text_section = binary_elf.get_section_by_name(".text")
    if text_section is None:
        raise ValueError("binary has no .text section")

    # text_section["sh_addr"] is the address of the .text section. Symbol
    # addresses must be relative to the start of .text, so it is subtracted.
    base_address = text_section["sh_addr"]
    section_size = text_section.data_size
    relative_offsets = (
        address - base_address
        for address in self._get_function_addresses(binary_elf)
    )

    # Keep only offsets that land inside .text. A negative offset would be
    # treated by numpy as an index from the end of the array and silently
    # tag the wrong byte; an offset past the end would raise IndexError.
    in_section = [
        offset for offset in relative_offsets if 0 <= offset < section_size
    ]

    tags = numpy.zeros(section_size, dtype=int)
    # Explicit integer dtype keeps the index valid even when the list is empty.
    tags[numpy.asarray(in_section, dtype=numpy.intp)] = 1
    return tags
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _generate_data(self, binary_elf: ELFFile):
    """Return the contents of the ``.text`` section as an integer array.

    Args:
        binary_elf: Parsed ELF file to read the ``.text`` section from.

    Returns:
        A 1-D integer ``numpy`` array holding one value (0-255) per byte of
        the section's data.

    Raises:
        ValueError: If the binary has no ``.text`` section.
    """
    text_section = binary_elf.get_section_by_name(".text")
    if text_section is None:
        raise ValueError("binary has no .text section")

    # Decode the byte buffer directly instead of going through a Python list
    # of ints; astype(int) widens to the same dtype as before and returns a
    # writable copy (frombuffer alone gives a read-only view).
    raw_bytes = text_section.data()
    return numpy.frombuffer(raw_bytes, dtype=numpy.uint8).astype(int)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def _preprocess_data(self, root_directory): | |
| files_data = [] | |
| files_tags = [] | |
| # Iterates over every binary in the dataset | |
| for binary_path in tqdm.tqdm(glob.glob(os.path.join(root_directory, "*", "binary", "*"))): | |
| with open(binary_path, "rb") as binary_file: | |
| binary_elf = ELFFile(binary_file) | |
| # Extract the code from the binary. | |
| data = self._generate_data(binary_elf) |
NewerOlder