This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _preprocess_data(self, root_directory): | |
files_data = [] | |
files_tags = [] | |
# Iterates over every binary in the dataset | |
for binary_path in tqdm.tqdm(glob.glob(os.path.join(root_directory, "*", "binary", "*"))): | |
with open(binary_path, "rb") as binary_file: | |
binary_elf = ELFFile(binary_file) | |
# Extract the code from the binary. | |
data = self._generate_data(binary_elf) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _generate_data(self, binary_elf: "ELFFile"):
    """Return the bytes of the binary's ``.text`` section as an integer array.

    Args:
        binary_elf: Parsed ELF object. Only ``get_section_by_name(".text")``
            and the ``data()`` method of the returned section are used.

    Returns:
        A one-dimensional ``numpy`` array with ``dtype=int`` holding one
        element per byte of the section, in file order.
    """
    text_bytes = binary_elf.get_section_by_name(".text").data()
    # frombuffer reads the raw bytes directly instead of first boxing every
    # byte into a Python int; astype(int) then yields a fresh, writable array
    # with the same dtype the list-based construction produced.
    return numpy.frombuffer(text_bytes, dtype=numpy.uint8).astype(int)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _generate_tags(self, binary_elf: "ELFFile"):
    """Build a per-byte label array marking where functions start in ``.text``.

    Args:
        binary_elf: Parsed ELF object. The ``.text`` section is looked up by
            name, and the object is also passed to
            ``self._get_function_addresses`` (defined elsewhere), which
            supplies the function start addresses.

    Returns:
        A ``numpy`` array with ``dtype=int`` and one element per byte of the
        ``.text`` section: 1 at every offset where a function starts, 0
        everywhere else.
    """
    text_section = binary_elf.get_section_by_name(".text")
    # text_section["sh_addr"] is the address of the .text section.
    # Function addresses must be relative to the .text section, so sh_addr is
    # subtracted from each of them.
    text_address = text_section["sh_addr"]
    text_size = text_section.data_size
    relative_offsets = (
        function_address - text_address
        for function_address in self._get_function_addresses(binary_elf)
    )
    # Keep only offsets that fall inside .text. A negative offset would be
    # wrapped around by numpy and silently tag a byte counted from the end of
    # the section; an offset at or past the section size would raise IndexError.
    function_offsets = [offset for offset in relative_offsets if 0 <= offset < text_size]
    tags = numpy.zeros(text_size, dtype=int)
    tags[function_offsets] = 1
    return tags
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _split_to_blocks(self, data, tags, block_size):
    """Cut every file's data and tags into consecutive fixed-size blocks.

    Args:
        data: Sequence with one sliceable item (e.g. an array) per file.
        tags: Sequence parallel to ``data`` holding the labels of each file.
        block_size: Number of elements per block. The last block of a file
            is shorter when the file length is not a multiple of it.

    Returns:
        A tuple ``(data_blocks, tags_blocks)`` of two flat lists. Entry ``i``
        of both lists covers the same slice of the same file.
    """
    data_blocks = []
    tags_blocks = []
    for file_data, file_tags in zip(data, tags):
        # Block boundaries are driven by the data length; the tags are sliced
        # with the very same indices so that both lists stay aligned.
        for start_index in range(0, len(file_data), block_size):
            end_index = start_index + block_size
            data_blocks.append(file_data[start_index:end_index])
            tags_blocks.append(file_tags[start_index:end_index])
    return data_blocks, tags_blocks
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _split_to_blocks(self, data, tags, block_size, padding_size):
    """Cut every file into blocks, producing padded data and plain tag slices.

    Args:
        data: Sequence with one item per file; ``len()`` of each item sets
            the block boundaries.
        tags: Sequence parallel to ``data`` holding the labels of each file.
        block_size: Number of tag elements per block. The last block of a
            file is shorter when the file length is not a multiple of it.
        padding_size: Forwarded unchanged to ``self._get_padded_data``.

    Returns:
        A tuple ``(data_blocks, tags_blocks)`` of two flat lists. Each data
        block is whatever ``self._get_padded_data`` (defined elsewhere)
        returns for that start index; each tags block is the matching
        unpadded slice of the file's tags.
    """
    data_blocks = []
    tags_blocks = []
    for file_data, file_tags in zip(data, tags):
        for start_index in range(0, len(file_data), block_size):
            # Only the data goes through the padding helper; the tags are
            # sliced directly.
            data_blocks.append(self._get_padded_data(file_data, start_index, block_size, padding_size))
            tags_blocks.append(file_tags[start_index:start_index + block_size])
    return data_blocks, tags_blocks
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class FunctionIdentificationDataset(torch.utils.data.Dataset):
    """Dataset of aligned (data block, tags block) pairs.

    Construction preprocesses ``root_directory`` and splits the result into
    blocks once; afterwards the dataset only indexes the two stored lists.

    NOTE(review): ``_preprocess_data`` and ``_split_to_blocks`` are called on
    ``self`` but are not defined inside this class body as shown here; they
    must be available as methods of the class for construction to work.
    """

    def __init__(self, root_directory, block_size, padding_size):
        """Preprocess the dataset directory and store its blocks.

        Args:
            root_directory: Passed to ``self._preprocess_data``.
            block_size: Passed to ``self._split_to_blocks``.
            padding_size: Passed to ``self._split_to_blocks``.
        """
        data, tags = self._preprocess_data(root_directory)
        self._data_blocks, self._tags_blocks = self._split_to_blocks(data, tags, block_size, padding_size)

    def __len__(self):
        """Return the number of stored blocks."""
        return len(self._data_blocks)

    def __getitem__(self, idx):
        """Return the ``(data_block, tags_block)`` pair stored at ``idx``."""
        return self._data_blocks[idx], self._tags_blocks[idx]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class CNNModel(nn.Module): | |
def __init__(self, embedding_dim, vocab_size): | |
super().__init__() | |
self._word_embeddings = nn.Embedding(vocab_size, embedding_dim) | |
def forward(self, sample): | |
embeds = self._word_embeddings(sample) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class CNNModel(nn.Module): | |
def __init__(self, embedding_dim, kernel_size, hidden_dim, vocab_size): | |
super().__init__() | |
self._kernel_size = kernel_size | |
self._hidden_dim = hidden_dim | |
self._word_embeddings = nn.Embedding(vocab_size, embedding_dim) | |
self._conv = nn.Conv2d(1, hidden_dim, kernel_size=(kernel_size, embedding_dim)) | |
def forward(self, sample): | |
embeds = self._word_embeddings(sample) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class CNNModel(nn.Module): | |
def __init__(self, embedding_dim, kernel_size, hidden_dim, vocab_size, tagset_size): | |
super().__init__() | |
self._kernel_size = kernel_size | |
self._hidden_dim = hidden_dim | |
self._word_embeddings = nn.Embedding(vocab_size, embedding_dim) | |
self._conv = nn.Conv2d(1, hidden_dim, kernel_size=(kernel_size, embedding_dim)) | |
self._hidden2tag = nn.Linear(hidden_dim, tagset_size) | |
def forward(self, sample): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Command-line entry: the only argument is the root directory of the dataset.
argument_parser = argparse.ArgumentParser()
argument_parser.add_argument(
    "dataset_path",
    help="Path to the directory with the binaries for the dataset "
         "(e.g. ~/security.ece.cmu.edu/byteweight/elf_32)",
)
args = argument_parser.parse_args()

kernel_size = 20
# We want the padding to be of size kernel_size - 1 so the CNN output will have the same size as the tags
dataset = FunctionIdentificationDataset(args.dataset_path, block_size=1000, padding_size=kernel_size - 1)
OlderNewer