dataset.py - Iterates over every binary in the dataset
def _preprocess_data(self, root_directory):
    files_data = []
    files_tags = []
    # Iterate over every binary in the dataset: every path of the form
    # <root_directory>/*/binary/* is treated as a binary
    for binary_path in tqdm.tqdm(glob.glob(os.path.join(root_directory, "*", "binary", "*"))):
        with open(binary_path, "rb") as binary_file:
            binary_elf = ELFFile(binary_file)
            # Extract the code and the function-start tags from the binary
            data = self._generate_data(binary_elf)
            tags = self._generate_tags(binary_elf)
            files_data.append(data)
            files_tags.append(tags)
    return files_data, files_tags
dataset.py - Extract the code from an ELF file
def _generate_data(self, binary_elf: ELFFile):
    return numpy.array(list(binary_elf.get_section_by_name(".text").data()), dtype=int)
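For intuition, the same pyelftools calls can be run standalone (the input binary here is hypothetical; any ELF with a .text section works):

import numpy
from elftools.elf.elffile import ELFFile

with open("/bin/ls", "rb") as binary_file:
    code = numpy.array(list(ELFFile(binary_file).get_section_by_name(".text").data()), dtype=int)
print(code.shape)  # one integer in 0..255 per byte of the .text section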
dataset.py - Extract the tags from an ELF file
def _generate_tags(self, binary_elf: ELFFile):
    text_section = binary_elf.get_section_by_name(".text")
    # text_section["sh_addr"] is the address of the .text section.
    # We need the symbol addresses to be relative to the .text section, so we subtract sh_addr from them.
    function_addresses = [function_address - text_section["sh_addr"]
                          for function_address in self._get_function_addresses(binary_elf)]
    tags = numpy.zeros(text_section.data_size, dtype=int)
    # Every byte that starts a function is tagged 1; all other bytes stay 0
    tags[function_addresses] = 1
    return tags
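_get_function_addresses is not shown in these gists. A minimal sketch, assuming the function starts are taken from the STT_FUNC symbols in the symbol table:

def _get_function_addresses(self, binary_elf: ELFFile):
    # Assumption: function starts are the addresses of STT_FUNC symbols in .symtab
    symbol_table = binary_elf.get_section_by_name(".symtab")
    return [symbol["st_value"] for symbol in symbol_table.iter_symbols()
            if symbol["st_info"]["type"] == "STT_FUNC"]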
dataset.py - Split to blocks without padding
def _split_to_blocks(self, data, tags, block_size):
    data_blocks = []
    tags_blocks = []
    for file_data, file_tags in zip(data, tags):
        for start_index in range(0, len(file_data), block_size):
            data_blocks.append(file_data[start_index: start_index + block_size])
            tags_blocks.append(file_tags[start_index: start_index + block_size])
    return data_blocks, tags_blocks
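The range/slice combination handles file ends gracefully: the last block of a file just comes out shorter. A standalone check with made-up sizes:

import numpy

file_data = numpy.arange(2500)
blocks = [file_data[start: start + 1000] for start in range(0, len(file_data), 1000)]
print([len(block) for block in blocks])  # [1000, 1000, 500]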
dataset.py - Split to blocks with padding
def _split_to_blocks(self, data, tags, block_size, padding_size):
    data_blocks = []
    tags_blocks = []
    for file_data, file_tags in zip(data, tags):
        for start_index in range(0, len(file_data), block_size):
            data_blocks.append(self._get_padded_data(file_data, start_index, block_size, padding_size))
            tags_blocks.append(file_tags[start_index: start_index + block_size])
    return data_blocks, tags_blocks
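_get_padded_data is also not shown. A minimal sketch, assuming the padding is split roughly evenly around the block and zero-filled wherever it runs past the ends of the file:

def _get_padded_data(self, file_data, start_index, block_size, padding_size):
    # Assumption: surround the block with padding_size extra bytes, half before
    # and half after, zero-filling past the file boundaries
    block_end = min(start_index + block_size, len(file_data))
    padded_start = start_index - padding_size // 2
    padded_end = block_end + (padding_size - padding_size // 2)
    block = file_data[max(0, padded_start): min(len(file_data), padded_end)]
    left_pad = max(0, -padded_start)
    right_pad = max(0, padded_end - len(file_data))
    return numpy.pad(block, (left_pad, right_pad), mode="constant")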
dataset.py - The FunctionIdentificationDataset class
class FunctionIdentificationDataset(torch.utils.data.Dataset):
    def __init__(self, root_directory, block_size, padding_size):
        data, tags = self._preprocess_data(root_directory)
        self._data_blocks, self._tags_blocks = self._split_to_blocks(data, tags, block_size, padding_size)

    def __len__(self):
        return len(self._data_blocks)

    def __getitem__(self, idx):
        return self._data_blocks[idx], self._tags_blocks[idx]
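Once defined, the dataset can be indexed directly or wrapped in a standard DataLoader. The path and sizes below are made-up example values; batch_size=1 sidesteps collating blocks of unequal length, since the last block of each file can be shorter:

dataset = FunctionIdentificationDataset("elf_32", block_size=1000, padding_size=19)
data_block, tags_block = dataset[0]  # one padded code block and its per-byte tags
loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)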
model.py - Embedding layer
class CNNModel(nn.Module):
    def __init__(self, embedding_dim, vocab_size):
        super().__init__()
        # Map each byte value (0 to vocab_size - 1) to a learned embedding_dim-sized vector
        self._word_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, sample):
        embeds = self._word_embeddings(sample)
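For intuition (the dimensions here are arbitrary): with vocab_size=256, every possible byte value gets its own learned vector:

import torch
from torch import nn

embedding = nn.Embedding(256, 16)  # 256 byte values, 16-dimensional vectors
vectors = embedding(torch.tensor([0x55, 0x89, 0xe5]))
print(vectors.shape)  # torch.Size([3, 16])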
model.py - Convolution layer
class CNNModel(nn.Module):
    def __init__(self, embedding_dim, kernel_size, hidden_dim, vocab_size):
        super().__init__()
        self._kernel_size = kernel_size
        self._hidden_dim = hidden_dim
        self._word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Each of the hidden_dim filters spans kernel_size consecutive bytes across
        # the full embedding width, so it slides along the sequence dimension only
        self._conv = nn.Conv2d(1, hidden_dim, kernel_size=(kernel_size, embedding_dim))

    def forward(self, sample):
        embeds = self._word_embeddings(sample)
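A quick shape check shows why this convolution collapses the embedding dimension (the numbers are arbitrary):

import torch
from torch import nn

conv = nn.Conv2d(1, 16, kernel_size=(20, 16))  # hidden_dim=16, kernel_size=20, embedding_dim=16
out = conv(torch.randn(1, 1, 1019, 16))        # a block of 1019 embedded bytes
print(out.shape)                               # torch.Size([1, 16, 1000, 1])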
model.py - Output layer
class CNNModel(nn.Module):
    def __init__(self, embedding_dim, kernel_size, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self._kernel_size = kernel_size
        self._hidden_dim = hidden_dim
        self._word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self._conv = nn.Conv2d(1, hidden_dim, kernel_size=(kernel_size, embedding_dim))
        self._hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sample):
        embeds = self._word_embeddings(sample)
        # Treat the embedded block as a (1, 1, len(sample), embedding_dim) "image"; assumes one unbatched sample
        conv_out = torch.relu(self._conv(embeds.view(1, 1, len(sample), -1)))
        # conv_out is (1, hidden_dim, len(sample) - kernel_size + 1, 1); one feature vector per position
        conv_features = conv_out.view(self._hidden_dim, -1).transpose(0, 1)
        tag_space = self._hidden2tag(conv_features)
        return torch.log_softmax(tag_space, dim=1)
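Putting it together (the hyperparameters are example values): a block padded with kernel_size - 1 extra bytes comes out with exactly one score pair per original byte:

import torch

model = CNNModel(embedding_dim=16, kernel_size=20, hidden_dim=16, vocab_size=256, tagset_size=2)
sample = torch.randint(0, 256, (1000 + 19,))  # one block of 1000 bytes plus kernel_size - 1 padding
tag_scores = model(sample)
print(tag_scores.shape)  # torch.Size([1000, 2])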
main.py - Split the data
argument_parser = argparse.ArgumentParser()
argument_parser.add_argument("dataset_path", help="Path to the directory with the binaries for the dataset "
                                                  "(e.g. ~/security.ece.cmu.edu/byteweight/elf_32)")
args = argument_parser.parse_args()

kernel_size = 20
# We want the padding to be of size kernel_size - 1 so the CNN output has the same size as the tags
dataset = FunctionIdentificationDataset(args.dataset_path, block_size=1000, padding_size=kernel_size - 1)
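The split itself is cut off here; a minimal sketch, assuming a simple random 90/10 train/test split via torch.utils.data.random_split:

train_size = int(len(dataset) * 0.9)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, len(dataset) - train_size])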