model = CNNModel(embedding_dim=64, vocab_size=258, hidden_dim=16, tagset_size=2, kernel_size=kernel_size)
import torch
import torch.utils.data as data
import tqdm

def test_model(model, test_dataset):
    test_loader = data.DataLoader(test_dataset)
    model.eval()
    with torch.no_grad():
        all_tags = []
        all_tag_scores = []
        for sample, tags in tqdm.tqdm(test_loader):
            # The loader adds a batch dimension; unwrap the single sample.
            sample = sample[0]
            tags = tags[0]
            # The snippet is cut off here; presumably each sample's tags and
            # scores are collected for evaluation:
            all_tags.append(tags)
            all_tag_scores.append(model(sample))
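
The original snippet stops mid-loop, but once the per-byte tags and scores are collected they can be compared directly. A minimal evaluation sketch, assuming the model returns one row of log-probabilities per byte (compute_accuracy is an illustrative name, not from the original):

import torch

def compute_accuracy(all_tags, all_tag_scores):
    # Pick the highest-scoring tag for every byte and compare to ground truth.
    predicted = torch.cat([scores.argmax(dim=1) for scores in all_tag_scores])
    actual = torch.cat(all_tags)
    return (predicted == actual).float().mean().item()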
import torch.nn as nn
import torch.optim as optim

def train_model(model, train_dataset):
    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train_loader = data.DataLoader(train_dataset, shuffle=True)
    model.train()
    for sample, tags in tqdm.tqdm(train_loader):
        # The loader returns the data in the form "[our sample]" instead of
        # "our sample". This is done for batching, in case I want to train on
        # multiple samples at once. I don't use batching here, so I unwrap it.
        sample = sample[0]
        tags = tags[0]
        # The snippet is cut off here; a standard training step consistent
        # with the loss and optimizer above would be:
        model.zero_grad()
        loss = loss_function(model(sample), tags)
        loss.backward()
        optimizer.step()
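
nn.NLLLoss expects log-probabilities, which is why the model's final activation is a log-softmax (see the forward pass below). A toy shape check, with random values purely for illustration:

import torch
import torch.nn as nn
import torch.nn.functional as F

scores = F.log_softmax(torch.randn(5, 2), dim=1)  # 5 "bytes", 2 tags: (sequence_length, tagset_size)
tags = torch.tensor([1, 0, 0, 1, 0])              # one tag per byte (function start or not)
loss = nn.NLLLoss()(scores, tags)                 # scalar loss, ready for backward()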
import argparse

argument_parser = argparse.ArgumentParser()
argument_parser.add_argument("dataset_path", help="Path to the directory with the binaries for the dataset "
                                                  "(e.g. ~/security.ece.cmu.edu/byteweight/elf_32)")
args = argument_parser.parse_args()

kernel_size = 20
# We want the padding to be of size kernel_size - 1 so the CNN output will have the same size as the tags.
dataset = FunctionIdentificationDataset(args.dataset_path, block_size=1000, padding_size=kernel_size - 1)
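
With the dataset built, a plausible end-to-end driver would split it and hand the parts to the training and testing routines above; the 90/10 split here is an assumption for illustration, not taken from the original:

import torch.utils.data as data

train_size = int(len(dataset) * 0.9)  # assumed split ratio
train_dataset, test_dataset = data.random_split(dataset, [train_size, len(dataset) - train_size])

model = CNNModel(embedding_dim=64, vocab_size=258, hidden_dim=16, tagset_size=2, kernel_size=kernel_size)
train_model(model, train_dataset)
test_model(model, test_dataset)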
import torch.nn.functional as F

class CNNModel(nn.Module):
    def __init__(self, embedding_dim, kernel_size, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self._kernel_size = kernel_size
        self._hidden_dim = hidden_dim
        self._word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self._conv = nn.Conv2d(1, hidden_dim, kernel_size=(kernel_size, embedding_dim))
        self._hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sample):
        # forward is cut off in the snippet; reconstructed to be consistent
        # with the layers above and the NLLLoss used in training.
        embeds = self._word_embeddings(sample)
        conv_out = F.relu(self._conv(embeds.view(1, 1, len(sample), -1)))
        conv_out = conv_out.view(self._hidden_dim, -1).t()  # -> (out_len, hidden_dim)
        return F.log_softmax(self._hidden2tag(conv_out), dim=1)
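
A quick sanity check of the reconstructed forward pass: a block of 1000 bytes plus kernel_size - 1 = 19 bytes of padding should come out as exactly 1000 rows of tag scores (random input, for shape checking only):

import torch

model = CNNModel(embedding_dim=64, vocab_size=258, hidden_dim=16, tagset_size=2, kernel_size=20)
sample = torch.randint(0, 258, (1019,))  # 1000-byte block + 19 padding bytes
print(model(sample).shape)               # torch.Size([1000, 2]): log-probabilities per byte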
class CNNModel(nn.Module):
    def __init__(self, embedding_dim, kernel_size, hidden_dim, vocab_size):
        super().__init__()
        self._kernel_size = kernel_size
        self._hidden_dim = hidden_dim
        self._word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self._conv = nn.Conv2d(1, hidden_dim, kernel_size=(kernel_size, embedding_dim))

    def forward(self, sample):
        embeds = self._word_embeddings(sample)
        # Cut off in the original; the convolution would be applied next, after
        # reshaping the embeddings to Conv2d's (batch, channel, height, width):
        conv_out = self._conv(embeds.view(1, 1, len(sample), -1))
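
Why a 2D convolution over a 1D byte sequence? The kernel spans the full embedding dimension, so it effectively slides along the sequence axis only. A quick shape check with the sizes used elsewhere in the post:

import torch
import torch.nn as nn

conv = nn.Conv2d(1, 16, kernel_size=(20, 64))  # hidden_dim=16, kernel_size=20, embedding_dim=64
x = torch.randn(1, 1, 1019, 64)                # an embedded 1000-byte block with 19 bytes of padding
print(conv(x).shape)                           # torch.Size([1, 16, 1000, 1])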
class CNNModel(nn.Module):
    def __init__(self, embedding_dim, vocab_size):
        super().__init__()
        self._word_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, sample):
        embeds = self._word_embeddings(sample)
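
The embedding layer maps each byte value to a learned 64-dimensional vector; vocab_size is 258 rather than 256, presumably to leave room for special values such as a padding token. For example:

import torch
import torch.nn as nn

embedding = nn.Embedding(258, 64)          # vocab_size=258, embedding_dim=64
sample = torch.tensor([0x55, 0x89, 0xe5])  # three raw bytes (a classic x86 function prologue)
embeds = embedding(sample)                 # shape: (3, 64), one vector per byte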
class FunctionIdentificationDataset(torch.utils.data.Dataset):
    def __init__(self, root_directory, block_size, padding_size):
        data, tags = self._preprocess_data(root_directory)
        self._data_blocks, self._tags_blocks = self._split_to_blocks(data, tags, block_size, padding_size)

    def __len__(self):
        return len(self._data_blocks)

    def __getitem__(self, idx):
        return self._data_blocks[idx], self._tags_blocks[idx]
def _split_to_blocks(self, data, tags, block_size, padding_size):
    data_blocks = []
    tags_blocks = []
    for file_data, file_tags in zip(data, tags):
        for start_index in range(0, len(file_data), block_size):
            data_blocks.append(self._get_padded_data(file_data, start_index, block_size, padding_size))
            tags_blocks.append(file_tags[start_index: start_index + block_size])
    return data_blocks, tags_blocks
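
_get_padded_data itself is not shown in the snippets; a hypothetical implementation consistent with the call site, keeping each data block padding_size bytes longer than its tags block so the convolution output lines up, might look like this:

PAD_TOKEN = 256  # hypothetical: a value outside the 0-255 byte range

def _get_padded_data(self, file_data, start_index, block_size, padding_size):
    # Hypothetical sketch: extend the block with the bytes that follow it in
    # the file, falling back to PAD_TOKEN past the end of the file.
    block = list(file_data[start_index: start_index + block_size])
    tail = list(file_data[start_index + block_size: start_index + block_size + padding_size])
    return block + tail + [PAD_TOKEN] * (padding_size - len(tail))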
def _split_to_blocks(self, data, tags, block_size):
    data_blocks = []
    tags_blocks = []
    for file_data, file_tags in zip(data, tags):
        for start_index in range(0, len(file_data), block_size):
            data_blocks.append(file_data[start_index: start_index + block_size])
            tags_blocks.append(file_tags[start_index: start_index + block_size])
    return data_blocks, tags_blocks
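
The unpadded version is just fixed-size chunking, with a shorter final block when the file size is not a multiple of block_size. A standalone equivalent for intuition:

def split_to_blocks(file_data, block_size):
    # Chunk a single file's bytes into consecutive blocks of block_size.
    return [file_data[i: i + block_size] for i in range(0, len(file_data), block_size)]

assert split_to_blocks(list(range(8)), 3) == [[0, 1, 2], [3, 4, 5], [6, 7]]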