This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the CNN tagger; kernel_size is taken from the surrounding script.
model = CNNModel(
    embedding_dim=64,
    vocab_size=258,
    hidden_dim=16,
    tagset_size=2,
    kernel_size=kernel_size,
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def test_model(model, test_dataset): | |
| test_loader = data.DataLoader(test_dataset) | |
| model.eval() | |
| with torch.no_grad(): | |
| all_tags = [] | |
| all_tag_scores = [] | |
| for sample, tags in tqdm.tqdm(test_loader): | |
| sample = sample[0] | |
| tags = tags[0] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def train_model(model, train_dataset): | |
| loss_function = nn.NLLLoss() | |
| optimizer = optim.Adam(model.parameters(), lr=0.001) | |
| train_loader = data.DataLoader(train_dataset, shuffle=True) | |
| model.train() | |
| for sample, tags in tqdm.tqdm(train_loader): | |
| # The loader returns the data in the form of "[our sample]" instead of "our sample". | |
| # This is done for batching, in case I want to train on multiple samples at once. | |
| # I don't use batching here, so I just take the first (and only) element. | |
| sample = sample[0] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Kernel size; it also determines how much padding every dataset block gets.
kernel_size = 20

# Command line: a single positional argument pointing at the dataset directory.
argument_parser = argparse.ArgumentParser()
argument_parser.add_argument(
    "dataset_path",
    help="Path to the directory with the binaries for the dataset "
         "(e.g ~/security.ece.cmu.edu/byteweight/elf_32",
)
args = argument_parser.parse_args()

# Pad each block by kernel_size - 1 so that the CNN output ends up with the
# same size as the tags.
dataset = FunctionIdentificationDataset(
    args.dataset_path,
    block_size=1000,
    padding_size=kernel_size - 1,
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| class CNNModel(nn.Module): | |
| def __init__(self, embedding_dim, kernel_size, hidden_dim, vocab_size, tagset_size): | |
| super().__init__() | |
| self._kernel_size = kernel_size | |
| self._hidden_dim = hidden_dim | |
| self._word_embeddings = nn.Embedding(vocab_size, embedding_dim) | |
| self._conv = nn.Conv2d(1, hidden_dim, kernel_size=(kernel_size, embedding_dim)) | |
| self._hidden2tag = nn.Linear(hidden_dim, tagset_size) | |
| def forward(self, sample): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| class CNNModel(nn.Module): | |
| def __init__(self, embedding_dim, kernel_size, hidden_dim, vocab_size): | |
| super().__init__() | |
| self._kernel_size = kernel_size | |
| self._hidden_dim = hidden_dim | |
| self._word_embeddings = nn.Embedding(vocab_size, embedding_dim) | |
| self._conv = nn.Conv2d(1, hidden_dim, kernel_size=(kernel_size, embedding_dim)) | |
| def forward(self, sample): | |
| embeds = self._word_embeddings(sample) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| class CNNModel(nn.Module): | |
| def __init__(self, embedding_dim, vocab_size): | |
| super().__init__() | |
| self._word_embeddings = nn.Embedding(vocab_size, embedding_dim) | |
| def forward(self, sample): | |
| embeds = self._word_embeddings(sample) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class FunctionIdentificationDataset(torch.utils.data.Dataset):
    """Dataset that serves (data block, tags block) pairs by index.

    On construction the raw data and tags are obtained from
    ``self._preprocess_data(root_directory)`` and then cut into blocks by
    ``self._split_to_blocks``; both helpers are defined outside this snippet.
    """

    def __init__(self, root_directory, block_size, padding_size):
        """Load the dataset under *root_directory* and split it into blocks.

        Args:
            root_directory: Passed unchanged to ``_preprocess_data``.
            block_size: Forwarded to ``_split_to_blocks``.
            padding_size: Forwarded to ``_split_to_blocks``.
        """
        raw_data, raw_tags = self._preprocess_data(root_directory)
        blocks = self._split_to_blocks(raw_data, raw_tags, block_size, padding_size)
        self._data_blocks, self._tags_blocks = blocks

    def __len__(self):
        """Return the number of data blocks held by the dataset."""
        return len(self._data_blocks)

    def __getitem__(self, idx):
        """Return the ``(data_block, tags_block)`` pair stored at *idx*."""
        data_block = self._data_blocks[idx]
        tags_block = self._tags_blocks[idx]
        return data_block, tags_block
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def _split_to_blocks(self, data, tags, block_size, padding_size): | |
| data_blocks = [] | |
| tags_blocks = [] | |
| for file_data, file_tags in zip(data, tags): | |
| for start_index in range(0, len(file_data), block_size): | |
| data_blocks.append(self._get_padded_data(file_data, start_index, block_size, padding_size)) | |
| tags_blocks.append(file_tags[start_index: start_index + block_size]) | |
| return data_blocks, tags_blocks |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def _split_to_blocks(self, data, tags, block_size): | |
| data_blocks = [] | |
| tags_blocks = [] | |
| for file_data, file_tags in zip(data, tags): | |
| for start_index in range(0, len(file_data), block_size): | |
| data_blocks.append(file_data[start_index: start_index + block_size]) | |
| tags_blocks.append(file_tags[start_index: start_index + block_size]) | |
| return data_blocks, tags_blocks |
NewerOlder