alonstern / main.py
Created April 14, 2020 06:20
Creating the model
model = CNNModel(embedding_dim=64, vocab_size=258, hidden_dim=16, tagset_size=2, kernel_size=kernel_size)
alonstern / main.py
Created April 13, 2020 12:39
test the model
def test_model(model, test_dataset):
    test_loader = data.DataLoader(test_dataset)

    model.eval()
    with torch.no_grad():
        all_tags = []
        all_tag_scores = []
        for sample, tags in tqdm.tqdm(test_loader):
            sample = sample[0]
            tags = tags[0]
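The preview stops inside the evaluation loop. A minimal sketch of how the rest of test_model could look, assuming the per-byte predictions are scored with scikit-learn's f1_score (the metric choice and the variable names below are assumptions, not necessarily the author's code):

import torch
import tqdm
import sklearn.metrics
from torch.utils import data


def test_model(model, test_dataset):
    test_loader = data.DataLoader(test_dataset)

    model.eval()
    with torch.no_grad():
        all_tags = []
        all_predicted_tags = []
        for sample, tags in tqdm.tqdm(test_loader):
            # The loader wraps each item in a batch dimension of 1; unwrap it
            sample = sample[0]
            tags = tags[0]

            tag_scores = model(sample)                 # [block_size, tagset_size] log-probabilities
            predicted_tags = tag_scores.argmax(dim=1)  # most likely tag for every byte

            all_tags.extend(tags.tolist())
            all_predicted_tags.extend(predicted_tags.tolist())

    # Function-start bytes are rare, so plain accuracy is misleading; F1 balances precision and recall
    print("F1:", sklearn.metrics.f1_score(all_tags, all_predicted_tags))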
alonstern / main.py
Last active April 13, 2020 12:35
train the model
def train_model(model, train_dataset):
    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train_loader = data.DataLoader(train_dataset, shuffle=True)

    model.train()
    for sample, tags in tqdm.tqdm(train_loader):
        # The loader returns the data in the form of "[our sample]" instead of "our sample".
        # This is done for batching, in case I want to train on multiple samples at once.
        # I don't use batching here, so I unwrap the sample.
        sample = sample[0]
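The training loop is cut off right after unwrapping the sample. A sketch of the remaining steps, assuming a standard PyTorch training step over the model's log-softmax output (the exact continuation is an assumption):

import torch.nn as nn
import torch.optim as optim
import tqdm
from torch.utils import data


def train_model(model, train_dataset):
    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train_loader = data.DataLoader(train_dataset, shuffle=True)

    model.train()
    for sample, tags in tqdm.tqdm(train_loader):
        sample = sample[0]
        tags = tags[0]

        # Standard training step: clear gradients, forward pass, loss, backward pass, update
        model.zero_grad()
        tag_scores = model(sample)
        loss = loss_function(tag_scores, tags)
        loss.backward()
        optimizer.step()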
alonstern / main.py
Last active April 14, 2020 06:21
split the data
argument_parser = argparse.ArgumentParser()
argument_parser.add_argument("dataset_path", help="Path to the directory with the binaries for the dataset "
                                                  "(e.g. ~/security.ece.cmu.edu/byteweight/elf_32)")
args = argument_parser.parse_args()

kernel_size = 20
# We want the padding to be of size kernel_size - 1 so the CNN output has the same size as the tags
dataset = FunctionIdentificationDataset(args.dataset_path, block_size=1000, padding_size=kernel_size - 1)
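With the dataset built, the remaining glue is splitting it into train and test sets and wiring in the functions above. A possible continuation (the 80/20 split and the use of random_split are assumptions):

import torch
from torch.utils import data

train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
# random_split keeps the Dataset interface, so train_model/test_model can use DataLoader directly
train_dataset, test_dataset = data.random_split(dataset, [train_size, test_size])

model = CNNModel(embedding_dim=64, vocab_size=258, hidden_dim=16, tagset_size=2, kernel_size=kernel_size)
train_model(model, train_dataset)
test_model(model, test_dataset)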
alonstern / model.py
Last active April 13, 2020 12:17
output layer
class CNNModel(nn.Module):
    def __init__(self, embedding_dim, kernel_size, hidden_dim, vocab_size, tagset_size):
        super().__init__()

        self._kernel_size = kernel_size
        self._hidden_dim = hidden_dim
        self._word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self._conv = nn.Conv2d(1, hidden_dim, kernel_size=(kernel_size, embedding_dim))
        self._hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sample):
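The gist cuts off at the forward signature. A sketch of a complete model with the output layer, assuming a 1-D byte tensor of length block_size + kernel_size - 1 as input; the exact reshaping is an assumption, and the intermediate shapes are annotated inline:

import torch.nn as nn
import torch.nn.functional as F


class CNNModel(nn.Module):
    def __init__(self, embedding_dim, kernel_size, hidden_dim, vocab_size, tagset_size):
        super().__init__()

        self._kernel_size = kernel_size
        self._hidden_dim = hidden_dim
        self._word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self._conv = nn.Conv2d(1, hidden_dim, kernel_size=(kernel_size, embedding_dim))
        self._hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sample):
        embeds = self._word_embeddings(sample)               # [padded_len, embedding_dim]
        conv_in = embeds.view(1, 1, *embeds.shape)           # [1, 1, padded_len, embedding_dim]
        conv_out = F.relu(self._conv(conv_in))               # [1, hidden_dim, block_size, 1]
        conv_out = conv_out.view(self._hidden_dim, -1).t()   # [block_size, hidden_dim]
        tag_space = self._hidden2tag(conv_out)               # [block_size, tagset_size]
        return F.log_softmax(tag_space, dim=1)               # log-probabilities, matching NLLLoss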
alonstern / model.py
Last active April 13, 2020 12:17
convolution layer
class CNNModel(nn.Module):
    def __init__(self, embedding_dim, kernel_size, hidden_dim, vocab_size):
        super().__init__()

        self._kernel_size = kernel_size
        self._hidden_dim = hidden_dim
        self._word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self._conv = nn.Conv2d(1, hidden_dim, kernel_size=(kernel_size, embedding_dim))

    def forward(self, sample):
        embeds = self._word_embeddings(sample)
alonstern / model.py
Last active April 13, 2020 12:00
embedding layer
class CNNModel(nn.Module):
    def __init__(self, embedding_dim, vocab_size):
        super().__init__()

        self._word_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, sample):
        embeds = self._word_embeddings(sample)
alonstern / dataset.py
Last active April 13, 2020 08:23
FunctionIdentificationDataset class
class FunctionIdentificationDataset(torch.utils.data.Dataset):
    def __init__(self, root_directory, block_size, padding_size):
        data, tags = self._preprocess_data(root_directory)
        self._data_blocks, self._tags_blocks = self._split_to_blocks(data, tags, block_size, padding_size)

    def __len__(self):
        return len(self._data_blocks)

    def __getitem__(self, idx):
        return self._data_blocks[idx], self._tags_blocks[idx]
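A quick hypothetical usage example of the dataset. The path and the padding arithmetic mirror main.py; whether samples come back as NumPy arrays or tensors depends on _preprocess_data, which is not shown in the gists:

dataset = FunctionIdentificationDataset("~/security.ece.cmu.edu/byteweight/elf_32",
                                        block_size=1000, padding_size=19)
print(len(dataset))            # number of blocks across all binaries in the dataset
sample, tags = dataset[0]
print(len(sample), len(tags))  # block_size + padding_size bytes vs. block_size tags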
alonstern / dataset.py
Last active April 15, 2020 09:57
split to blocks with padding
def _split_to_blocks(self, data, tags, block_size, padding_size):
    data_blocks = []
    tags_blocks = []
    for file_data, file_tags in zip(data, tags):
        for start_index in range(0, len(file_data), block_size):
            data_blocks.append(self._get_padded_data(file_data, start_index, block_size, padding_size))
            tags_blocks.append(file_tags[start_index: start_index + block_size])

    return data_blocks, tags_blocks
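_get_padded_data is referenced above but not shown in the gists. One way it could be implemented, assuming file_data is a NumPy byte array: pad each block with the bytes that follow it in the file, and with zeros once the file runs out, so every data block is exactly padding_size bytes longer than its tag block (an illustration, not the author's code):

import numpy as np


def _get_padded_data(self, file_data, start_index, block_size, padding_size):
    block = file_data[start_index: start_index + block_size]
    # Bytes that follow the block act as natural padding; zero-pad at the end of the file
    padding = file_data[start_index + block_size: start_index + block_size + padding_size]
    zeros = np.zeros(padding_size - len(padding), dtype=file_data.dtype)
    return np.concatenate([block, padding, zeros])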
alonstern / dataset.py
Created April 13, 2020 08:10
split to block without padding
def _split_to_blocks(self, data, tags, block_size):
    data_blocks = []
    tags_blocks = []
    for file_data, file_tags in zip(data, tags):
        for start_index in range(0, len(file_data), block_size):
            data_blocks.append(file_data[start_index: start_index + block_size])
            tags_blocks.append(file_tags[start_index: start_index + block_size])

    return data_blocks, tags_blocks
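For comparison, the arithmetic that makes the padded version necessary (using the sizes from main.py): a convolution with kernel size 20 over an unpadded 1000-byte block produces only 981 outputs for 1000 tags, while padding by kernel_size - 1 restores one output per byte.

block_size = 1000
kernel_size = 20
unpadded_output_len = block_size - kernel_size + 1                    # 981, shorter than the tag block
padded_output_len = (block_size + kernel_size - 1) - kernel_size + 1  # 1000, one output per tag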