# Load the binetflow capture file
import pandas as pd
import numpy as np

df_raw = pd.read_csv(r'path\capture20110810-scen1.binetflow')
# Label flows originating from the known infected address
infected_addr = "147.32.84.165"
df_raw["Bot"] = np.where(df_raw['SrcAddr'] == infected_addr, 1, 0)
from sklearn import preprocessing

# Drop unnecessary columns
df_raw = df_raw.drop(columns=['SrcAddr','DstAddr','TotBytes','Sport','Dport','StartTime','sTos','dTos'])
# Fill nulls: "none" for object columns, -1 for numeric columns
for feature_name in df_raw.columns:
    if df_raw[feature_name].isnull().values.sum() > 0:
        val = "none" if df_raw[feature_name].dtypes == object else -1
        print(f"Filling nulls with value:{val} in column:{feature_name}")
        df_raw[feature_name] = df_raw[feature_name].fillna(val)
# Oversampling the minority class - this will be resource intensive
from sklearn.model_selection import train_test_split

# Divide the data into training and test sets
x_train, x_test, y_train, y_test \
    = train_test_split(df_raw.drop(['Bot'], axis=1), df_raw['Bot'], test_size=0.2)
# Oversample the minority class (see the sketch below)
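The oversampling code itself is cut off in this capture. A minimal sketch using sklearn's resample, which simply duplicates minority rows in the training split until the classes are balanced (an assumption; the original may have used SMOTE or another method):

from sklearn.utils import resample

# recombine the training split, upsample the Bot rows, then separate again
train = pd.concat([x_train, y_train], axis=1)
majority = train[train['Bot'] == 0]
minority = train[train['Bot'] == 1]
minority_upsampled = resample(minority, replace=True,
                              n_samples=len(majority), random_state=42)
train_balanced = pd.concat([majority, minority_upsampled])
x_train = train_balanced.drop(['Bot'], axis=1)
y_train = train_balanced['Bot']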
# Fit the model
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

model = XGBClassifier()
model.fit(x_train, y_train)
# Predict on the held-out test set and measure accuracy
predictions = model.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.4f}")
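Because the Bot class is rare, raw accuracy can look high even for a useless classifier; a per-class report is more informative (added here, not part of the original snippet):

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions, digits=4))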
import random

first_letters = 'ABCDEF'
second_numbers = '120'
last_letters = 'QWOPZXML'

# Returns a string of the following format: [4 letters A-F][1 digit 0-2][3 letters QWOPZXML]
def get_random_string():
    str1 = ''.join(random.choice(first_letters) for i in range(4))
    str2 = random.choice(second_numbers)
    str3 = ''.join(random.choice(last_letters) for i in range(3))
    return str1 + str2 + str3
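The later cells operate on a seqs_ds DataFrame of such strings; its construction is not shown in this capture, so here is a hypothetical version (the name seqs_ds is taken from the later snippets, the 25,000 row count is an assumption):

# Assumption: build a one-column DataFrame of generated sequences; the original
# dataset construction (and any injected anomalies) is not shown here
seqs_ds = pd.DataFrame([get_random_string() for _ in range(25000)])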
# Build the char index that we will use to encode seqs to numbers
# (this char index was written by Jason Brownlee from Machine Learning Mastery)
char_index = '0abcdefghijklmnopqrstuvwxyz'
char_index += 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
char_index += '123456789'
char_index += '().,-/+=&$?@#!*:;_[]|%⸏{}\"\'' + ' ' + '\\'
char_to_int = dict((c, i) for i, c in enumerate(char_index))
int_to_char = dict((i, c) for i, c in enumerate(char_index))
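The encode_sequence_list function used below is not included in this capture; a minimal sketch of what it plausibly does, mapping characters through char_to_int and zero-padding to a common length (the signature and padding behaviour are assumptions):

import numpy as np

def encode_sequence_list(seqs):
    # Assumed behaviour: integer-encode each sequence and zero-pad to equal length
    encoded = [[char_to_int[c] for c in str(s)] for s in seqs]
    max_len = max(len(e) for e in encoded)
    out = np.zeros((len(encoded), max_len), dtype=int)
    for i, e in enumerate(encoded):
        out[i, :len(e)] = e
    return out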
# Scale our data using a MinMaxScaler that will scale
# each number so that it will be between 0 and 1
from sklearn.preprocessing import StandardScaler, MinMaxScaler

encoded_seqs = encode_sequence_list(seqs_ds.iloc[:, 0])
scaler = MinMaxScaler()
scaled_seqs = scaler.fit_transform(encoded_seqs)

# Create train and test sets of our data
X_train = scaled_seqs[:20000]
X_test = scaled_seqs[20000:]
from keras.models import Model, load_model
from keras.layers import Input, Dense, Dropout
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers

input_dim = X_train.shape[1]        # number of features
encoding_dim = 8                    # size of the first (encoding) layer
hidden_dim = int(encoding_dim / 2)  # size of the hidden layer
nb_epoch = 30
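The model definition itself is missing from this capture; a minimal sketch of a dense autoencoder wired up from the hyperparameters above (the layer activations, regularization strength, batch size, and training settings are assumptions):

# Assumption: a simple symmetric dense autoencoder trained to reconstruct its input
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation="tanh",
                activity_regularizer=regularizers.l1(1e-5))(input_layer)
encoded = Dense(hidden_dim, activation="relu")(encoded)
decoded = Dense(hidden_dim, activation="relu")(encoded)
decoded = Dense(input_dim, activation="sigmoid")(decoded)

autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer="adam", loss="mean_squared_error")
autoencoder.fit(X_train, X_train,
                epochs=nb_epoch,
                batch_size=128,
                shuffle=True,
                validation_data=(X_test, X_test),
                verbose=1)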
# Encode all the data
encoded_seqs = encode_sequence_list(seqs_ds.iloc[:, 0])
# Scale it
scaled_data = MinMaxScaler().fit_transform(encoded_seqs)
# Predict it
predicted = autoencoder.predict(scaled_data)
# Get the error term (mean squared reconstruction error per sequence)
mse = np.mean(np.power(scaled_data - predicted, 2), axis=1)
# Now add it to our data frame
seqs_ds['MSE'] = mse
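With a reconstruction error per sequence, anomalies can be flagged by thresholding it; the cut-off below is an illustrative choice, not the original author's:

# Assumption: treat the top 0.5% of reconstruction errors as anomalies
threshold = np.quantile(mse, 0.995)
seqs_ds['Anomaly'] = (seqs_ds['MSE'] > threshold).astype(int)
print(seqs_ds['Anomaly'].value_counts())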
def fit_PU_estimator(X, y, hold_out_ratio, estimator):
    # The training set will be divided into a fitting set that will be used
    # to fit the estimator in order to estimate P(s=1|X), and a held-out set of
    # positive samples that will be used to estimate P(s=1|y=1)
    # --------
    # find the indices of the positive/labeled elements
    assert type(y) == np.ndarray, "Must pass np.ndarray rather than list as y"
    positives = np.where(y == 1.)[0]
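    # --- NOTE: the original gist is cut off at this point. The continuation below
    # --- is a sketch of the standard Elkan & Noto (2008) PU procedure, not
    # --- necessarily the author's exact code.
    # hold out a fraction of the labeled positives to estimate c = P(s=1|y=1)
    hold_out_size = int(np.ceil(len(positives) * hold_out_ratio))
    np.random.shuffle(positives)
    hold_out = positives[:hold_out_size]
    X_hold_out = X[hold_out]
    # remove the held-out positives from the fitting set
    X_fit = np.delete(X, hold_out, axis=0)
    y_fit = np.delete(y, hold_out)
    # fit the estimator to model P(s=1|x) on the remaining data
    estimator.fit(X_fit, y_fit)
    # the mean predicted probability on the held-out positives estimates P(s=1|y=1)
    hold_out_predictions = estimator.predict_proba(X_hold_out)[:, 1]
    c = np.mean(hold_out_predictions)
    return estimator, c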