Skip to content

Instantly share code, notes, and snippets.

# Label flows coming from a known infected host as bots, clean the frame,
# split it 80/20 and fit an XGBoost classifier on the result.
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

#Load tp file
df_raw = pd.read_csv(r'path\capture20110810-scen1.binetflow')

#label the infected address
infected_addr = "147.32.84.165"
df_raw["Bot"] = np.where(df_raw['SrcAddr'] == infected_addr, 1, 0)

# Drop unnecessary columns
df_raw = df_raw.drop(columns=['SrcAddr','DstAddr','TotBytes','Sport','Dport','StartTime','sTos','dTos'])

#fill nulls: "none" for object (string) columns, -1 for everything else
for feature_name in df_raw.columns:
    if df_raw[feature_name].isnull().any():
        val = "none" if df_raw[feature_name].dtype == object else -1
        print(f"Filling nulls with value:{val} in column:{feature_name}")
        # Actually apply the fill; previously the value was only reported.
        df_raw[feature_name] = df_raw[feature_name].fillna(val)

# NOTE(review): any object-dtype columns still present here will presumably
# need numeric encoding before fitting; the otherwise unused `preprocessing`
# import suggests an encoding step that is not visible in this file - confirm.

#divide the classes to training and test sets
x_train, x_test, y_train, y_test = train_test_split(
    df_raw.drop(['Bot'], axis=1), df_raw['Bot'], test_size=.2)

# NOTE(review): the original comments mention oversampling the minority
# class, but no oversampling code is present - confirm whether it is needed.

#fit the model
model = XGBClassifier()
model.fit(x_train, y_train)

# Score the held-out set; `predictions` was previously used without being defined.
predictions = model.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
@a-agmon
a-agmon / seq1.py
Last active January 15, 2020 05:05
# Alphabets for the three segments of a generated sequence string.
first_letters = 'ABCDEF'
second_numbers = '120'
last_letters = 'QWOPZXML'


def get_random_string():
    """Return a random 8-character string.

    The layout is: 4 characters drawn from ``first_letters``, 1 character
    drawn from ``second_numbers``, then 3 characters drawn from
    ``last_letters``. Every character is picked independently with
    ``random.choice``.
    """
    prefix = ''.join(random.choice(first_letters) for _ in range(4))
    digit = random.choice(second_numbers)
    suffix = ''.join(random.choice(last_letters) for _ in range(3))
    return prefix + digit + suffix
@a-agmon
a-agmon / seq2.py
Last active January 15, 2020 19:34
# Character vocabulary used to turn sequences into integer codes and back.
# (this char index was written by Jason Brownlee from Machine Learning Mastery)
# A character's position in the string is its integer code, so '0' maps to 0.
char_index = (
    '0abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '123456789'
    '().,-/+=&$?@#!*:;_[]|%⸏{}\"\'' ' ' '\\'
)
# Forward (char -> code) and reverse (code -> char) lookup tables.
char_to_int = {ch: code for code, ch in enumerate(char_index)}
int_to_char = dict(enumerate(char_index))
# Min-max scale the encoded sequences (MinMaxScaler's default output range
# is 0..1), then cut the scaled rows into train and test partitions.
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = MinMaxScaler().fit(encoded_seqs)
scaled_seqs = scaler.transform(encoded_seqs)
# The first 20000 rows are used for training; everything after is the test set.
X_train, X_test = scaled_seqs[:20000], scaled_seqs[20000:]
@a-agmon
a-agmon / seq4.py
Last active February 28, 2020 13:48
from keras.models import Model, load_model
from keras.layers import Input, Dense, Dropout
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
# Layer sizes and training length for the autoencoder.
input_dim = X_train.shape[1]  # number of features (columns of X_train)
encoding_dim = 8  # width of the first layer
hidden_dim = encoding_dim // 2  # hidden layer: half of the first layer's width
nb_epoch = 30  # number of training epochs
# Score every row by how badly the autoencoder reconstructs it.
# Encode the first column of seqs_ds.
encoded_seqs = encode_sequence_list(seqs_ds.iloc[:, 0])
# NOTE(review): a fresh MinMaxScaler is fitted on this data here - confirm
# that is intended rather than reusing a scaler fitted at training time.
scaled_data = MinMaxScaler().fit_transform(encoded_seqs)
# Reconstruct the scaled input with the autoencoder.
predicted = autoencoder.predict(scaled_data)
# Per-row mean squared difference between the input and its reconstruction.
squared_error = np.power(scaled_data - predicted, 2)
mse = squared_error.mean(axis=1)
# Store the error next to each row in the data frame.
seqs_ds['MSE'] = mse
def fit_PU_estimator(X,y, hold_out_ratio, estimator):
# The training set will be divided into a fitting-set that will be used
# to fit the estimator in order to estimate P(s=1|X) and a held-out set of positive samples
# that will be used to estimate P(s=1|y=1)
# --------
# find the indices of the positive/labeled elements
assert (type(y) == np.ndarray), "Must pass np.ndarray rather than list as y"
positives = np.where(y == 1.)[0]