a-agmon’s gists

a-agmon / frd_1_load_and_index.py

Last active August 18, 2019 17:26


	# Load the capture file into a data frame
	df_raw = pd.read_csv(r'path\capture20110810-scen1.binetflow')

	# Flag every flow whose source address is the infected host:
	# Bot == 1 for the infected address, 0 for everything else
	infected_addr = "147.32.84.165"
	df_raw["Bot"] = (df_raw['SrcAddr'] == infected_addr).astype(int)

a-agmon / feature_engineering.py

Created August 18, 2019 14:17


	from sklearn import preprocessing
	# Drop unnecessary columns
	df_raw = df_raw.drop(columns=['SrcAddr','DstAddr','TotBytes','Sport','Dport','StartTime','sTos','dTos'])

	# Fill nulls column by column: object (string) columns get the
	# placeholder "none", every other column gets the sentinel -1.
	for feature_name in df_raw.columns:
	    if df_raw[feature_name].isnull().any():
	        val = "none" if df_raw[feature_name].dtypes == object else -1
	        print(f"Filling nulls with value:{val} in column:{feature_name}")
	        # Actually apply the replacement announced above; without this
	        # assignment the nulls would only be reported, never filled.
	        df_raw[feature_name] = df_raw[feature_name].fillna(val)

a-agmon / divide_oversample.py

Created August 18, 2019 14:23


	#oversampling the minority - this will be resource intensive

	from sklearn.model_selection import train_test_split

	# Split into training and test sets, holding out 20% of the rows;
	# 'Bot' is the target column, every remaining column is a feature
	x_train, x_test, y_train, y_test = train_test_split(
	    df_raw.drop(columns=['Bot']),
	    df_raw['Bot'],
	    test_size=0.2,
	)

	#oversample the minority

a-agmon / fitthemodel.py

Last active August 18, 2019 14:31


	#fit the model

	from xgboost import XGBClassifier
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score

	# Train an XGBoost classifier with its default hyper-parameters
	model = XGBClassifier()
	model.fit(x_train, y_train)
	# Predict labels for the held-out rows; `predictions` must exist before
	# it is scored below (it was previously referenced without being assigned)
	predictions = model.predict(x_test)
	# Fraction of held-out rows whose predicted label matches the true label
	accuracy = accuracy_score(y_test, predictions)

a-agmon / seq1.py

Last active January 15, 2020 05:05

	# Alphabets for the three segments of a generated string
	first_letters = 'ABCDEF'
	second_numbers = '120'
	last_letters = 'QWOPZXML'


	def get_random_string():
	    """Return a random 8-character string.

	    Layout: 4 letters drawn from ``first_letters``, then 1 digit drawn
	    from ``second_numbers``, then 3 letters drawn from ``last_letters``.
	    Characters are drawn independently with ``random.choice``, so
	    repeats are possible.
	    """
	    head = ''.join(random.choice(first_letters) for _ in range(4))
	    digit = random.choice(second_numbers)
	    tail = ''.join(random.choice(last_letters) for _ in range(3))
	    return head + digit + tail

a-agmon / seq2.py

Last active January 15, 2020 19:34

	#Build the char index that we will use to encode seqs to numbers
	#(this char index was written by Jason Brownlee from Machine Learning Mastery)
	char_index = '0abcdefghijklmnopqrstuvwxyz'
	char_index += 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
	char_index += '123456789'
	# Punctuation, then space, then a single backslash. The pipe is written
	# unescaped: '\|' is an invalid escape sequence and would also insert a
	# second backslash, leaving a duplicate character in the index.
	char_index += '().,-/+=&$?@#!*:;_[]|%⸏{}"\'' + ' ' + '\\'

	# Forward and reverse lookup tables; every character in char_index is
	# unique, so the two mappings are exact inverses of each other.
	char_to_int = {c: i for i, c in enumerate(char_index)}
	int_to_char = dict(enumerate(char_index))

a-agmon / seq3.py

Created January 15, 2020 05:17

	# Min-max scale the encoded sequences so that every value
	# ends up between 0 and 1
	from sklearn.preprocessing import StandardScaler, MinMaxScaler
	scaler = MinMaxScaler()
	scaled_seqs = scaler.fit_transform(encoded_seqs)
	# The first 20000 rows become the training set, the rest the test set
	X_train, X_test = scaled_seqs[:20000], scaled_seqs[20000:]

a-agmon / seq4.py

Last active February 28, 2020 13:48

	from keras.models import Model, load_model
	from keras.layers import Input, Dense, Dropout
	from keras.callbacks import ModelCheckpoint, TensorBoard
	from keras import regularizers

	# Layer sizes
	input_dim = X_train.shape[1]    # number of features (columns of X_train)
	encoding_dim = 8                # width of the first layer
	hidden_dim = encoding_dim // 2  # hidden layer: half the first layer's width

	# presumably the number of training epochs -- confirm where it is used
	nb_epoch = 30

a-agmon / seq5.py

Created January 15, 2020 16:15

	# Encode every sequence found in the first column of the data frame
	encoded_seqs = encode_sequence_list(seqs_ds.iloc[:, 0])
	# Min-max scale the encoded sequences with a freshly fitted scaler
	scaled_data = MinMaxScaler().fit_transform(encoded_seqs)
	# Run the scaled sequences through the autoencoder
	predicted = autoencoder.predict(scaled_data)
	# Error term: mean squared difference between input and prediction, per row
	mse = ((scaled_data - predicted) ** 2).mean(axis=1)
	# Attach the per-row error to the data frame as a new column
	seqs_ds['MSE'] = mse

a-agmon / pu_est1.py

Last active March 2, 2020 12:49



	def fit_PU_estimator(X, y, hold_out_ratio, estimator):
	    """Locate the positive (labeled) samples in ``y``.

	    Only the opening of this function is visible here: it validates ``y``
	    and computes the indices of its positive entries.

	    Parameters:
	        X: feature matrix (not used in the visible portion).
	        y: ``np.ndarray`` of labels; entries equal to 1 are the
	            positive/labeled samples.
	        hold_out_ratio: not used in the visible portion.
	        estimator: not used in the visible portion.

	    Raises:
	        AssertionError: if ``y`` is not an ``np.ndarray``.
	    """
	    # Intended procedure (per the original notes):
	    # The training set will be divided into a fitting-set that will be used
	    # to fit the estimator in order to estimate P(s=1|X) and a held-out set
	    # of positive samples that will be used to estimate P(s=1|y=1)
	    # --------
	    # NOTE(review): the steps described above are not present in the
	    # visible code -- confirm against the complete source.
	    assert isinstance(y, np.ndarray), "Must pass np.ndarray rather than list as y"
	    # find the indices of the positive/labeled elements
	    positives = np.where(y == 1.)[0]

Alon Agmon a-agmon