Created
February 22, 2018 10:10
-
-
Save spicyramen/f8fa050db7f24b56ff10fa7295f44be4 to your computer and use it in GitHub Desktop.
Binary classifier
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Honeypot classifier. Based on https://www.kaggle.com/mrklees/applying-keras-scikit-learn-to-titanic""" | |
import pandas as pd | |
import numpy as np | |
from keras.utils import to_categorical | |
from keras.models import Sequential | |
from keras.layers import Dense, Dropout | |
from sklearn.model_selection import train_test_split | |
pd.options.mode.chained_assignment = None | |
FILENAME = '../data/honeypot_dataset.csv' | |
ALL_FEATURES = ['ruri', | |
'ruri_user', | |
'ruri_domain', | |
'from_user', | |
'from_domain', | |
'from_tag', | |
'to_user', | |
'contact_user', | |
'callid', | |
'content_type', | |
'user_agent', | |
'source_ip', | |
'source_port', | |
'destination_port', | |
'contact_ip', | |
'contact_port'] | |
CATEGORICAL = ['ruri', | |
'ruri_user', | |
'ruri_domain', | |
'from_user', | |
'from_domain', | |
'from_tag', | |
'to_user', | |
'contact_user', | |
'callid', | |
'content_type', | |
'user_agent', | |
'source_ip', | |
'contact_ip'] | |
LABEL_ENCODED_FEATURES = ['ruri_user', 'from_user', 'to_user', 'contact_user', 'user_agent', 'source_ip', 'contact_ip'] | |
CONTINUOUS = ['source_port', 'destination_port', 'contact_port'] | |
DROPPED_FEATURES = ['destination_port', 'ruri', 'ruri_domain', 'from_domain', 'callid', 'from_tag', 'content_type'] | |
FEATURES = list(set(ALL_FEATURES) - set(DROPPED_FEATURES)) | |
SIP_SCANNERS = ('sipcli/v1.8', 'pplsip') | |
_USER_AGENT = 'user_agent' | |
LABEL = 'toll_fraud' | |
MODEL_NAME = 'honeypot.json' | |
MODEL_WEIGHTS = 'honeypot.h5' | |
def save_model(model): | |
""" | |
:param model: A Keras model. | |
:return: | |
""" | |
model_json = model.to_json() | |
with open(MODEL_NAME, "w") as json_file: | |
json_file.write(model_json) | |
# serialize weights to HDF5 | |
model.save_weights(MODEL_WEIGHTS) | |
print("Saved model to disk") | |
def _to_string(value): | |
"""Returns a string type based on value variable type. | |
Since we handle multiple languages we need to return a string type to write | |
in file human readable character. | |
Args: | |
value: (None, str or unicode) | |
Returns: | |
A str or None if no input. | |
""" | |
if not value: | |
print('Empty value') | |
return None | |
if isinstance(value, unicode): | |
return value.encode('utf-8') | |
else: | |
return str(value) | |
class HoneypotData(object): | |
"""Honeypot Data | |
This class will contain the entire data pipeline from raw data to prepared | |
numpy arrays. It's eventually inherited by the model class, but is left | |
distinct for readbility and logical organization. | |
""" | |
filepath = '../data/' | |
train_fn = 'honeypot_dataset.csv' | |
test_fn = 'honeypot_test.csv' | |
def __init__(self): | |
""" Initializes and process all pipeline.""" | |
self.X_train, self.y_train, self.X_valid, self.y_valid = self.preproc() | |
def preproc(self): | |
"""Process data pipeline.""" | |
# Import Data & Drop irrelevant features. | |
dataset = self.import_data(self.train_fn, drop=True) | |
# Fix NA values. | |
dataset = self.fix_na(dataset) | |
# Feature Engineering. | |
dataset = self.engineer_features(dataset) | |
# Process Categorical values processing. | |
dataset = self.process_categorical(dataset) | |
# Select all columns except target. | |
X = dataset[dataset.columns.difference([LABEL])] | |
y = dataset[LABEL] | |
# Split training and test datasets. | |
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=606, stratify=y) | |
return X_train.astype('float32'), y_train.values, X_valid.astype('float32'), y_valid.values | |
def import_data(self, filename, drop=True): | |
"""Import that data and then split it into train/test sets. Make sure to stratify. | |
This stratify parameter makes a split so that the proportion of values in the sample produced will be the same | |
as the proportion of values provided to parameter stratify. | |
For example, if variable y is a binary categorical variable with values 0 and 1 and there are 25% of zeros | |
and 75% of ones, stratify=y will make sure that your random split has 25% of 0's and 75% of 1's. | |
Args: | |
filename: (str) filename to read data from as Pandas.dataframe. | |
drop: (bool) Drop features. | |
Returns: | |
A Pandas.dataframe. | |
""" | |
dataset = pd.read_csv('%s%s' % (self.filepath, filename)) | |
if drop: | |
# Drop irrelevant features. | |
return dataset.drop(DROPPED_FEATURES, axis=1) | |
else: | |
return dataset | |
def fix_na(self, dataset): | |
"""Fill na's with test (in the case of contact_user), and with application/sdp in the case of content_type.""" | |
na_vars = {"contact_user": "test", "content_type": "application/sdp"} | |
return dataset.fillna(na_vars) | |
def engineer_features(self, dataset): | |
"""Modify some features.""" | |
dataset['is_scanner'] = 0 # Initialize to no/0 is scanner. | |
dataset['is_scanner'].loc[dataset[_USER_AGENT].isin(SIP_SCANNERS)] = 1 # The rest are 0. | |
return dataset | |
def process_categorical(self, dataset): | |
""" | |
:param dataset: | |
:return: | |
""" | |
# Label Encoding. | |
for categorical_feature in LABEL_ENCODED_FEATURES: | |
categorical_feature = _to_string(categorical_feature) | |
dataset[categorical_feature] = dataset[categorical_feature].astype('category') | |
dataset[categorical_feature + '_cat'] = dataset[categorical_feature].cat.codes | |
# Drop previous values. | |
dataset = dataset.drop(LABEL_ENCODED_FEATURES, axis=1) | |
# Rename back categories. {'to_user_cat': 'to_user' ... } | |
dataset.rename(columns={c + '_cat': c for c in LABEL_ENCODED_FEATURES}, inplace=True) | |
return dataset | |
def preproc_test(self): | |
"""Pre-process testing data.""" | |
# Import data | |
test = self.import_data(self.test_fn, drop=True) | |
# Extract labels. | |
# Drop previous values. | |
test[_USER_AGENT] = test[_USER_AGENT].astype('category') | |
test['user_agent_cat'] = test[_USER_AGENT].cat.codes | |
print test[['user_agent_cat', _USER_AGENT]].head() | |
test = test.drop([_USER_AGENT], axis=1) | |
test.rename(columns={'user_agent_cat': _USER_AGENT}, inplace=True) | |
labels = test.user_agent.values | |
# Fix NA values. | |
test = self.fix_na(test) | |
# Feature Engineering | |
test = self.engineer_features(test) | |
# Process Categorical values processing. | |
test = self.process_categorical(test) | |
print test.columns | |
return labels, test | |
class HoneypotKeras(HoneypotData): | |
"""Main classifier model based in Keras.""" | |
def __init__(self): | |
self.X_train, self.y_train, self.X_valid, self.y_valid = self.preproc() | |
self.y_train, self.y_valid = to_categorical(self.y_train), to_categorical(self.y_valid) | |
self.feature_count = self.X_train.shape[1] | |
self.history = [] | |
def build_model(self): | |
print 'Feature count: %d' % self.feature_count | |
model = Sequential() | |
model.add(Dense(2056, input_shape=(self.feature_count,), activation='relu')) | |
model.add(Dropout(0.1)) | |
model.add(Dense(1028, activation='relu')) | |
model.add(Dropout(0.2)) | |
model.add(Dense(1028, activation='relu')) | |
model.add(Dropout(0.3)) | |
model.add(Dense(512, activation='relu')) | |
model.add(Dropout(0.4)) | |
model.add(Dense(2, activation='sigmoid')) | |
model.compile(optimizer='adam', | |
loss='binary_crossentropy', | |
metrics=['accuracy']) | |
self.model = model | |
def fit(self, lr=0.001, epochs=1): | |
self.model.optimizer.lr = lr | |
hist = self.model.fit(self.X_train, self.y_train, | |
batch_size=32, epochs=epochs, | |
verbose=1, validation_data=(self.X_valid, self.y_valid), | |
) | |
self.history.append(hist) | |
def prepare_submission(self, name): | |
labels, test_data = self.preproc_test() | |
predictions = self.model.predict(test_data) | |
subm = pd.DataFrame(np.column_stack([labels, np.around(predictions[:, 1])]).astype('int32'), | |
columns=[_USER_AGENT, LABEL]) | |
subm.to_csv('%s.csv' % name, index=False) | |
return subm | |
model = HoneypotKeras() | |
model.build_model() | |
model.fit(lr=0.01, epochs=1) | |
#model.fit(lr=0.001, epochs=10) | |
model.prepare_submission('keras') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment