Last active
August 7, 2024 06:36
-
-
Save ezzeldinadel/dfd0b1aba2736f9193a206c0445bc332 to your computer and use it in GitHub Desktop.
Insider Threat.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/ezzeldinadel/dfd0b1aba2736f9193a206c0445bc332/insider-threat.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# cyberdatascience.org \n", | |
"\n", | |
"---\n" | |
], | |
"metadata": { | |
"id": "f8nP-yljtijQ" | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"\n", | |
"Data acquisition " | |
], | |
"metadata": { | |
"id": "nszA9FQ_tci8" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "sgmPgtQ5IkoI" | |
}, | |
"outputs": [], | |
"source": [ | |
"import shutil\n", | |
"import urllib.request as request\n", | |
"from contextlib import closing\n", | |
"\n", | |
"with closing(request.urlopen('ftp://ftp.sei.cmu.edu/pub/cert-data/r4.2.tar.bz2')) as r:\n", | |
" with open('r4.2.tar.bz2', 'wb') as f:\n", | |
" shutil.copyfileobj(r, f)\n", | |
"!bzip2 -d r4.2.tar.bz2\n", | |
"!!tar xvf r4.2.tar" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "pn2Gss3rIkoJ" | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"path_to_dataset = \"./r4.2/\"\n", | |
"log_types = [\"device\", \"email\", \"file\", \"logon\", \"http\"]\n", | |
"log_fields_list = [\n", | |
" [\"date\", \"user\", \"activity\"],\n", | |
" [\"date\", \"user\", \"to\", \"cc\", \"bcc\"],\n", | |
" [\"date\", \"user\", \"filename\"],\n", | |
" [\"date\", \"user\", \"activity\"],\n", | |
" [\"date\", \"user\", \"url\"],\n", | |
"]\n", | |
"features = 0\n", | |
"feature_map = {}\n", | |
"\n", | |
"\n", | |
"def add_feature(name):\n", | |
" \"\"\"Add a feature to a dictionary to be encoded.\"\"\"\n", | |
" if name not in feature_map:\n", | |
" global features\n", | |
" feature_map[name] = features\n", | |
" features += 1" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Data Selection and Feature Engineering" | |
], | |
"metadata": { | |
"id": "VauicJjMtsuB" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "r43uBcFTIkoL" | |
}, | |
"outputs": [], | |
"source": [ | |
"add_feature(\"Weekday_Logon_Normal\")\n", | |
"add_feature(\"Weekday_Logon_After\")\n", | |
"add_feature(\"Weekend_Logon\")\n", | |
"add_feature(\"Logoff\")\n", | |
"\n", | |
"add_feature(\"Connect_Normal\")\n", | |
"add_feature(\"Connect_After\")\n", | |
"add_feature(\"Connect_Weekend\")\n", | |
"add_feature(\"Disconnect\")\n", | |
"\n", | |
"add_feature(\"Email_In\")\n", | |
"add_feature(\"Email_Out\")\n", | |
"\n", | |
"add_feature(\"File_exe\")\n", | |
"add_feature(\"File_jpg\")\n", | |
"add_feature(\"File_zip\")\n", | |
"add_feature(\"File_txt\")\n", | |
"add_feature(\"File_doc\")\n", | |
"add_feature(\"File_pdf\")\n", | |
"add_feature(\"File_other\")\n", | |
"\n", | |
"add_feature(\"url\")\n", | |
"\n", | |
"def file_features(row):\n", | |
" \"\"\"Creates a feature recording the file extension of the file used.\"\"\"\n", | |
" if row[\"filename\"].endswith(\".exe\"):\n", | |
" return feature_map[\"File_exe\"]\n", | |
" if row[\"filename\"].endswith(\".jpg\"):\n", | |
" return feature_map[\"File_jpg\"]\n", | |
" if row[\"filename\"].endswith(\".zip\"):\n", | |
" return feature_map[\"File_zip\"]\n", | |
" if row[\"filename\"].endswith(\".txt\"):\n", | |
" return feature_map[\"File_txt\"]\n", | |
" if row[\"filename\"].endswith(\".doc\"):\n", | |
" return feature_map[\"File_doc\"]\n", | |
" if row[\"filename\"].endswith(\".pdf\"):\n", | |
" return feature_map[\"File_pdf\"]\n", | |
" else:\n", | |
" return feature_map[\"File_other\"]\n", | |
"\n", | |
"\n", | |
"def email_features(row):\n", | |
" \"\"\"Creates a feature recording whether an email has been sent externally.\"\"\"\n", | |
" outsider = False\n", | |
" if not pd.isnull(row[\"to\"]):\n", | |
" for address in row[\"to\"].split(\";\"):\n", | |
" if not address.endswith(\"dtaa.com\"):\n", | |
" outsider = True\n", | |
"\n", | |
" if not pd.isnull(row[\"cc\"]):\n", | |
" for address in row[\"cc\"].split(\";\"):\n", | |
" if not address.endswith(\"dtaa.com\"):\n", | |
" outsider = True\n", | |
"\n", | |
" if not pd.isnull(row[\"bcc\"]):\n", | |
" for address in row[\"bcc\"].split(\";\"):\n", | |
" if not address.endswith(\"dtaa.com\"):\n", | |
" outsider = True\n", | |
" if outsider:\n", | |
" return feature_map[\"Email_Out\"]\n", | |
" else:\n", | |
" return feature_map[\"Email_In\"]\n", | |
"\n", | |
"\n", | |
"def device_features(row):\n", | |
" \"\"\"Creates a feature for whether the user has connected during normal hours or otherwise.\"\"\"\n", | |
" if row[\"activity\"] == \"Connect\":\n", | |
" if row[\"date\"].weekday() < 5:\n", | |
" if row[\"date\"].hour >= 8 and row[\"date\"].hour < 17:\n", | |
" return feature_map[\"Connect_Normal\"]\n", | |
" else:\n", | |
" return feature_map[\"Connect_After\"]\n", | |
" else:\n", | |
" return feature_map[\"Connect_Weekend\"]\n", | |
" else:\n", | |
" return feature_map[\"Disconnect\"]\n", | |
"\n", | |
"\n", | |
"def logon_features(row):\n", | |
" \"\"\"Creates a feature for whether the user logged in during normal hours or otherwise.\"\"\"\n", | |
" if row[\"activity\"] == \"Logon\":\n", | |
" if row[\"date\"].weekday() < 5:\n", | |
" if row[\"date\"].hour >= 8 and row[\"date\"].hour < 17:\n", | |
" return feature_map[\"Weekday_Logon_Normal\"]\n", | |
" else:\n", | |
" return feature_map[\"Weekday_Logon_After\"]\n", | |
" else:\n", | |
" return feature_map[\"Weekend_Logon\"]\n", | |
" else:\n", | |
" return feature_map[\"Logoff\"]\n", | |
"\n", | |
"\n", | |
"def http_features(row):\n", | |
" \"\"\"Encodes the URL visited.\"\"\"\n", | |
" return feature_map[\"url\"]\n", | |
"\n", | |
"\n", | |
"def date_to_day(row):\n", | |
" \"\"\"Converts a full datetime to date only.\"\"\"\n", | |
" day_only = row[\"date\"].date()\n", | |
" return day_only\n", | |
"\n", | |
"log_feature_functions = [\n", | |
" device_features,\n", | |
" email_features,\n", | |
" file_features,\n", | |
" logon_features,\n", | |
" http_features,\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "Mudsnx9DIkoM" | |
}, | |
"outputs": [], | |
"source": [ | |
"dfs = []\n", | |
"for i in range(len(log_types)):\n", | |
" log_type = log_types[i]\n", | |
" log_fields = log_fields_list[i]\n", | |
" log_feature_function = log_feature_functions[i]\n", | |
" df = pd.read_csv(\n", | |
" path_to_dataset + log_type + \".csv\", usecols=log_fields, index_col=None\n", | |
" )\n", | |
" date_format = \"%m/%d/%Y %H:%M:%S\"\n", | |
" df[\"date\"] = pd.to_datetime(df[\"date\"], format=date_format)\n", | |
"\n", | |
" new_feature = df.apply(log_feature_function, axis=1)\n", | |
" df[\"feature\"] = new_feature\n", | |
"\n", | |
" cols_to_keep = [\"date\", \"user\", \"feature\"]\n", | |
" df = df[cols_to_keep]\n", | |
"\n", | |
" df[\"date\"] = df.apply(date_to_day, axis=1)\n", | |
"\n", | |
" dfs.append(df)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Data processing" | |
], | |
"metadata": { | |
"id": "l_Q12WDitRSd" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "029-qTzDIkoO" | |
}, | |
"outputs": [], | |
"source": [ | |
"joint=pd.concat(dfs)\n", | |
"joint=joint.sort_values(by=\"date\")\n", | |
"threat_actors=[\"AAM0658\",\"AJR0932\",\"BDV0168\",\"BIH0745\",\"BLS0678\",\"BTL0226\",\"CAH0936\",\"DCH0843\",\"EHB0824\",\"EHD0584\",\"FMG0527\",\"FTM0406\",\"GHL0460\",\"HJB0742\",\"JMB0308\",\"JRG0207\",\"KLH0596\",\"KPC0073\",\"LJR0523\",\"LQC0479\",\"MAR0955\",\"MAS0025\",\"MCF0600\",\"MYD0978\",\"PPF0435\",\"RAB0589\",\"RGG0064\",\"RKD0604\",\"TAP0551\",\"WDD0366\",\"AAF0535\",\"ABC0174\",\"AKR0057\",\"CCL0068\",\"CEJ0109\",\"CQW0652\",\"DIB0285\",\"DRR0162\",\"EDB0714\",\"EGD0132\",\"FSC0601\",\"HBO0413\",\"HXL0968\",\"IJM0776\",\"IKR0401\",\"IUB0565\",\"JJM0203\",\"KRL0501\",\"LCC0819\",\"MDH0580\",\"MOS0047\",\"NWT0098\",\"PNL0301\",\"PSF0133\",\"RAR0725\",\"RHL0992\",\"RMW0542\",\"TNM0961\",\"VSS0154\",\"XHW0498\",\"BBS0039\",\"BSS0369\",\"CCA0046\",\"CSC0217\",\"GTD0219\",\"JGT0221\",\"JLM0364\",\"JTM0223\",\"MPM0220\",\"MSO0222\",]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "MwUrXBs3IkoQ" | |
}, | |
"outputs": [], | |
"source": [ | |
"start_date = joint[\"date\"].iloc[0]\n", | |
"end_date = joint[\"date\"].iloc[-1]\n", | |
"time_horizon = (end_date - start_date).days + 1\n", | |
"\n", | |
"def vectorize_dataset(df):\n", | |
" \"\"\"Takes the dataset and featurizes it.\"\"\"\n", | |
" users = set(df[\"user\"].values)\n", | |
" X = np.zeros((len(users), len(feature_map), time_horizon))\n", | |
" y = np.zeros((len(users)))\n", | |
" for index, user in enumerate(users):\n", | |
" x = vectorize_user_time_series(user, df)\n", | |
" X[index, :, :] = x\n", | |
" y[index] = int(user in threat_actors)\n", | |
" return X, y\n", | |
"\n", | |
"\n", | |
"def date_to_index(date):\n", | |
" \"\"\"Indexes dates by counting the number of days since the starting date of the dataset.\"\"\"\n", | |
" return (date - start_date).days\n", | |
"\n", | |
"def extract_time_series_by_user(user_name, df):\n", | |
" \"\"\"Filters the dataframe down to a specific user.\"\"\"\n", | |
" return df[df[\"user\"] == user_name]\n", | |
"\n", | |
"\n", | |
"def vectorize_user_time_series(user_name, df):\n", | |
" \"\"\"Convert the sequence of features of a user to a vector-valued time series.\"\"\"\n", | |
" user_time_series = extract_time_series_by_user(user_name, df)\n", | |
" x = np.zeros((len(feature_map), time_horizon))\n", | |
" event_date_indices = user_time_series[\"date\"].apply(date_to_index).to_numpy()\n", | |
" event_features = user_time_series[\"feature\"].to_numpy()\n", | |
" for i in range(len(event_date_indices)):\n", | |
" x[event_features[i], event_date_indices[i]] += 1\n", | |
" return x\n", | |
"\n", | |
"X, y = vectorize_dataset(joint)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "tRG6CB2oIkoQ" | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.model_selection import train_test_split\n", | |
"\n", | |
"X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)\n", | |
"print(X_train.shape)\n", | |
"print(y_train.shape)\n", | |
"print(X_test.shape)\n", | |
"print(y_test.shape)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "3ueRnq-NIkoR" | |
}, | |
"outputs": [], | |
"source": [ | |
"X_train_reshaped = X_train.reshape(\n", | |
" [X_train.shape[0], X_train.shape[1] * X_train.shape[2]]\n", | |
")\n", | |
"X_test_reshaped = X_test.reshape([X_test.shape[0], X_test.shape[1] * X_test.shape[2]])\n", | |
"X_train_normal = X_train_reshaped[y_train == 0, :]\n", | |
"print(X_train_normal.shape)\n", | |
"X_train_threat = X_train_reshaped[y_train == 1, :]\n", | |
"print(X_train_threat.shape)\n", | |
"X_test_normal = X_test_reshaped[y_test == 0, :]\n", | |
"print(X_test_normal.shape)\n", | |
"X_test_threat = X_test_reshaped[y_test == 1, :]\n", | |
"print(X_test_threat.shape)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Choosing a model and hyper param\n", | |
"\n", | |
"observe training and testing results " | |
], | |
"metadata": { | |
"id": "W9HYW85jr3Fa" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "lqaX9a8uIkoR" | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.ensemble import IsolationForest\n", | |
"\n", | |
"contamination_parameter = 0.035\n", | |
"IF = IsolationForest(\n", | |
" n_estimators=100, max_samples=256, contamination=contamination_parameter\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "Z8iBV42HIkoS" | |
}, | |
"outputs": [], | |
"source": [ | |
"IFIF.fit(X_train_reshaped)\n", | |
"normal_scores = IF.decision_function(X_train_normal)\n", | |
"\n", | |
"import matplotlib.mlab as mlab\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"fig = plt.figure(figsize=(8, 4), dpi=600, facecolor=\"w\", edgecolor=\"k\")\n", | |
"\n", | |
"normal = plt.hist(normal_scores, 50, density=True)\n", | |
"\n", | |
"plt.xlim((-0.2, 0.2))\n", | |
"plt.xlabel(\"Anomaly score\")\n", | |
"plt.ylabel(\"Percentage\")\n", | |
"plt.title(\"Distribution of anomaly score for non threats\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "QJaY7j1hIkoS" | |
}, | |
"outputs": [], | |
"source": [ | |
"anomaly_scores = IF.decision_function(X_train_threat)\n", | |
"\n", | |
"fig = plt.figure(figsize=(8, 4), dpi=600, facecolor=\"w\", edgecolor=\"k\")\n", | |
"\n", | |
"anomaly = plt.hist(anomaly_scores, 50, density=True)\n", | |
"\n", | |
"plt.xlim((-0.2, 0.2))\n", | |
"plt.xlabel(\"Anomaly score\")\n", | |
"plt.ylabel(\"Percentage\")\n", | |
"plt.title(\"Distribution of anomaly score for threats\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Choose a threshold and evaluate results" | |
], | |
"metadata": { | |
"id": "4eiu7r3Irxjj" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "92CmOuspIkoS" | |
}, | |
"outputs": [], | |
"source": [ | |
"cutoff = 0.13 \n", | |
"\n", | |
"from collections import Counter\n", | |
"\n", | |
"s = IF.decision_function(X_train_reshaped)\n", | |
"print(\"training rzlts\")\n", | |
"print(Counter(y_train[cutoff > s]))\n", | |
"\n", | |
"s = IF.decision_function(X_test_reshaped)\n", | |
"print(\"testing rzlts\")\n", | |
"print(Counter(y_test[cutoff > s]))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
}, | |
"colab": { | |
"name": "Insider Threat.ipynb", | |
"provenance": [], | |
"include_colab_link": true | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment