Skip to content

Instantly share code, notes, and snippets.

@ezzeldinadel
Last active August 7, 2024 06:36
Show Gist options
  • Save ezzeldinadel/dfd0b1aba2736f9193a206c0445bc332 to your computer and use it in GitHub Desktop.
Save ezzeldinadel/dfd0b1aba2736f9193a206c0445bc332 to your computer and use it in GitHub Desktop.
Insider Threat.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/ezzeldinadel/dfd0b1aba2736f9193a206c0445bc332/insider-threat.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# cyberdatascience.org \n",
"\n",
"---\n"
],
"metadata": {
"id": "f8nP-yljtijQ"
}
},
{
"cell_type": "markdown",
"source": [
"\n",
"Data acquisition "
],
"metadata": {
"id": "nszA9FQ_tci8"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "sgmPgtQ5IkoI"
},
"outputs": [],
"source": [
"import shutil\n",
"import urllib.request as request\n",
"from contextlib import closing\n",
"\n",
"with closing(request.urlopen('ftp://ftp.sei.cmu.edu/pub/cert-data/r4.2.tar.bz2')) as r:\n",
" with open('r4.2.tar.bz2', 'wb') as f:\n",
" shutil.copyfileobj(r, f)\n",
"!bzip2 -d r4.2.tar.bz2\n",
"!!tar xvf r4.2.tar"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pn2Gss3rIkoJ"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"path_to_dataset = \"./r4.2/\"\n",
"log_types = [\"device\", \"email\", \"file\", \"logon\", \"http\"]\n",
"log_fields_list = [\n",
" [\"date\", \"user\", \"activity\"],\n",
" [\"date\", \"user\", \"to\", \"cc\", \"bcc\"],\n",
" [\"date\", \"user\", \"filename\"],\n",
" [\"date\", \"user\", \"activity\"],\n",
" [\"date\", \"user\", \"url\"],\n",
"]\n",
"features = 0\n",
"feature_map = {}\n",
"\n",
"\n",
"def add_feature(name):\n",
" \"\"\"Add a feature to a dictionary to be encoded.\"\"\"\n",
" if name not in feature_map:\n",
" global features\n",
" feature_map[name] = features\n",
" features += 1"
]
},
{
"cell_type": "markdown",
"source": [
"Data Selection and Feature Engineering"
],
"metadata": {
"id": "VauicJjMtsuB"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "r43uBcFTIkoL"
},
"outputs": [],
"source": [
"add_feature(\"Weekday_Logon_Normal\")\n",
"add_feature(\"Weekday_Logon_After\")\n",
"add_feature(\"Weekend_Logon\")\n",
"add_feature(\"Logoff\")\n",
"\n",
"add_feature(\"Connect_Normal\")\n",
"add_feature(\"Connect_After\")\n",
"add_feature(\"Connect_Weekend\")\n",
"add_feature(\"Disconnect\")\n",
"\n",
"add_feature(\"Email_In\")\n",
"add_feature(\"Email_Out\")\n",
"\n",
"add_feature(\"File_exe\")\n",
"add_feature(\"File_jpg\")\n",
"add_feature(\"File_zip\")\n",
"add_feature(\"File_txt\")\n",
"add_feature(\"File_doc\")\n",
"add_feature(\"File_pdf\")\n",
"add_feature(\"File_other\")\n",
"\n",
"add_feature(\"url\")\n",
"\n",
"def file_features(row):\n",
" \"\"\"Creates a feature recording the file extension of the file used.\"\"\"\n",
" if row[\"filename\"].endswith(\".exe\"):\n",
" return feature_map[\"File_exe\"]\n",
" if row[\"filename\"].endswith(\".jpg\"):\n",
" return feature_map[\"File_jpg\"]\n",
" if row[\"filename\"].endswith(\".zip\"):\n",
" return feature_map[\"File_zip\"]\n",
" if row[\"filename\"].endswith(\".txt\"):\n",
" return feature_map[\"File_txt\"]\n",
" if row[\"filename\"].endswith(\".doc\"):\n",
" return feature_map[\"File_doc\"]\n",
" if row[\"filename\"].endswith(\".pdf\"):\n",
" return feature_map[\"File_pdf\"]\n",
" else:\n",
" return feature_map[\"File_other\"]\n",
"\n",
"\n",
"def email_features(row):\n",
" \"\"\"Creates a feature recording whether an email has been sent externally.\"\"\"\n",
" outsider = False\n",
" if not pd.isnull(row[\"to\"]):\n",
" for address in row[\"to\"].split(\";\"):\n",
" if not address.endswith(\"dtaa.com\"):\n",
" outsider = True\n",
"\n",
" if not pd.isnull(row[\"cc\"]):\n",
" for address in row[\"cc\"].split(\";\"):\n",
" if not address.endswith(\"dtaa.com\"):\n",
" outsider = True\n",
"\n",
" if not pd.isnull(row[\"bcc\"]):\n",
" for address in row[\"bcc\"].split(\";\"):\n",
" if not address.endswith(\"dtaa.com\"):\n",
" outsider = True\n",
" if outsider:\n",
" return feature_map[\"Email_Out\"]\n",
" else:\n",
" return feature_map[\"Email_In\"]\n",
"\n",
"\n",
"def device_features(row):\n",
" \"\"\"Creates a feature for whether the user has connected during normal hours or otherwise.\"\"\"\n",
" if row[\"activity\"] == \"Connect\":\n",
" if row[\"date\"].weekday() < 5:\n",
" if row[\"date\"].hour >= 8 and row[\"date\"].hour < 17:\n",
" return feature_map[\"Connect_Normal\"]\n",
" else:\n",
" return feature_map[\"Connect_After\"]\n",
" else:\n",
" return feature_map[\"Connect_Weekend\"]\n",
" else:\n",
" return feature_map[\"Disconnect\"]\n",
"\n",
"\n",
"def logon_features(row):\n",
" \"\"\"Creates a feature for whether the user logged in during normal hours or otherwise.\"\"\"\n",
" if row[\"activity\"] == \"Logon\":\n",
" if row[\"date\"].weekday() < 5:\n",
" if row[\"date\"].hour >= 8 and row[\"date\"].hour < 17:\n",
" return feature_map[\"Weekday_Logon_Normal\"]\n",
" else:\n",
" return feature_map[\"Weekday_Logon_After\"]\n",
" else:\n",
" return feature_map[\"Weekend_Logon\"]\n",
" else:\n",
" return feature_map[\"Logoff\"]\n",
"\n",
"\n",
"def http_features(row):\n",
" \"\"\"Encodes the URL visited.\"\"\"\n",
" return feature_map[\"url\"]\n",
"\n",
"\n",
"def date_to_day(row):\n",
" \"\"\"Converts a full datetime to date only.\"\"\"\n",
" day_only = row[\"date\"].date()\n",
" return day_only\n",
"\n",
"log_feature_functions = [\n",
" device_features,\n",
" email_features,\n",
" file_features,\n",
" logon_features,\n",
" http_features,\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Mudsnx9DIkoM"
},
"outputs": [],
"source": [
"dfs = []\n",
"for i in range(len(log_types)):\n",
" log_type = log_types[i]\n",
" log_fields = log_fields_list[i]\n",
" log_feature_function = log_feature_functions[i]\n",
" df = pd.read_csv(\n",
" path_to_dataset + log_type + \".csv\", usecols=log_fields, index_col=None\n",
" )\n",
" date_format = \"%m/%d/%Y %H:%M:%S\"\n",
" df[\"date\"] = pd.to_datetime(df[\"date\"], format=date_format)\n",
"\n",
" new_feature = df.apply(log_feature_function, axis=1)\n",
" df[\"feature\"] = new_feature\n",
"\n",
" cols_to_keep = [\"date\", \"user\", \"feature\"]\n",
" df = df[cols_to_keep]\n",
"\n",
" df[\"date\"] = df.apply(date_to_day, axis=1)\n",
"\n",
" dfs.append(df)"
]
},
{
"cell_type": "markdown",
"source": [
"Data processing"
],
"metadata": {
"id": "l_Q12WDitRSd"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "029-qTzDIkoO"
},
"outputs": [],
"source": [
"joint=pd.concat(dfs)\n",
"joint=joint.sort_values(by=\"date\")\n",
"threat_actors=[\"AAM0658\",\"AJR0932\",\"BDV0168\",\"BIH0745\",\"BLS0678\",\"BTL0226\",\"CAH0936\",\"DCH0843\",\"EHB0824\",\"EHD0584\",\"FMG0527\",\"FTM0406\",\"GHL0460\",\"HJB0742\",\"JMB0308\",\"JRG0207\",\"KLH0596\",\"KPC0073\",\"LJR0523\",\"LQC0479\",\"MAR0955\",\"MAS0025\",\"MCF0600\",\"MYD0978\",\"PPF0435\",\"RAB0589\",\"RGG0064\",\"RKD0604\",\"TAP0551\",\"WDD0366\",\"AAF0535\",\"ABC0174\",\"AKR0057\",\"CCL0068\",\"CEJ0109\",\"CQW0652\",\"DIB0285\",\"DRR0162\",\"EDB0714\",\"EGD0132\",\"FSC0601\",\"HBO0413\",\"HXL0968\",\"IJM0776\",\"IKR0401\",\"IUB0565\",\"JJM0203\",\"KRL0501\",\"LCC0819\",\"MDH0580\",\"MOS0047\",\"NWT0098\",\"PNL0301\",\"PSF0133\",\"RAR0725\",\"RHL0992\",\"RMW0542\",\"TNM0961\",\"VSS0154\",\"XHW0498\",\"BBS0039\",\"BSS0369\",\"CCA0046\",\"CSC0217\",\"GTD0219\",\"JGT0221\",\"JLM0364\",\"JTM0223\",\"MPM0220\",\"MSO0222\",]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MwUrXBs3IkoQ"
},
"outputs": [],
"source": [
"start_date = joint[\"date\"].iloc[0]\n",
"end_date = joint[\"date\"].iloc[-1]\n",
"time_horizon = (end_date - start_date).days + 1\n",
"\n",
"def vectorize_dataset(df):\n",
" \"\"\"Takes the dataset and featurizes it.\"\"\"\n",
" users = set(df[\"user\"].values)\n",
" X = np.zeros((len(users), len(feature_map), time_horizon))\n",
" y = np.zeros((len(users)))\n",
" for index, user in enumerate(users):\n",
" x = vectorize_user_time_series(user, df)\n",
" X[index, :, :] = x\n",
" y[index] = int(user in threat_actors)\n",
" return X, y\n",
"\n",
"\n",
"def date_to_index(date):\n",
" \"\"\"Indexes dates by counting the number of days since the starting date of the dataset.\"\"\"\n",
" return (date - start_date).days\n",
"\n",
"def extract_time_series_by_user(user_name, df):\n",
" \"\"\"Filters the dataframe down to a specific user.\"\"\"\n",
" return df[df[\"user\"] == user_name]\n",
"\n",
"\n",
"def vectorize_user_time_series(user_name, df):\n",
" \"\"\"Convert the sequence of features of a user to a vector-valued time series.\"\"\"\n",
" user_time_series = extract_time_series_by_user(user_name, df)\n",
" x = np.zeros((len(feature_map), time_horizon))\n",
" event_date_indices = user_time_series[\"date\"].apply(date_to_index).to_numpy()\n",
" event_features = user_time_series[\"feature\"].to_numpy()\n",
" for i in range(len(event_date_indices)):\n",
" x[event_features[i], event_date_indices[i]] += 1\n",
" return x\n",
"\n",
"X, y = vectorize_dataset(joint)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "tRG6CB2oIkoQ"
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)\n",
"print(X_train.shape)\n",
"print(y_train.shape)\n",
"print(X_test.shape)\n",
"print(y_test.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "3ueRnq-NIkoR"
},
"outputs": [],
"source": [
"X_train_reshaped = X_train.reshape(\n",
" [X_train.shape[0], X_train.shape[1] * X_train.shape[2]]\n",
")\n",
"X_test_reshaped = X_test.reshape([X_test.shape[0], X_test.shape[1] * X_test.shape[2]])\n",
"X_train_normal = X_train_reshaped[y_train == 0, :]\n",
"print(X_train_normal.shape)\n",
"X_train_threat = X_train_reshaped[y_train == 1, :]\n",
"print(X_train_threat.shape)\n",
"X_test_normal = X_test_reshaped[y_test == 0, :]\n",
"print(X_test_normal.shape)\n",
"X_test_threat = X_test_reshaped[y_test == 1, :]\n",
"print(X_test_threat.shape)"
]
},
{
"cell_type": "markdown",
"source": [
"Choosing a model and hyper param\n",
"\n",
"observe training and testing results "
],
"metadata": {
"id": "W9HYW85jr3Fa"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lqaX9a8uIkoR"
},
"outputs": [],
"source": [
"from sklearn.ensemble import IsolationForest\n",
"\n",
"contamination_parameter = 0.035\n",
"IF = IsolationForest(\n",
" n_estimators=100, max_samples=256, contamination=contamination_parameter\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Z8iBV42HIkoS"
},
"outputs": [],
"source": [
"IFIF.fit(X_train_reshaped)\n",
"normal_scores = IF.decision_function(X_train_normal)\n",
"\n",
"import matplotlib.mlab as mlab\n",
"import matplotlib.pyplot as plt\n",
"\n",
"fig = plt.figure(figsize=(8, 4), dpi=600, facecolor=\"w\", edgecolor=\"k\")\n",
"\n",
"normal = plt.hist(normal_scores, 50, density=True)\n",
"\n",
"plt.xlim((-0.2, 0.2))\n",
"plt.xlabel(\"Anomaly score\")\n",
"plt.ylabel(\"Percentage\")\n",
"plt.title(\"Distribution of anomaly score for non threats\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "QJaY7j1hIkoS"
},
"outputs": [],
"source": [
"anomaly_scores = IF.decision_function(X_train_threat)\n",
"\n",
"fig = plt.figure(figsize=(8, 4), dpi=600, facecolor=\"w\", edgecolor=\"k\")\n",
"\n",
"anomaly = plt.hist(anomaly_scores, 50, density=True)\n",
"\n",
"plt.xlim((-0.2, 0.2))\n",
"plt.xlabel(\"Anomaly score\")\n",
"plt.ylabel(\"Percentage\")\n",
"plt.title(\"Distribution of anomaly score for threats\")"
]
},
{
"cell_type": "markdown",
"source": [
"Choose a threshold and evaluate results"
],
"metadata": {
"id": "4eiu7r3Irxjj"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "92CmOuspIkoS"
},
"outputs": [],
"source": [
"cutoff = 0.13 \n",
"\n",
"from collections import Counter\n",
"\n",
"s = IF.decision_function(X_train_reshaped)\n",
"print(\"training rzlts\")\n",
"print(Counter(y_train[cutoff > s]))\n",
"\n",
"s = IF.decision_function(X_test_reshaped)\n",
"print(\"testing rzlts\")\n",
"print(Counter(y_test[cutoff > s]))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
},
"colab": {
"name": "Insider Threat.ipynb",
"provenance": [],
"include_colab_link": true
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment