ezzeldinadel · August 7, 2024 06:36
diff --git a/insider-threat.ipynb b/insider-threat.ipynb
 {
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/ezzeldinadel/dfd0b1aba2736f9193a206c0445bc332/insider-threat.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# cyberdatascience.org \n",
        "\n",
        "---\n"
      ],
      "metadata": {
        "id": "f8nP-yljtijQ"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "\n",
        "Data acquisition "
      ],
      "metadata": {
        "id": "nszA9FQ_tci8"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "sgmPgtQ5IkoI"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "import shutil\n",
        "import tarfile\n",
        "import urllib.request as request\n",
        "from contextlib import closing\n",
        "\n",
        "# Remote archive and the local file name it is saved under.\n",
        "dataset_url = 'ftp://ftp.sei.cmu.edu/pub/cert-data/r4.2.tar.bz2'\n",
        "archive_name = 'r4.2.tar.bz2'\n",
        "\n",
        "# Skip the transfer when an earlier run already fetched the archive. The data\n",
        "# is streamed to a temporary name first, so an interrupted download is never\n",
        "# mistaken for a complete archive on the next run.\n",
        "if not os.path.exists(archive_name):\n",
        "    partial_name = archive_name + '.part'\n",
        "    with closing(request.urlopen(dataset_url)) as r:\n",
        "        with open(partial_name, 'wb') as f:\n",
        "            shutil.copyfileobj(r, f)\n",
        "    os.replace(partial_name, archive_name)\n",
        "\n",
        "# Decompress and unpack in one step with the standard library instead of\n",
        "# shelling out to bzip2 and tar, so the cell needs neither binary and can be\n",
        "# re-run without tripping over an already-decompressed r4.2.tar.\n",
        "with tarfile.open(archive_name, 'r:bz2') as archive:\n",
        "    archive.extractall()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "pn2Gss3rIkoJ"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import pandas as pd\n",
        "# Folder holding the extracted logs; each log is read from\n",
        "# path_to_dataset + log_type + \".csv\".\n",
        "path_to_dataset = \"./r4.2/\"\n",
        "\n",
        "# log_fields_list is index-aligned with log_types and names the CSV columns\n",
        "# loaded for that log.\n",
        "log_types = [\"device\", \"email\", \"file\", \"logon\", \"http\"]\n",
        "log_fields_list = [\n",
        "    [\"date\", \"user\", \"activity\"],  # device\n",
        "    [\"date\", \"user\", \"to\", \"cc\", \"bcc\"],  # email\n",
        "    [\"date\", \"user\", \"filename\"],  # file\n",
        "    [\"date\", \"user\", \"activity\"],  # logon\n",
        "    [\"date\", \"user\", \"url\"],  # http\n",
        "]\n",
        "\n",
        "# Registry of feature name -> integer index, and the next index to hand out.\n",
        "feature_map = {}\n",
        "features = 0\n",
        "\n",
        "\n",
        "def add_feature(name):\n",
        "    \"\"\"Register ``name`` in ``feature_map`` under the next free integer index.\n",
        "\n",
        "    A name that is already registered keeps its index and the counter is\n",
        "    left untouched.\n",
        "    \"\"\"\n",
        "    global features\n",
        "    if name in feature_map:\n",
        "        return\n",
        "    feature_map[name] = features\n",
        "    features = features + 1"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Data Selection and Feature Engineering"
      ],
      "metadata": {
        "id": "VauicJjMtsuB"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "r43uBcFTIkoL"
      },
      "outputs": [],
      "source": [
        "# Registration order fixes each feature's integer index, so the order of the\n",
        "# names below must not change.\n",
        "for feature_name in (\n",
        "    # logon events\n",
        "    \"Weekday_Logon_Normal\",\n",
        "    \"Weekday_Logon_After\",\n",
        "    \"Weekend_Logon\",\n",
        "    \"Logoff\",\n",
        "    # device events\n",
        "    \"Connect_Normal\",\n",
        "    \"Connect_After\",\n",
        "    \"Connect_Weekend\",\n",
        "    \"Disconnect\",\n",
        "    # email events\n",
        "    \"Email_In\",\n",
        "    \"Email_Out\",\n",
        "    # file events\n",
        "    \"File_exe\",\n",
        "    \"File_jpg\",\n",
        "    \"File_zip\",\n",
        "    \"File_txt\",\n",
        "    \"File_doc\",\n",
        "    \"File_pdf\",\n",
        "    \"File_other\",\n",
        "    # http events\n",
        "    \"url\",\n",
        "):\n",
        "    add_feature(feature_name)\n",
        "\n",
        "def file_features(row):\n",
        "    \"\"\"Return the feature index for the extension of ``row['filename']``.\n",
        "\n",
        "    The suffixes .exe, .jpg, .zip, .txt, .doc and .pdf are matched\n",
        "    case-sensitively; every other file name maps to ``File_other``.\n",
        "    \"\"\"\n",
        "    known_suffixes = (\n",
        "        (\".exe\", \"File_exe\"),\n",
        "        (\".jpg\", \"File_jpg\"),\n",
        "        (\".zip\", \"File_zip\"),\n",
        "        (\".txt\", \"File_txt\"),\n",
        "        (\".doc\", \"File_doc\"),\n",
        "        (\".pdf\", \"File_pdf\"),\n",
        "    )\n",
        "    for suffix, feature_name in known_suffixes:\n",
        "        if row[\"filename\"].endswith(suffix):\n",
        "            return feature_map[feature_name]\n",
        "    return feature_map[\"File_other\"]\n",
        "\n",
        "\n",
        "def email_features(row):\n",
        "    \"\"\"Classify an email row as internal-only or sent to an outside address.\n",
        "\n",
        "    Args:\n",
        "        row: Mapping with ``to``, ``cc`` and ``bcc`` entries, each either null\n",
        "            or a ';'-separated string of addresses.\n",
        "\n",
        "    Returns:\n",
        "        ``feature_map['Email_Out']`` if any recipient is outside the dtaa.com\n",
        "        domain, otherwise ``feature_map['Email_In']``.\n",
        "    \"\"\"\n",
        "    # Anchor the match on the '@' / '.' boundary so that look-alike domains\n",
        "    # such as 'user@notdtaa.com' are not mistaken for internal addresses.\n",
        "    internal_suffixes = (\"@dtaa.com\", \".dtaa.com\")\n",
        "    for field in (\"to\", \"cc\", \"bcc\"):\n",
        "        recipients = row[field]\n",
        "        if pd.isnull(recipients):\n",
        "            continue\n",
        "        for address in recipients.split(\";\"):\n",
        "            if not address.endswith(internal_suffixes):\n",
        "                # One outside recipient is enough; no need to scan further.\n",
        "                return feature_map[\"Email_Out\"]\n",
        "    return feature_map[\"Email_In\"]\n",
        "\n",
        "\n",
        "def device_features(row):\n",
        "    \"\"\"Return the feature index for one device-log row.\n",
        "\n",
        "    Any activity other than ``Connect`` maps to ``Disconnect``. Connects are\n",
        "    bucketed by ``row['date']``: Monday-Friday from 08:00 up to 16:59 is\n",
        "    ``Connect_Normal``, other weekday hours are ``Connect_After`` and\n",
        "    Saturday/Sunday is ``Connect_Weekend``.\n",
        "    \"\"\"\n",
        "    if row[\"activity\"] != \"Connect\":\n",
        "        return feature_map[\"Disconnect\"]\n",
        "    timestamp = row[\"date\"]\n",
        "    if timestamp.weekday() >= 5:\n",
        "        return feature_map[\"Connect_Weekend\"]\n",
        "    if 8 <= timestamp.hour < 17:\n",
        "        return feature_map[\"Connect_Normal\"]\n",
        "    return feature_map[\"Connect_After\"]\n",
        "\n",
        "\n",
        "def logon_features(row):\n",
        "    \"\"\"Return the feature index for one logon-log row.\n",
        "\n",
        "    Any activity other than ``Logon`` maps to ``Logoff``. Logons are bucketed\n",
        "    by ``row['date']``: Monday-Friday from 08:00 up to 16:59 is\n",
        "    ``Weekday_Logon_Normal``, other weekday hours are ``Weekday_Logon_After``\n",
        "    and Saturday/Sunday is ``Weekend_Logon``.\n",
        "    \"\"\"\n",
        "    if row[\"activity\"] != \"Logon\":\n",
        "        return feature_map[\"Logoff\"]\n",
        "    timestamp = row[\"date\"]\n",
        "    if timestamp.weekday() >= 5:\n",
        "        return feature_map[\"Weekend_Logon\"]\n",
        "    if 8 <= timestamp.hour < 17:\n",
        "        return feature_map[\"Weekday_Logon_Normal\"]\n",
        "    return feature_map[\"Weekday_Logon_After\"]\n",
        "\n",
        "\n",
        "def http_features(row):\n",
        "    \"\"\"Return the single ``url`` feature index; ``row`` itself is not inspected.\"\"\"\n",
        "    url_feature = feature_map[\"url\"]\n",
        "    return url_feature\n",
        "\n",
        "\n",
        "def date_to_day(row):\n",
        "    \"\"\"Return the calendar date of ``row['date']``, dropping the time of day.\"\"\"\n",
        "    return row[\"date\"].date()\n",
        "\n",
        "# Row-level feature extractors, index-aligned with log_types\n",
        "# (device, email, file, logon, http).\n",
        "log_feature_functions = [\n",
        "    device_features, email_features, file_features,\n",
        "    logon_features, http_features,\n",
        "]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Mudsnx9DIkoM"
      },
      "outputs": [],
      "source": [
        "date_format = \"%m/%d/%Y %H:%M:%S\"\n",
        "cols_to_keep = [\"date\", \"user\", \"feature\"]\n",
        "\n",
        "# Build one (date, user, feature) frame per log type.\n",
        "dfs = []\n",
        "for log_type, log_fields, log_feature_function in zip(\n",
        "    log_types, log_fields_list, log_feature_functions\n",
        "):\n",
        "    df = pd.read_csv(\n",
        "        path_to_dataset + log_type + \".csv\", usecols=log_fields, index_col=None\n",
        "    )\n",
        "    df[\"date\"] = pd.to_datetime(df[\"date\"], format=date_format)\n",
        "\n",
        "    # The extractors need the full timestamp (hour, weekday), so they run\n",
        "    # before the dates are truncated to days.\n",
        "    df[\"feature\"] = df.apply(log_feature_function, axis=1)\n",
        "\n",
        "    # Copy the column subset so the assignment below writes to an independent\n",
        "    # frame rather than a slice of the raw log, which pandas reports with a\n",
        "    # SettingWithCopyWarning.\n",
        "    df = df[cols_to_keep].copy()\n",
        "\n",
        "    # Column-wise truncation to calendar days; yields the same date objects\n",
        "    # as applying date_to_day to every row.\n",
        "    df[\"date\"] = df[\"date\"].dt.date\n",
        "\n",
        "    dfs.append(df)"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Data processing"
      ],
      "metadata": {
        "id": "l_Q12WDitRSd"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "029-qTzDIkoO"
      },
      "outputs": [],
      "source": [
        "# Stack the per-log event frames into one table ordered by day.\n",
        "joint = pd.concat(dfs).sort_values(by=\"date\")\n",
        "\n",
        "# User ids labelled as threats (the positive class when y is built).\n",
        "threat_actors = [\n",
        "    \"AAM0658\", \"AJR0932\", \"BDV0168\", \"BIH0745\", \"BLS0678\", \"BTL0226\", \"CAH0936\", \"DCH0843\", \"EHB0824\", \"EHD0584\",\n",
        "    \"FMG0527\", \"FTM0406\", \"GHL0460\", \"HJB0742\", \"JMB0308\", \"JRG0207\", \"KLH0596\", \"KPC0073\", \"LJR0523\", \"LQC0479\",\n",
        "    \"MAR0955\", \"MAS0025\", \"MCF0600\", \"MYD0978\", \"PPF0435\", \"RAB0589\", \"RGG0064\", \"RKD0604\", \"TAP0551\", \"WDD0366\",\n",
        "    \"AAF0535\", \"ABC0174\", \"AKR0057\", \"CCL0068\", \"CEJ0109\", \"CQW0652\", \"DIB0285\", \"DRR0162\", \"EDB0714\", \"EGD0132\",\n",
        "    \"FSC0601\", \"HBO0413\", \"HXL0968\", \"IJM0776\", \"IKR0401\", \"IUB0565\", \"JJM0203\", \"KRL0501\", \"LCC0819\", \"MDH0580\",\n",
        "    \"MOS0047\", \"NWT0098\", \"PNL0301\", \"PSF0133\", \"RAR0725\", \"RHL0992\", \"RMW0542\", \"TNM0961\", \"VSS0154\", \"XHW0498\",\n",
        "    \"BBS0039\", \"BSS0369\", \"CCA0046\", \"CSC0217\", \"GTD0219\", \"JGT0221\", \"JLM0364\", \"JTM0223\", \"MPM0220\", \"MSO0222\",\n",
        "]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "MwUrXBs3IkoQ"
      },
      "outputs": [],
      "source": [
        "# joint is sorted by date, so its first and last rows bound the observed period.\n",
        "start_date, end_date = joint[\"date\"].iloc[[0, -1]]\n",
        "# Number of calendar days covered, counting both endpoints.\n",
        "time_horizon = (end_date - start_date).days + 1\n",
        "\n",
        "def vectorize_dataset(df):\n",
        "    \"\"\"Build the per-user feature tensor and threat labels for ``df``.\n",
        "\n",
        "    Args:\n",
        "        df: Event table with ``user``, ``date`` and ``feature`` columns.\n",
        "\n",
        "    Returns:\n",
        "        Tuple ``(X, y)``. ``X`` has shape ``(n_users, len(feature_map),\n",
        "        time_horizon)`` and holds per-user event counts; ``y`` has shape\n",
        "        ``(n_users,)`` and is 1.0 for users listed in ``threat_actors`` and\n",
        "        0.0 otherwise. Rows follow the sorted order of the user ids; events\n",
        "        with a missing user id are ignored.\n",
        "    \"\"\"\n",
        "    # Sort the ids: iterating a bare set of strings gives an order that can\n",
        "    # change between interpreter runs, which would shuffle the rows of X and y\n",
        "    # and make every downstream result irreproducible.\n",
        "    users = sorted(set(df[\"user\"].dropna()))\n",
        "    known_threats = set(threat_actors)\n",
        "    X = np.zeros((len(users), len(feature_map), time_horizon))\n",
        "    y = np.zeros(len(users))\n",
        "    for index, user in enumerate(users):\n",
        "        X[index, :, :] = vectorize_user_time_series(user, df)\n",
        "        y[index] = int(user in known_threats)\n",
        "    return X, y\n",
        "\n",
        "\n",
        "def date_to_index(date):\n",
        "    \"\"\"Return how many whole days ``date`` lies after the global ``start_date``.\"\"\"\n",
        "    elapsed = date - start_date\n",
        "    return elapsed.days\n",
        "\n",
        "def extract_time_series_by_user(user_name, df):\n",
        "    \"\"\"Return only the rows of ``df`` whose ``user`` value equals ``user_name``.\"\"\"\n",
        "    belongs_to_user = df[\"user\"] == user_name\n",
        "    return df.loc[belongs_to_user]\n",
        "\n",
        "\n",
        "def vectorize_user_time_series(user_name, df):\n",
        "    \"\"\"Count one user's events per (feature, day) cell.\n",
        "\n",
        "    Returns an array of shape ``(len(feature_map), time_horizon)`` whose entry\n",
        "    ``[f, d]`` is the number of the user's events with feature index ``f`` on\n",
        "    day index ``d``.\n",
        "    \"\"\"\n",
        "    user_events = extract_time_series_by_user(user_name, df)\n",
        "    day_indices = user_events[\"date\"].apply(date_to_index).to_numpy()\n",
        "    feature_indices = user_events[\"feature\"].to_numpy()\n",
        "    counts = np.zeros((len(feature_map), time_horizon))\n",
        "    for feature_index, day_index in zip(feature_indices, day_indices):\n",
        "        counts[feature_index, day_index] += 1\n",
        "    return counts\n",
        "\n",
        "# X: (users, features, days) event counts; y: one 0/1 threat label per user.\n",
        "X, y = vectorize_dataset(df=joint)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "tRG6CB2oIkoQ"
      },
      "outputs": [],
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "# Fixed seed so the split is repeatable for a given X and y.\n",
        "SPLIT_SEED = 42\n",
        "\n",
        "# Stratify on y so train and test keep the same share of threat users.\n",
        "X_train, X_test, y_train, y_test = train_test_split(\n",
        "    X, y, stratify=y, random_state=SPLIT_SEED\n",
        ")\n",
        "print(X_train.shape)\n",
        "print(y_train.shape)\n",
        "print(X_test.shape)\n",
        "print(y_test.shape)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "3ueRnq-NIkoR"
      },
      "outputs": [],
      "source": [
        "def _flatten_per_user(tensor):\n",
        "    \"\"\"Collapse the last two axes of a 3-D array into one, keeping axis 0.\"\"\"\n",
        "    n_rows, n_features, n_days = tensor.shape\n",
        "    return tensor.reshape([n_rows, n_features * n_days])\n",
        "\n",
        "\n",
        "X_train_reshaped = _flatten_per_user(X_train)\n",
        "X_test_reshaped = _flatten_per_user(X_test)\n",
        "\n",
        "# Split each set by label: 0 = normal user, 1 = threat actor.\n",
        "X_train_normal = X_train_reshaped[y_train == 0]\n",
        "X_train_threat = X_train_reshaped[y_train == 1]\n",
        "X_test_normal = X_test_reshaped[y_test == 0]\n",
        "X_test_threat = X_test_reshaped[y_test == 1]\n",
        "\n",
        "for subset in (X_train_normal, X_train_threat, X_test_normal, X_test_threat):\n",
        "    print(subset.shape)"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Choosing a model and hyperparameters\n",
        "\n",
        "Observe the training and testing results"
      ],
      "metadata": {
        "id": "W9HYW85jr3Fa"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "lqaX9a8uIkoR"
      },
      "outputs": [],
      "source": [
        "from sklearn.ensemble import IsolationForest\n",
        "\n",
        "contamination_parameter = 0.035\n",
        "# Fixed seed: the forest is built from random sub-samples and random splits,\n",
        "# so without it the anomaly scores change from run to run.\n",
        "MODEL_SEED = 42\n",
        "IF = IsolationForest(\n",
        "    n_estimators=100,\n",
        "    max_samples=256,\n",
        "    contamination=contamination_parameter,\n",
        "    random_state=MODEL_SEED,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Z8iBV42HIkoS"
      },
      "outputs": [],
      "source": [
        "# Fit on the whole training set (normal and threat users alike), then score\n",
        "# only the normal users. The original called the undefined name IFIF here,\n",
        "# which raised NameError before the model was ever fitted.\n",
        "IF.fit(X_train_reshaped)\n",
        "normal_scores = IF.decision_function(X_train_normal)\n",
        "\n",
        "import matplotlib.mlab as mlab\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# Histogram of decision_function scores for the normal training users.\n",
        "fig, ax = plt.subplots(figsize=(8, 4), dpi=600, facecolor=\"w\", edgecolor=\"k\")\n",
        "normal = ax.hist(normal_scores, bins=50, density=True)\n",
        "ax.set_xlim(-0.2, 0.2)\n",
        "ax.set_xlabel(\"Anomaly score\")\n",
        "ax.set_ylabel(\"Percentage\")\n",
        "ax.set_title(\"Distribution of anomaly score for non threats\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "QJaY7j1hIkoS"
      },
      "outputs": [],
      "source": [
        "# Same histogram as for the normal users, for the threat users in the training set.\n",
        "anomaly_scores = IF.decision_function(X_train_threat)\n",
        "\n",
        "fig, ax = plt.subplots(figsize=(8, 4), dpi=600, facecolor=\"w\", edgecolor=\"k\")\n",
        "anomaly = ax.hist(anomaly_scores, bins=50, density=True)\n",
        "ax.set_xlim(-0.2, 0.2)\n",
        "ax.set_xlabel(\"Anomaly score\")\n",
        "ax.set_ylabel(\"Percentage\")\n",
        "ax.set_title(\"Distribution of anomaly score for threats\")"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Choose a threshold and evaluate results"
      ],
      "metadata": {
        "id": "4eiu7r3Irxjj"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "92CmOuspIkoS"
      },
      "outputs": [],
      "source": [
        "from collections import Counter\n",
        "\n",
        "# Users whose decision_function score falls below this value are flagged.\n",
        "cutoff = 0.13\n",
        "\n",
        "# For each split, count the true labels (0 = normal, 1 = threat) among the\n",
        "# flagged users.\n",
        "for heading, samples, labels in (\n",
        "    (\"training rzlts\", X_train_reshaped, y_train),\n",
        "    (\"testing rzlts\", X_test_reshaped, y_test),\n",
        "):\n",
        "    s = IF.decision_function(samples)\n",
        "    print(heading)\n",
        "    print(Counter(labels[s < cutoff]))"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.8"
    },
    "colab": {
      "name": "Insider Threat.ipynb",
      "provenance": [],
      "include_colab_link": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/ezzeldinadel/dfd0b1aba2736f9193a206c0445bc332/insider-threat.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"# cyberdatascience.org \n",
	"\n",
	"---\n"
	],
	"metadata": {
	"id": "f8nP-yljtijQ"
	}
	},
	{
	"cell_type": "markdown",
	"source": [
	"\n",
	"Data acquisition "
	],
	"metadata": {
	"id": "nszA9FQ_tci8"
	}
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "sgmPgtQ5IkoI"
	},
	"outputs": [],
	"source": [
	"import os\n",
	"import shutil\n",
	"import tarfile\n",
	"import urllib.request as request\n",
	"from contextlib import closing\n",
	"\n",
	"# Remote archive and the local file name it is saved under.\n",
	"dataset_url = 'ftp://ftp.sei.cmu.edu/pub/cert-data/r4.2.tar.bz2'\n",
	"archive_name = 'r4.2.tar.bz2'\n",
	"\n",
	"# Skip the transfer when an earlier run already fetched the archive. The data\n",
	"# is streamed to a temporary name first, so an interrupted download is never\n",
	"# mistaken for a complete archive on the next run.\n",
	"if not os.path.exists(archive_name):\n",
	"    partial_name = archive_name + '.part'\n",
	"    with closing(request.urlopen(dataset_url)) as r:\n",
	"        with open(partial_name, 'wb') as f:\n",
	"            shutil.copyfileobj(r, f)\n",
	"    os.replace(partial_name, archive_name)\n",
	"\n",
	"# Decompress and unpack in one step with the standard library instead of\n",
	"# shelling out to bzip2 and tar, so the cell needs neither binary and can be\n",
	"# re-run without tripping over an already-decompressed r4.2.tar.\n",
	"with tarfile.open(archive_name, 'r:bz2') as archive:\n",
	"    archive.extractall()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "pn2Gss3rIkoJ"
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"# Folder holding the extracted logs; each log is read from\n",
	"# path_to_dataset + log_type + \".csv\".\n",
	"path_to_dataset = \"./r4.2/\"\n",
	"\n",
	"# log_fields_list is index-aligned with log_types and names the CSV columns\n",
	"# loaded for that log.\n",
	"log_types = [\"device\", \"email\", \"file\", \"logon\", \"http\"]\n",
	"log_fields_list = [\n",
	"    [\"date\", \"user\", \"activity\"],  # device\n",
	"    [\"date\", \"user\", \"to\", \"cc\", \"bcc\"],  # email\n",
	"    [\"date\", \"user\", \"filename\"],  # file\n",
	"    [\"date\", \"user\", \"activity\"],  # logon\n",
	"    [\"date\", \"user\", \"url\"],  # http\n",
	"]\n",
	"\n",
	"# Registry of feature name -> integer index, and the next index to hand out.\n",
	"feature_map = {}\n",
	"features = 0\n",
	"\n",
	"\n",
	"def add_feature(name):\n",
	"    \"\"\"Register ``name`` in ``feature_map`` under the next free integer index.\n",
	"\n",
	"    A name that is already registered keeps its index and the counter is\n",
	"    left untouched.\n",
	"    \"\"\"\n",
	"    global features\n",
	"    if name in feature_map:\n",
	"        return\n",
	"    feature_map[name] = features\n",
	"    features = features + 1"
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"Data Selection and Feature Engineering"
	],
	"metadata": {
	"id": "VauicJjMtsuB"
	}
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "r43uBcFTIkoL"
	},
	"outputs": [],
	"source": [
	"# Registration order fixes each feature's integer index, so the order of the\n",
	"# names below must not change.\n",
	"for feature_name in (\n",
	"    # logon events\n",
	"    \"Weekday_Logon_Normal\",\n",
	"    \"Weekday_Logon_After\",\n",
	"    \"Weekend_Logon\",\n",
	"    \"Logoff\",\n",
	"    # device events\n",
	"    \"Connect_Normal\",\n",
	"    \"Connect_After\",\n",
	"    \"Connect_Weekend\",\n",
	"    \"Disconnect\",\n",
	"    # email events\n",
	"    \"Email_In\",\n",
	"    \"Email_Out\",\n",
	"    # file events\n",
	"    \"File_exe\",\n",
	"    \"File_jpg\",\n",
	"    \"File_zip\",\n",
	"    \"File_txt\",\n",
	"    \"File_doc\",\n",
	"    \"File_pdf\",\n",
	"    \"File_other\",\n",
	"    # http events\n",
	"    \"url\",\n",
	"):\n",
	"    add_feature(feature_name)\n",
	"\n",
	"def file_features(row):\n",
	"    \"\"\"Return the feature index for the extension of ``row['filename']``.\n",
	"\n",
	"    The suffixes .exe, .jpg, .zip, .txt, .doc and .pdf are matched\n",
	"    case-sensitively; every other file name maps to ``File_other``.\n",
	"    \"\"\"\n",
	"    known_suffixes = (\n",
	"        (\".exe\", \"File_exe\"),\n",
	"        (\".jpg\", \"File_jpg\"),\n",
	"        (\".zip\", \"File_zip\"),\n",
	"        (\".txt\", \"File_txt\"),\n",
	"        (\".doc\", \"File_doc\"),\n",
	"        (\".pdf\", \"File_pdf\"),\n",
	"    )\n",
	"    for suffix, feature_name in known_suffixes:\n",
	"        if row[\"filename\"].endswith(suffix):\n",
	"            return feature_map[feature_name]\n",
	"    return feature_map[\"File_other\"]\n",
	"\n",
	"\n",
	"def email_features(row):\n",
	"    \"\"\"Classify an email row as internal-only or sent to an outside address.\n",
	"\n",
	"    Args:\n",
	"        row: Mapping with ``to``, ``cc`` and ``bcc`` entries, each either null\n",
	"            or a ';'-separated string of addresses.\n",
	"\n",
	"    Returns:\n",
	"        ``feature_map['Email_Out']`` if any recipient is outside the dtaa.com\n",
	"        domain, otherwise ``feature_map['Email_In']``.\n",
	"    \"\"\"\n",
	"    # Anchor the match on the '@' / '.' boundary so that look-alike domains\n",
	"    # such as 'user@notdtaa.com' are not mistaken for internal addresses.\n",
	"    internal_suffixes = (\"@dtaa.com\", \".dtaa.com\")\n",
	"    for field in (\"to\", \"cc\", \"bcc\"):\n",
	"        recipients = row[field]\n",
	"        if pd.isnull(recipients):\n",
	"            continue\n",
	"        for address in recipients.split(\";\"):\n",
	"            if not address.endswith(internal_suffixes):\n",
	"                # One outside recipient is enough; no need to scan further.\n",
	"                return feature_map[\"Email_Out\"]\n",
	"    return feature_map[\"Email_In\"]\n",
	"\n",
	"\n",
	"def device_features(row):\n",
	"    \"\"\"Return the feature index for one device-log row.\n",
	"\n",
	"    Any activity other than ``Connect`` maps to ``Disconnect``. Connects are\n",
	"    bucketed by ``row['date']``: Monday-Friday from 08:00 up to 16:59 is\n",
	"    ``Connect_Normal``, other weekday hours are ``Connect_After`` and\n",
	"    Saturday/Sunday is ``Connect_Weekend``.\n",
	"    \"\"\"\n",
	"    if row[\"activity\"] != \"Connect\":\n",
	"        return feature_map[\"Disconnect\"]\n",
	"    timestamp = row[\"date\"]\n",
	"    if timestamp.weekday() >= 5:\n",
	"        return feature_map[\"Connect_Weekend\"]\n",
	"    if 8 <= timestamp.hour < 17:\n",
	"        return feature_map[\"Connect_Normal\"]\n",
	"    return feature_map[\"Connect_After\"]\n",
	"\n",
	"\n",
	"def logon_features(row):\n",
	"    \"\"\"Return the feature index for one logon-log row.\n",
	"\n",
	"    Any activity other than ``Logon`` maps to ``Logoff``. Logons are bucketed\n",
	"    by ``row['date']``: Monday-Friday from 08:00 up to 16:59 is\n",
	"    ``Weekday_Logon_Normal``, other weekday hours are ``Weekday_Logon_After``\n",
	"    and Saturday/Sunday is ``Weekend_Logon``.\n",
	"    \"\"\"\n",
	"    if row[\"activity\"] != \"Logon\":\n",
	"        return feature_map[\"Logoff\"]\n",
	"    timestamp = row[\"date\"]\n",
	"    if timestamp.weekday() >= 5:\n",
	"        return feature_map[\"Weekend_Logon\"]\n",
	"    if 8 <= timestamp.hour < 17:\n",
	"        return feature_map[\"Weekday_Logon_Normal\"]\n",
	"    return feature_map[\"Weekday_Logon_After\"]\n",
	"\n",
	"\n",
	"def http_features(row):\n",
	"    \"\"\"Return the single ``url`` feature index; ``row`` itself is not inspected.\"\"\"\n",
	"    url_feature = feature_map[\"url\"]\n",
	"    return url_feature\n",
	"\n",
	"\n",
	"def date_to_day(row):\n",
	"    \"\"\"Return the calendar date of ``row['date']``, dropping the time of day.\"\"\"\n",
	"    return row[\"date\"].date()\n",
	"\n",
	"# Row-level feature extractors, index-aligned with log_types\n",
	"# (device, email, file, logon, http).\n",
	"log_feature_functions = [\n",
	"    device_features, email_features, file_features,\n",
	"    logon_features, http_features,\n",
	"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "Mudsnx9DIkoM"
	},
	"outputs": [],
	"source": [
	"date_format = \"%m/%d/%Y %H:%M:%S\"\n",
	"cols_to_keep = [\"date\", \"user\", \"feature\"]\n",
	"\n",
	"# Build one (date, user, feature) frame per log type.\n",
	"dfs = []\n",
	"for log_type, log_fields, log_feature_function in zip(\n",
	"    log_types, log_fields_list, log_feature_functions\n",
	"):\n",
	"    df = pd.read_csv(\n",
	"        path_to_dataset + log_type + \".csv\", usecols=log_fields, index_col=None\n",
	"    )\n",
	"    df[\"date\"] = pd.to_datetime(df[\"date\"], format=date_format)\n",
	"\n",
	"    # The extractors need the full timestamp (hour, weekday), so they run\n",
	"    # before the dates are truncated to days.\n",
	"    df[\"feature\"] = df.apply(log_feature_function, axis=1)\n",
	"\n",
	"    # Copy the column subset so the assignment below writes to an independent\n",
	"    # frame rather than a slice of the raw log, which pandas reports with a\n",
	"    # SettingWithCopyWarning.\n",
	"    df = df[cols_to_keep].copy()\n",
	"\n",
	"    # Column-wise truncation to calendar days; yields the same date objects\n",
	"    # as applying date_to_day to every row.\n",
	"    df[\"date\"] = df[\"date\"].dt.date\n",
	"\n",
	"    dfs.append(df)"
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"Data processing"
	],
	"metadata": {
	"id": "l_Q12WDitRSd"
	}
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "029-qTzDIkoO"
	},
	"outputs": [],
	"source": [
	"# Stack the per-log event frames into one table ordered by day.\n",
	"joint = pd.concat(dfs).sort_values(by=\"date\")\n",
	"\n",
	"# User ids labelled as threats (the positive class when y is built).\n",
	"threat_actors = [\n",
	"    \"AAM0658\", \"AJR0932\", \"BDV0168\", \"BIH0745\", \"BLS0678\", \"BTL0226\", \"CAH0936\", \"DCH0843\", \"EHB0824\", \"EHD0584\",\n",
	"    \"FMG0527\", \"FTM0406\", \"GHL0460\", \"HJB0742\", \"JMB0308\", \"JRG0207\", \"KLH0596\", \"KPC0073\", \"LJR0523\", \"LQC0479\",\n",
	"    \"MAR0955\", \"MAS0025\", \"MCF0600\", \"MYD0978\", \"PPF0435\", \"RAB0589\", \"RGG0064\", \"RKD0604\", \"TAP0551\", \"WDD0366\",\n",
	"    \"AAF0535\", \"ABC0174\", \"AKR0057\", \"CCL0068\", \"CEJ0109\", \"CQW0652\", \"DIB0285\", \"DRR0162\", \"EDB0714\", \"EGD0132\",\n",
	"    \"FSC0601\", \"HBO0413\", \"HXL0968\", \"IJM0776\", \"IKR0401\", \"IUB0565\", \"JJM0203\", \"KRL0501\", \"LCC0819\", \"MDH0580\",\n",
	"    \"MOS0047\", \"NWT0098\", \"PNL0301\", \"PSF0133\", \"RAR0725\", \"RHL0992\", \"RMW0542\", \"TNM0961\", \"VSS0154\", \"XHW0498\",\n",
	"    \"BBS0039\", \"BSS0369\", \"CCA0046\", \"CSC0217\", \"GTD0219\", \"JGT0221\", \"JLM0364\", \"JTM0223\", \"MPM0220\", \"MSO0222\",\n",
	"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "MwUrXBs3IkoQ"
	},
	"outputs": [],
	"source": [
	"# joint is sorted by date, so its first and last rows bound the observed period.\n",
	"start_date, end_date = joint[\"date\"].iloc[[0, -1]]\n",
	"# Number of calendar days covered, counting both endpoints.\n",
	"time_horizon = (end_date - start_date).days + 1\n",
	"\n",
	"def vectorize_dataset(df):\n",
	"    \"\"\"Build the per-user feature tensor and threat labels for ``df``.\n",
	"\n",
	"    Args:\n",
	"        df: Event table with ``user``, ``date`` and ``feature`` columns.\n",
	"\n",
	"    Returns:\n",
	"        Tuple ``(X, y)``. ``X`` has shape ``(n_users, len(feature_map),\n",
	"        time_horizon)`` and holds per-user event counts; ``y`` has shape\n",
	"        ``(n_users,)`` and is 1.0 for users listed in ``threat_actors`` and\n",
	"        0.0 otherwise. Rows follow the sorted order of the user ids; events\n",
	"        with a missing user id are ignored.\n",
	"    \"\"\"\n",
	"    # Sort the ids: iterating a bare set of strings gives an order that can\n",
	"    # change between interpreter runs, which would shuffle the rows of X and y\n",
	"    # and make every downstream result irreproducible.\n",
	"    users = sorted(set(df[\"user\"].dropna()))\n",
	"    known_threats = set(threat_actors)\n",
	"    X = np.zeros((len(users), len(feature_map), time_horizon))\n",
	"    y = np.zeros(len(users))\n",
	"    for index, user in enumerate(users):\n",
	"        X[index, :, :] = vectorize_user_time_series(user, df)\n",
	"        y[index] = int(user in known_threats)\n",
	"    return X, y\n",
	"\n",
	"\n",
	"def date_to_index(date):\n",
	"    \"\"\"Return how many whole days ``date`` lies after the global ``start_date``.\"\"\"\n",
	"    elapsed = date - start_date\n",
	"    return elapsed.days\n",
	"\n",
	"def extract_time_series_by_user(user_name, df):\n",
	"    \"\"\"Return only the rows of ``df`` whose ``user`` value equals ``user_name``.\"\"\"\n",
	"    belongs_to_user = df[\"user\"] == user_name\n",
	"    return df.loc[belongs_to_user]\n",
	"\n",
	"\n",
	"def vectorize_user_time_series(user_name, df):\n",
	"    \"\"\"Count one user's events per (feature, day) cell.\n",
	"\n",
	"    Returns an array of shape ``(len(feature_map), time_horizon)`` whose entry\n",
	"    ``[f, d]`` is the number of the user's events with feature index ``f`` on\n",
	"    day index ``d``.\n",
	"    \"\"\"\n",
	"    user_events = extract_time_series_by_user(user_name, df)\n",
	"    day_indices = user_events[\"date\"].apply(date_to_index).to_numpy()\n",
	"    feature_indices = user_events[\"feature\"].to_numpy()\n",
	"    counts = np.zeros((len(feature_map), time_horizon))\n",
	"    for feature_index, day_index in zip(feature_indices, day_indices):\n",
	"        counts[feature_index, day_index] += 1\n",
	"    return counts\n",
	"\n",
	"# X: (users, features, days) event counts; y: one 0/1 threat label per user.\n",
	"X, y = vectorize_dataset(df=joint)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "tRG6CB2oIkoQ"
	},
	"outputs": [],
	"source": [
	"from sklearn.model_selection import train_test_split\n",
	"\n",
	"# Fixed seed so the split is repeatable for a given X and y.\n",
	"SPLIT_SEED = 42\n",
	"\n",
	"# Stratify on y so train and test keep the same share of threat users.\n",
	"X_train, X_test, y_train, y_test = train_test_split(\n",
	"    X, y, stratify=y, random_state=SPLIT_SEED\n",
	")\n",
	"print(X_train.shape)\n",
	"print(y_train.shape)\n",
	"print(X_test.shape)\n",
	"print(y_test.shape)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "3ueRnq-NIkoR"
	},
	"outputs": [],
	"source": [
	"def _flatten_per_user(tensor):\n",
	"    \"\"\"Collapse the last two axes of a 3-D array into one, keeping axis 0.\"\"\"\n",
	"    n_rows, n_features, n_days = tensor.shape\n",
	"    return tensor.reshape([n_rows, n_features * n_days])\n",
	"\n",
	"\n",
	"X_train_reshaped = _flatten_per_user(X_train)\n",
	"X_test_reshaped = _flatten_per_user(X_test)\n",
	"\n",
	"# Split each set by label: 0 = normal user, 1 = threat actor.\n",
	"X_train_normal = X_train_reshaped[y_train == 0]\n",
	"X_train_threat = X_train_reshaped[y_train == 1]\n",
	"X_test_normal = X_test_reshaped[y_test == 0]\n",
	"X_test_threat = X_test_reshaped[y_test == 1]\n",
	"\n",
	"for subset in (X_train_normal, X_train_threat, X_test_normal, X_test_threat):\n",
	"    print(subset.shape)"
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"Choosing a model and hyperparameters\n",
	"\n",
	"Observe the training and testing results"
	],
	"metadata": {
	"id": "W9HYW85jr3Fa"
	}
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "lqaX9a8uIkoR"
	},
	"outputs": [],
	"source": [
	"from sklearn.ensemble import IsolationForest\n",
	"\n",
	"contamination_parameter = 0.035\n",
	"# Fixed seed: the forest is built from random sub-samples and random splits,\n",
	"# so without it the anomaly scores change from run to run.\n",
	"MODEL_SEED = 42\n",
	"IF = IsolationForest(\n",
	"    n_estimators=100,\n",
	"    max_samples=256,\n",
	"    contamination=contamination_parameter,\n",
	"    random_state=MODEL_SEED,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "Z8iBV42HIkoS"
	},
	"outputs": [],
	"source": [
	"# Fit on the whole training set (normal and threat users alike), then score\n",
	"# only the normal users. The original called the undefined name IFIF here,\n",
	"# which raised NameError before the model was ever fitted.\n",
	"IF.fit(X_train_reshaped)\n",
	"normal_scores = IF.decision_function(X_train_normal)\n",
	"\n",
	"import matplotlib.mlab as mlab\n",
	"import matplotlib.pyplot as plt\n",
	"\n",
	"# Histogram of decision_function scores for the normal training users.\n",
	"fig, ax = plt.subplots(figsize=(8, 4), dpi=600, facecolor=\"w\", edgecolor=\"k\")\n",
	"normal = ax.hist(normal_scores, bins=50, density=True)\n",
	"ax.set_xlim(-0.2, 0.2)\n",
	"ax.set_xlabel(\"Anomaly score\")\n",
	"ax.set_ylabel(\"Percentage\")\n",
	"ax.set_title(\"Distribution of anomaly score for non threats\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "QJaY7j1hIkoS"
	},
	"outputs": [],
	"source": [
	"# Same histogram as for the normal users, for the threat users in the training set.\n",
	"anomaly_scores = IF.decision_function(X_train_threat)\n",
	"\n",
	"fig, ax = plt.subplots(figsize=(8, 4), dpi=600, facecolor=\"w\", edgecolor=\"k\")\n",
	"anomaly = ax.hist(anomaly_scores, bins=50, density=True)\n",
	"ax.set_xlim(-0.2, 0.2)\n",
	"ax.set_xlabel(\"Anomaly score\")\n",
	"ax.set_ylabel(\"Percentage\")\n",
	"ax.set_title(\"Distribution of anomaly score for threats\")"
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"Choose a threshold and evaluate results"
	],
	"metadata": {
	"id": "4eiu7r3Irxjj"
	}
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "92CmOuspIkoS"
	},
	"outputs": [],
	"source": [
	"from collections import Counter\n",
	"\n",
	"# Users whose decision_function score falls below this value are flagged.\n",
	"cutoff = 0.13\n",
	"\n",
	"# For each split, count the true labels (0 = normal, 1 = threat) among the\n",
	"# flagged users.\n",
	"for heading, samples, labels in (\n",
	"    (\"training rzlts\", X_train_reshaped, y_train),\n",
	"    (\"testing rzlts\", X_test_reshaped, y_test),\n",
	"):\n",
	"    s = IF.decision_function(samples)\n",
	"    print(heading)\n",
	"    print(Counter(labels[s < cutoff]))"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.8"
	},
	"colab": {
	"name": "Insider Threat.ipynb",
	"provenance": [],
	"include_colab_link": true
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}