Skip to content

Instantly share code, notes, and snippets.

@yongkangc
Created April 30, 2020 07:48
Show Gist options
  • Save yongkangc/f81136fcd077437664c1e8c2bbda53e1 to your computer and use it in GitHub Desktop.
Save yongkangc/f81136fcd077437664c1e8c2bbda53e1 to your computer and use it in GitHub Desktop.
Hierachical.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Hierachical.ipynb",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/ExtremelySunnyYK/f81136fcd077437664c1e8c2bbda53e1/hierachical.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "1OdOV6ou7sZr",
"colab_type": "code",
"colab": {}
},
"source": [
"# Data Analytics mods\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import itertools\n",
"import pprint\n",
"from collections import Counter\n",
"import re\n",
"import operator\n",
"\n",
"\n",
"\n",
"# NLP Modules\n",
"import gensim\n",
"from gensim.models import LdaModel, LdaMulticore\n",
"from gensim.test.utils import common_texts\n",
"from gensim.corpora import Dictionary\n",
"from gensim.models import Phrases\n",
"from gensim.test.utils import datapath, get_tmpfile\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score\n",
"\n",
"# import pyLDAvis.gensim\n",
"import warnings\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"from gensim.models import TfidfModel\n",
"from gensim.similarities import Similarity\n",
"\n",
"\n",
"\n",
"def parse_input(text):\n",
"    \"\"\"Trim a raw dataset line.\n",
"\n",
"    Newlines, then spaces, then ``b`` characters are stripped from both ends\n",
"    of ``text``, in that order.\n",
"    \"\"\"\n",
"    without_newlines = text.strip(\"\\n\")\n",
"    without_spaces = without_newlines.strip(\" \")\n",
"    return without_spaces.strip(\"b\")\n",
"\n",
"def parse_http(text):\n",
"    \"\"\"Trim a raw HTTP dataset line.\n",
"\n",
"    Applies five ``str.strip`` passes in sequence: newlines, spaces, ``b``,\n",
"    single quotes and finally ``r``.\n",
"    \"\"\"\n",
"    cleaned = text\n",
"    for chars in (\"\\n\", \" \", \"b\", \"'\", \"r\"):\n",
"        cleaned = cleaned.strip(chars)\n",
"    return cleaned\n",
"\n",
"def tokenize_hex(text):\n",
"    \"\"\"Split ``text`` on every backslash and return the list of pieces.\"\"\"\n",
"    backslash = \"\\\\\"\n",
"    return text.split(backslash)\n",
" \n",
"def tokenize_ascii(text):\n",
"    \"\"\"Split ``text`` at every character outside the allowed set.\n",
"\n",
"    Allowed characters are ASCII letters, digits, space, ``|``, ``.`` and ``:``;\n",
"    any other character acts as a separator.\n",
"    \"\"\"\n",
"    separator = re.compile(r\"[^a-zA-Z0-9 |. |:]\")\n",
"    return separator.split(text)\n",
"\n",
"def is_hex(text):\n",
"    \"\"\"Return False only when ``text`` is exactly one single-quote character.\"\"\"\n",
"    lone_quote = \"\\'\"\n",
"    return not text == lone_quote\n",
"\n",
"\n",
"def parse_hex(text):\n",
"    \"\"\"Return ``text`` with any ``x`` characters stripped from both ends.\"\"\"\n",
"    hex_marker = \"x\"\n",
"    return text.strip(hex_marker)\n",
"\n",
"def header_lim(msg, limit=70):\n",
"    \"\"\"Truncate a message to its first ``limit`` elements.\n",
"\n",
"    Args:\n",
"        msg: Sliceable sequence, e.g. a list of byte tokens or a string.\n",
"        limit: Maximum number of leading elements to keep. Defaults to 70,\n",
"            the header size this notebook has always used.\n",
"\n",
"    Returns:\n",
"        ``msg`` itself when it already fits, otherwise ``msg[:limit]``.\n",
"    \"\"\"\n",
"    if len(msg) <= limit:\n",
"        return msg\n",
"    return msg[:limit]\n",
" \n",
"\n",
"### CHANGE here #####\n",
"\n",
"# Lower Accuracy Version\n",
"# def msg_to_bytes():\n",
"# \"\"\" Breaking Messages into bytes \n",
"# Returns a list of Messages\n",
"# \"\"\"\n",
"# # f = open(\"/content/drive/My Drive/DSO Presentation/dataset/tcp_icmp_udp.txt\", \"r\")\n",
"# f = open(\"/content/drive/My Drive/DSO Presentation/dataset/TCP_ICMP_UDP_HTTP.txt\", \"r\")\n",
"# # print(sent_tokenize(text))\n",
"# text = f.readlines()\n",
"# doc = []\n",
"# for line in text:\n",
"# parsed_hex = []\n",
"# if \"\\\\x\" in line:\n",
"# line = parse_input(line)\n",
"# tokenized_hex = tokenize_hex(line)\n",
"# for token in tokenized_hex:\n",
"# if is_hex(token):\n",
"# parsed_hex.append(parse_hex(token))\n",
"\n",
"# # limiting the header to 70 bytes\n",
"# # doc.append(header_lim(parsed_hex))\n",
"# doc.append((parsed_hex))\n",
"\n",
"# elif any(x in line for x in [\"GET\",\"HTTP\"]):\n",
"# line = parse_input(line)\n",
"# # tokenized_hex = tokenize_hex(line)\n",
"# tokenized_hex = tokenize_ascii(line)\n",
"# for token in tokenized_hex:\n",
"# if is_hex(token):\n",
"# if not any(substring in token for substring in [\" \",\"'\",\"\\r\",\"\\n\",\"\",\"r\",\"n\"]):\n",
"# parsed_hex.append(parse_hex(token))\n",
"# doc.append(header_lim(parsed_hex))\n",
"\n",
"\n",
"# return doc\n",
"\n",
"\n",
"\n",
"# def msg_to_bytes():\n",
"# \"\"\" Breaking Messages into bytes \n",
"# Returns a list of Messages\n",
"# \"\"\"\n",
"# # f = open(\"/content/drive/My Drive/DSO Presentation/dataset/tcp_icmp_udp.txt\", \"r\")\n",
"# f = open(\"/content/drive/My Drive/DSO Presentation/dataset/TCP_ICMP_UDP_HTTP.txt\", \"r\")\n",
"# # print(sent_tokenize(text))\n",
"# text = f.readlines()\n",
"# doc = []\n",
"# for line in text:\n",
"# parsed_hex = []\n",
"# if \"\\\\x\" in line:\n",
"# line = parse_input(line)\n",
"# tokenized_hex = tokenize_hex(line)\n",
"# for token in tokenized_hex:\n",
"# if is_hex(token):\n",
"# parsed_hex.append(parse_hex(token))\n",
"\n",
"# # limiting the header to 70 bytes\n",
"# # doc.append(header_lim(parsed_hex))\n",
"# doc.append((parsed_hex))\n",
"\n",
"# elif any(x in line for x in [\"GET\",\"HTTP\"]):\n",
"# line = parse_input(line)\n",
"# # tokenized_hex = tokenize_hex(line)\n",
"# # tokenized_hex = tokenize_ascii(line)\n",
"# for token in line:\n",
"# if is_hex(token):\n",
"# if not any(substring in token for substring in [\" \",\"'\",\"\\r\",\"\\n\",\"\",\"r\",\"n\"]):\n",
"# parsed_hex.append(parse_hex(token))\n",
"# doc.append(header_lim(parsed_hex))\n",
"\n",
"\n",
"# return doc\n",
"\n",
"def msg_to_bytes(path=\"/content/drive/My Drive/DSO Presentation/dataset/TCP_ICMP_UDP_HTTP.txt\"):\n",
"    \"\"\"Break the messages of a text dump into per-message token lists.\n",
"\n",
"    Args:\n",
"        path: Text file with one message per line. Defaults to the dataset\n",
"            location this notebook has always read.\n",
"\n",
"    Returns:\n",
"        A list with one token list per recognised line. Lines containing a\n",
"        backslash followed by ``x`` are split on backslashes. Other lines that\n",
"        contain ``GET`` or ``HTTP`` are walked character by character and\n",
"        capped with ``header_lim``. Every other line is skipped.\n",
"    \"\"\"\n",
"    doc = []\n",
"    # The context manager closes the file even if parsing raises.\n",
"    with open(path, \"r\") as f:\n",
"        for line in f:\n",
"            parsed_hex = []\n",
"            if \"\\\\x\" in line:\n",
"                line = parse_input(line)\n",
"                for token in tokenize_hex(line):\n",
"                    if is_hex(token):\n",
"                        parsed_hex.append(parse_hex(token))\n",
"                # The 70-element header limit is deliberately not applied here.\n",
"                doc.append(parsed_hex)\n",
"            elif any(x in line for x in [\"GET\",\"HTTP\"]):\n",
"                line = parse_input(line)\n",
"                for token in line:\n",
"                    if is_hex(token):\n",
"                        # NOTE(review): \"\" is a substring of every token, so this\n",
"                        # test never passes and these lines yield empty lists.\n",
"                        # Behaviour kept as-is; confirm the intended filter.\n",
"                        if not any(substring in token for substring in [\" \",\"'\",\"\\r\",\"\\n\",\"\",\"r\",\"n\"]):\n",
"                            parsed_hex.append(parse_hex(token))\n",
"                doc.append(header_lim(parsed_hex))\n",
"    return doc\n",
"\n",
"\n",
"def n_gram(docs):\n",
"    \"\"\"Apply a bigram phrase model to every document.\n",
"\n",
"    A gensim ``Phrases`` model is fitted on ``docs`` with ``min_count=10`` and\n",
"    then used to transform each document in turn.\n",
"\n",
"    Returns:\n",
"        A list with one transformed document per input document.\n",
"    \"\"\"\n",
"    phrase_model = Phrases(docs, min_count=10)\n",
"    with_bigrams = []\n",
"    for tokens in docs:\n",
"        with_bigrams.append(phrase_model[tokens])\n",
"    return with_bigrams\n",
"\n",
"\n",
"def filter_tokens(dictionary, no_below=10, no_above=0.2):\n",
"    \"\"\"Prune rare and very common tokens from ``dictionary`` in place.\n",
"\n",
"    Args:\n",
"        dictionary: Object exposing ``filter_extremes(no_below=..., no_above=...)``,\n",
"            such as a gensim ``Dictionary``.\n",
"        no_below: Drop tokens that occur in fewer than this many documents.\n",
"        no_above: Drop tokens that occur in more than this fraction of the documents.\n",
"\n",
"    Returns:\n",
"        The same ``dictionary`` object, after filtering.\n",
"    \"\"\"\n",
"    dictionary.filter_extremes(no_below=no_below, no_above=no_above)\n",
"    return dictionary\n",
"\n",
"\n",
"def create_dict(docs):\n",
"    \"\"\"Return a gensim ``Dictionary`` built from the tokenised ``docs``.\"\"\"\n",
"    return Dictionary(docs)\n",
"\n",
"\n",
"def create_corpus(docs, dictionary=None):\n",
"    \"\"\"Build a TF-IDF weighted bag-of-words corpus from tokenised documents.\n",
"\n",
"    Args:\n",
"        docs: Iterable of token lists.\n",
"        dictionary: Optional gensim ``Dictionary`` used to map tokens to ids.\n",
"            When omitted, a fresh one is built from ``docs`` (the original\n",
"            behaviour). Pass the dictionary used elsewhere, e.g. one that was\n",
"            pruned with ``filter_tokens``, so token ids stay consistent.\n",
"\n",
"    Returns:\n",
"        The corpus as returned by ``tf_idf``.\n",
"    \"\"\"\n",
"    if dictionary is None:\n",
"        dictionary = Dictionary(docs)\n",
"    # Bag-of-words form: one list of (token_id, frequency) pairs per document.\n",
"    bow_corpus = [dictionary.doc2bow(doc) for doc in docs]\n",
"    return tf_idf(bow_corpus)\n",
"\n",
"\n",
"def tf_idf(corpus):\n",
"    \"\"\"Re-weight a bag-of-words corpus with TF-IDF.\n",
"\n",
"    A ``TfidfModel`` is fitted on ``corpus`` and then applied back to it.\n",
"\n",
"    Returns:\n",
"        The transformed corpus, i.e. ``model[corpus]``.\n",
"    \"\"\"\n",
"    model = TfidfModel(corpus)\n",
"    return model[corpus]\n",
"\n",
"def similarity_matrix(corpus, dictionary):\n",
"    \"\"\"Pretty-print the similarity of every document to the whole corpus.\n",
"\n",
"    Args:\n",
"        corpus: Corpus that is both indexed and used as the query.\n",
"        dictionary: Mapping whose length gives the number of features.\n",
"    \"\"\"\n",
"    index_temp = get_tmpfile(\"index\")\n",
"    index = Similarity(index_temp, corpus, num_features=len(dictionary))  # create index\n",
"    for sims in index[corpus]:\n",
"        # `pprint` is imported as a module, so the function must be qualified;\n",
"        # calling the module itself raises TypeError.\n",
"        pprint.pprint(sims)\n",
"\n",
"\n",
"def visualise_LDA(lda_model, corpus, dictionary):\n",
"    \"\"\"Render the LDA model with pyLDAvis and save it as ``LDA_Visualisation.html``.\n",
"\n",
"    pyLDAvis is imported inside the function because the notebook's import cell\n",
"    leaves it commented out; without this the call fails with ``NameError``.\n",
"\n",
"    Raises:\n",
"        ImportError: If pyLDAvis is not installed.\n",
"    \"\"\"\n",
"    import pyLDAvis\n",
"    try:\n",
"        import pyLDAvis.gensim as pyldavis_gensim\n",
"    except ImportError:\n",
"        # Newer pyLDAvis releases ship the gensim adapter as `gensim_models`.\n",
"        import pyLDAvis.gensim_models as pyldavis_gensim\n",
"\n",
"    warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n",
"    visualisation = pyldavis_gensim.prepare(lda_model, corpus, dictionary)\n",
"    pyLDAvis.save_html(visualisation, 'LDA_Visualisation.html')\n",
"\n",
"def majority_element(arr):\n",
"    \"\"\"Return the Boyer-Moore majority-vote candidate of ``arr``.\n",
"\n",
"    The candidate is the majority value whenever one value fills more than\n",
"    half of ``arr``; no second pass verifies that. An empty input gives ``None``.\n",
"    \"\"\"\n",
"    candidate = None\n",
"    votes = 0\n",
"    for item in arr:\n",
"        if votes == 0:\n",
"            # No standing candidate: adopt the current item.\n",
"            candidate = item\n",
"            votes = 1\n",
"            continue\n",
"        votes += 1 if item == candidate else -1\n",
"    return candidate\n",
"\n",
"def write_result(lda_model,avg_topic_coherence,topic_dist):\n",
"    \"\"\"Write the topic distribution and the average coherence to ``result4.txt``.\n",
"\n",
"    The file is created in the working directory and overwritten if it exists.\n",
"    ``lda_model`` is accepted but not used.\n",
"    \"\"\"\n",
"    with open(\"result4.txt\", \"w\") as out:\n",
"        print(topic_dist, file=out)\n",
"        coherence_line = 'Average topic coherence: %.4f.' % avg_topic_coherence\n",
"        out.write(coherence_line + \"\\n\")\n",
"\n",
"### CHANGE here #####\n",
"\n",
"def normalise_pred(arr,true_dict,pred_dict):\n",
"    \"\"\"Pick the message type that a cluster represents best, relative to class size.\n",
"\n",
"    Every label in ``arr`` is scored as ``pred_dict[label] / true_dict[label]``.\n",
"    Labels missing from ``true_dict`` are reported on stdout and skipped. The\n",
"    list of scores is printed before the winner is chosen.\n",
"\n",
"    Args:\n",
"        arr: Labels of the members of one cluster (duplicates allowed).\n",
"        true_dict: Label -> count used as the denominator of the score.\n",
"        pred_dict: Label -> count used as the numerator of the score.\n",
"\n",
"    Returns:\n",
"        The label with the highest score; on ties, the one that appears first\n",
"        in ``arr``.\n",
"\n",
"    Raises:\n",
"        ValueError: If no label of ``arr`` is present in ``true_dict``.\n",
"    \"\"\"\n",
"    # Label and score are stored together. Indexing ``arr`` with a position in\n",
"    # the score list returned the wrong label as soon as one label was skipped.\n",
"    scored = []\n",
"    for label in arr:\n",
"        if label in true_dict:\n",
"            scored.append((label, pred_dict[label] / true_dict[label]))\n",
"        else:\n",
"            print(\"no similarities for {}\".format(label))\n",
"    print([fraction for _, fraction in scored])\n",
"    if not scored:\n",
"        raise ValueError(\"normalise_pred: no label in arr is present in true_dict\")\n",
"    best_label, _ = max(scored, key=operator.itemgetter(1))\n",
"\n",
"    return best_label\n",
"\n",
"\n",
"def count_element(array):\n",
"    \"\"\"Count how often each distinct element of ``array`` occurs.\n",
"\n",
"    Returns:\n",
"        A plain ``dict`` mapping element -> number of occurrences.\n",
"    \"\"\"\n",
"    # A single Counter pass; no local named `dict` shadows the builtin any more.\n",
"    return dict(Counter(array))\n",
"\n"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "zM4Xhrkh8Bdk",
"colab_type": "text"
},
"source": [
"# Setup for hierarchical clustering"
]
},
{
"cell_type": "code",
"metadata": {
"id": "_iZUpvrU7-L2",
"colab_type": "code",
"colab": {}
},
"source": [
"import pickle\n",
"\n",
"MODEL_DIR = \"/content/drive/My Drive/DSO Presentation/Models\"\n",
"CLUSTER_ID = 4  # value of `pred_labels` to re-cluster; change here for another cluster\n",
"\n",
"# NOTE: pickle.load can execute arbitrary code; only load files you trust.\n",
"with open(MODEL_DIR + \"/docs.txt\", \"rb\") as fp:\n",
"    docs_ori = pickle.load(fp)\n",
"\n",
"with open(MODEL_DIR + \"/y_true.txt\", \"rb\") as fp:\n",
"    y_ori = pickle.load(fp)\n",
"\n",
"Y_labels = pd.read_csv(MODEL_DIR + \"/labels.csv\")\n",
"\n",
"# Rows of the label table that were assigned to the chosen cluster. The table\n",
"# is left in stored order so its index keeps lining up with docs_ori / y_ori.\n",
"sub_cluster = Y_labels.loc[Y_labels['pred_labels'] == CLUSTER_ID]\n",
"sub_index = sub_cluster.index.values.tolist()\n",
"\n",
"# Message types and documents of the sub-cluster, in the order of sub_index.\n",
"msg_type = [y_ori[index] for index in sub_index]\n",
"docs = [docs_ori[index] for index in sub_index]"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "njQa79US_nbu",
"colab_type": "text"
},
"source": [
"# LDA"
]
},
{
"cell_type": "code",
"metadata": {
"id": "3KEY5clkqlP4",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 104
},
"outputId": "e310ea97-7dee-4efa-d2cb-13e66a763548"
},
"source": [
"# Cluster the message types with Latent Dirichlet Allocation.\n",
"true_dict = count_element(msg_type)\n",
"# A new name keeps `docs` untouched, so re-running this cell gives the same input.\n",
"docs_bigram = n_gram(docs)\n",
"dictionary = create_dict(docs_bigram)\n",
"corpus = create_corpus(docs_bigram)\n",
"\n",
"# Set training parameters.\n",
"num_topics = 5\n",
"chunksize = 1  # how many documents are processed at a time\n",
"passes = 50  # how often we train the model on the entire corpus.\n",
"iterations = 1000\n",
"eval_every = 1  # For logging\n",
"minimum_probability = 0.0\n",
"n_clusters = 5  # read by the K-Means cell\n",
"random_state = 42  # fixed seed so repeated runs train the same model\n",
"\n",
"# Make a index to word dictionary.\n",
"temp = dictionary[0]  # accessing an item makes gensim build id2token\n",
"id2word = dictionary.id2token\n",
"\n",
"# Train the model on the corpus.\n",
"lda_model = LdaModel(\n",
"    corpus=corpus,\n",
"    id2word=id2word,\n",
"    chunksize=chunksize,\n",
"    alpha='auto',\n",
"    eta='auto',\n",
"    iterations=iterations,\n",
"    num_topics=num_topics,\n",
"    passes=passes,\n",
"    eval_every=eval_every,\n",
"    random_state=random_state,\n",
")\n",
"\n",
"# get_tmpfile gives a path in the temp folder; datapath points into gensim's\n",
"# bundled test-data directory and is not meant for saving models.\n",
"temp_file = get_tmpfile(\"model\")\n",
"lda_model.save(temp_file)\n",
"\n",
"top_topics = lda_model.top_topics(corpus)\n",
"# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.\n",
"avg_topic_coherence = sum(coherence for _, coherence in top_topics) / num_topics\n",
"\n",
"# Topic distribution of every document, as a list of (topic_id, probability) lists.\n",
"topic_dist = [lda_model.get_document_topics(item, minimum_probability=minimum_probability) for item in corpus]"
],
"execution_count": 25,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/gensim/models/phrases.py:598: UserWarning: For a faster implementation, use the gensim.models.phrases.Phraser class\n",
" warnings.warn(\"For a faster implementation, use the gensim.models.phrases.Phraser class\")\n",
"/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:253: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n",
" 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-Eg4FizB2plH",
"colab_type": "text"
},
"source": [
"# K Means"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4puKE1D_tISJ",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 390
},
"outputId": "06b732f5-1b6c-439d-8a15-e8cdcb284ca5"
},
"source": [
"from sklearn.cluster import KMeans\n",
"from sklearn.decomposition import PCA\n",
"\n",
"topic_dist = [lda_model.get_document_topics(item, minimum_probability=0.0) for item in corpus]\n",
"# One row per message, one column per topic, holding only the probability of\n",
"# each (topic_id, probability) pair. The frame is built directly instead of\n",
"# writing into rows yielded by iterrows(), which pandas documents as possibly\n",
"# being copies.\n",
"X = pd.DataFrame([[prob for _, prob in doc_topics] for doc_topics in topic_dist])\n",
"\n",
"# Setting Parameters\n",
"n_init = 10\n",
"kmeans_seed = 42  # fixed seed so repeated runs give the same clusters\n",
"\n",
"# Using PCA with Kmeans\n",
"# PCA first to reduce dimensionality for visualisation\n",
"pca = PCA(n_components=2)\n",
"PC = pca.fit_transform(X)\n",
"\n",
"# Applying Kmeans to get labels(cluster no)\n",
"kmeans = KMeans(n_clusters=n_clusters, n_init=n_init, random_state=kmeans_seed).fit_predict(PC)\n",
"\n",
"# Dataframe with labels\n",
"cluster_predicted = kmeans.tolist()\n",
"Y = pd.DataFrame({\"true_labels\": msg_type, \"pred_labels\": cluster_predicted})\n",
"\n",
"# True labels grouped by predicted cluster, keyed in order of first appearance.\n",
"clustered_labels = {}\n",
"for cluster_id, true_label in zip(cluster_predicted, msg_type):\n",
"    clustered_labels.setdefault(cluster_id, []).append(true_label)\n",
"\n",
"# Every member of a cluster is predicted as the cluster's representative type.\n",
"y_pred = []\n",
"y_true = []\n",
"for members in clustered_labels.values():\n",
"    pred_dict = count_element(members)\n",
"    maj = normalise_pred(members, true_dict, pred_dict)\n",
"    y_pred.extend([maj] * len(members))\n",
"    y_true.extend(members)\n",
"\n",
"# Plotting Kmeans: one scatter series per cluster on the two principal components.\n",
"fig, ax = plt.subplots()\n",
"for cluster_id in np.unique(kmeans):\n",
"    in_cluster = kmeans == cluster_id\n",
"    ax.scatter(PC[in_cluster, 0], PC[in_cluster, 1], label=cluster_id)\n",
"\n",
"ax.set_xlabel(\"Principal component 1\")\n",
"ax.set_ylabel(\"Principal component 2\")\n",
"ax.set_title(\"K-Means clusters of LDA topic distributions\")\n",
"ax.legend()"
],
"execution_count": 26,
"outputs": [
{
"output_type": "stream",
"text": [
"[0.5714285714285714, 0.5714285714285714, 0.5714285714285714, 0.5714285714285714, 0.5, 0.5, 0.5, 0.5, 0.5]\n",
"[0.14285714285714285, 0.5, 0.5, 0.5, 0.5, 0.5]\n",
"[0.2857142857142857, 0.2857142857142857, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]\n",
"[0.5, 0.5, 0.5, 0.5, 0.5, 0.5]\n",
"[1.0]\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<matplotlib.legend.Legend at 0x7f53a5968a90>"
]
},
"metadata": {
"tags": []
},
"execution_count": 26
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Cj2A9B8l7TWe",
"colab_type": "text"
},
"source": [
"K means Elbow Method"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4fsXwYID7SdJ",
"colab_type": "code",
"colab": {}
},
"source": [
"# sse = {}\n",
"# for k in range(1, 10):\n",
"# kmeans = KMeans(n_clusters=k, max_iter=1000).fit(X)\n",
"# X[\"clusters\"] = kmeans.labels_\n",
"# #print(data[\"clusters\"])\n",
"# sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center\n",
"# plt.figure()\n",
"# plt.plot(list(sse.keys()), list(sse.values()))\n",
"# plt.xlabel(\"Number of cluster\")\n",
"# plt.ylabel(\"SSE\")\n",
"# plt.show()"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "TVjCRwA02cby",
"colab_type": "text"
},
"source": [
"# Metrics"
]
},
{
"cell_type": "code",
"metadata": {
"id": "uDTAiEVxWXFJ",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 408
},
"outputId": "34f1ac3e-3a27-407e-9525-90b3039dc3dc"
},
"source": [
"\n",
"# Distinct message types in the ground truth and distinct labels among the predictions.\n",
"unique_types_true = np.unique(np.array(y_true))\n",
"unique_clusters_true = len(unique_types_true)  # number of unique clusters true\n",
"cluster_no = len(np.unique(np.array(y_pred)))  # Number of unique clusters predicted\n",
"accuracy = accuracy_score(y_true, y_pred)  # Calculating accuracy score\n",
"\n",
"print(\"Number of Message types : {}\".format(unique_clusters_true))\n",
"print(\"Number of Clusters : {}\".format(unique_clusters_true))\n",
"print(\"Number of Clusters predicted : {}\".format(cluster_no))\n",
"print(\"Percentage Accuracy in predicted cluster : {:.2%} \".format(accuracy))\n",
"\n",
"\n",
"# Sorted so the confusion-matrix axes keep the same order on every run;\n",
"# iterating a set of strings can change order between kernel sessions.\n",
"class_labels = sorted(set(y_true))\n",
"cm = confusion_matrix(y_true, y_pred, labels=class_labels)\n",
"\n",
"# Calculating precision and recall\n",
"# Using micro average as there might be a class imbalance (i.e more examples of one class than another)\n",
"metric_score_micro = precision_recall_fscore_support(y_true, y_pred, average=\"micro\")\n",
"print(\"Precision Score is {:.2f}\".format(metric_score_micro[0]))\n",
"print(\"Recall Score is {:.2f}\".format(metric_score_micro[1]))\n",
"print(\"F Score is {:.2f}\".format(metric_score_micro[2]))\n",
"\n",
"plt.imshow(cm, cmap=plt.cm.Blues, interpolation='nearest')\n",
"plt.colorbar()\n",
"plt.title('Confusion Matrix without Normalization')\n",
"plt.xlabel('Predicted')\n",
"plt.ylabel('Actual')\n",
"tick_marks = np.arange(len(class_labels))  # one tick per class\n",
"\n",
"plt.xticks(tick_marks, class_labels, fontsize=6)\n",
"plt.yticks(tick_marks, class_labels, fontsize=7)\n",
"\n",
"# Write each count into its cell; white text on the darker half of the colour scale.\n",
"thresh = cm.max() / 2.\n",
"for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
"    plt.text(j, i, format(cm[i, j], 'd'), horizontalalignment='center',\n",
"             color='white' if cm[i, j] > thresh else 'black')\n",
"\n",
"plt.show()  # Plots the confusion matrix\n"
],
"execution_count": 28,
"outputs": [
{
"output_type": "stream",
"text": [
"Number of Message types : 4\n",
"Number of Clusters : 4\n",
"Number of Clusters predicted : 4\n",
"Percentage Accuracy in predicted cluster : 73.33% \n",
"Precision Score is 0.73\n",
"Recall Score is 0.73\n",
"F Score is 0.73\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 2 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "g2YAsoRWtW9S",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 87
},
"outputId": "fc6a4772-77ea-49c7-a17e-a733d330c5fe"
},
"source": [
"# Raw predicted labels, true labels and the per-cluster grouping, for inspection.\n",
"print(\"y pred :\" + str(y_pred))\n",
"print(\"y true : \" + str(y_true))\n",
"print(clustered_labels)"
],
"execution_count": 29,
"outputs": [
{
"output_type": "stream",
"text": [
"y pred :['TCP', 'TCP', 'TCP', 'TCP', 'TCP', 'TCP', 'TCP', 'TCP', 'TCP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'HTTP']\n",
"y true : ['TCP', 'TCP', 'TCP', 'TCP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'TCP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'TCP', 'TCP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'HTTP']\n",
"{2: ['TCP', 'TCP', 'TCP', 'TCP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'ICMP'], 1: ['TCP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'ICMP'], 3: ['TCP', 'TCP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP'], 0: ['UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP'], 4: ['HTTP']}\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PgW-LgTPSCfV",
"colab_type": "text"
},
"source": [
"# Script to get Message Type Resolution\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "30_-4DxJsfys",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 403
},
"outputId": "4e8550bb-ce25-4b19-a711-b399ecc4b88f"
},
"source": [
"# Composition of every cluster: its size and the share of each message type in it.\n",
"# The loop variable is not called `msg_type`, because that name holds the label\n",
"# list the K-Means cell reads and must survive this cell.\n",
"for cluster, cluster_labels in sorted(clustered_labels.items()):\n",
"    values, counts = np.unique(cluster_labels, return_counts=True)\n",
"    print(\"\\n\\nCluster {} : {} / {} \".format(cluster,len(cluster_labels),len(y_true)))\n",
"    for value, count in zip(values, counts):\n",
"        print(\"{} : {} / {} = {:.2%}\".format(value,count,len(cluster_labels),count/len(cluster_labels)))"
],
"execution_count": 30,
"outputs": [
{
"output_type": "stream",
"text": [
"\n",
"\n",
"Cluster 0 : 6 / 30 \n",
"UDP : 6 / 6 = 100.00%\n",
"\n",
"\n",
"Cluster 1 : 6 / 30 \n",
"ICMP : 5 / 6 = 83.33%\n",
"TCP : 1 / 6 = 16.67%\n",
"\n",
"\n",
"Cluster 2 : 9 / 30 \n",
"ICMP : 5 / 9 = 55.56%\n",
"TCP : 4 / 9 = 44.44%\n",
"\n",
"\n",
"Cluster 3 : 8 / 30 \n",
"TCP : 2 / 8 = 25.00%\n",
"UDP : 6 / 8 = 75.00%\n",
"\n",
"\n",
"Cluster 4 : 1 / 30 \n",
"HTTP : 1 / 1 = 100.00%\n"
],
"name": "stdout"
}
]
}
]
}
@yongkangc
Copy link
Author

Message Clustering during internship using LDA Kmeans

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment