Skip to content

Instantly share code, notes, and snippets.

@yongkangc
Created April 30, 2020 07:48
Show Gist options
  • Save yongkangc/f81136fcd077437664c1e8c2bbda53e1 to your computer and use it in GitHub Desktop.
Save yongkangc/f81136fcd077437664c1e8c2bbda53e1 to your computer and use it in GitHub Desktop.
Hierachical.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Hierachical.ipynb",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/ExtremelySunnyYK/f81136fcd077437664c1e8c2bbda53e1/hierachical.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "1OdOV6ou7sZr",
"colab_type": "code",
"colab": {}
},
"source": [
"# Data Analytics mods\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import itertools\n",
"import pprint\n",
"from collections import Counter\n",
"import re\n",
"import operator\n",
"\n",
"\n",
"\n",
"# NLP Modules\n",
"import gensim\n",
"from gensim.models import LdaModel, LdaMulticore\n",
"from gensim.test.utils import common_texts\n",
"from gensim.corpora import Dictionary\n",
"from gensim.models import Phrases\n",
"from gensim.test.utils import datapath, get_tmpfile\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score\n",
"\n",
"# import pyLDAvis.gensim\n",
"import warnings\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"from gensim.models import TfidfModel\n",
"from gensim.similarities import Similarity\n",
"\n",
"\n",
"\n",
"def parse_input(text):\n",
"    \"\"\"Trim a raw message line.\n",
"\n",
"    Newlines, then spaces, then the letter ``b`` are stripped from both ends,\n",
"    in that order (``str.strip`` semantics, so repeated characters go too).\n",
"    \"\"\"\n",
"    no_newlines = text.strip(\"\\n\")\n",
"    no_spaces = no_newlines.strip(\" \")\n",
"    return no_spaces.strip(\"b\")\n",
"\n",
"def parse_http(text):\n",
"    \"\"\"Trim an HTTP message line.\n",
"\n",
"    Strips, in order and from both ends: newlines, spaces, ``b``, single\n",
"    quotes and ``r``.\n",
"    \"\"\"\n",
"    cleaned = text\n",
"    for unwanted in (\"\\n\", \" \", \"b\", \"'\", \"r\"):\n",
"        cleaned = cleaned.strip(unwanted)\n",
"    return cleaned\n",
"\n",
"def tokenize_hex(text):\n",
"    \"\"\"Split an escaped byte string into pieces at every backslash.\"\"\"\n",
"    backslash = \"\\\\\"\n",
"    pieces = text.split(backslash)\n",
"    return pieces\n",
" \n",
"def tokenize_ascii(text):\n",
"    \"\"\"Split text at every character that is not a letter, digit, space, ``|``, ``.`` or ``:``.\"\"\"\n",
"    delimiter = re.compile(r\"[^a-zA-Z0-9 |. |:]\")\n",
"    return delimiter.split(text)\n",
"\n",
"def is_hex(text):\n",
"    \"\"\"Return False only for a lone single-quote token; any other token passes.\"\"\"\n",
"    lone_quote = \"\\'\"\n",
"    if text == lone_quote:\n",
"        return False\n",
"    return True\n",
"\n",
"\n",
"def parse_hex(text):\n",
"    \"\"\"Remove every leading and trailing ``x`` character from a hex token.\"\"\"\n",
"    stripped = text.strip(\"x\")\n",
"    return stripped\n",
"\n",
"def header_lim(msg, limit=70):\n",
"    \"\"\"Truncate a message to its first ``limit`` elements.\n",
"\n",
"    Args:\n",
"        msg: Any sliceable sequence (token list or string).\n",
"        limit: Maximum number of elements to keep; defaults to 70, the header\n",
"            size this notebook has always used.\n",
"\n",
"    Returns:\n",
"        ``msg`` itself when it already fits, otherwise a new sequence holding\n",
"        the first ``limit`` elements.\n",
"    \"\"\"\n",
"    if len(msg) > limit:\n",
"        return msg[:limit]\n",
"    return msg\n",
" \n",
"\n",
"### CHANGE here #####\n",
"\n",
"# Lower Accuracy Version\n",
"# def msg_to_bytes():\n",
"# \"\"\" Breaking Messages into bytes \n",
"# Returns a list of Messages\n",
"# \"\"\"\n",
"# # f = open(\"/content/drive/My Drive/DSO Presentation/dataset/tcp_icmp_udp.txt\", \"r\")\n",
"# f = open(\"/content/drive/My Drive/DSO Presentation/dataset/TCP_ICMP_UDP_HTTP.txt\", \"r\")\n",
"# # print(sent_tokenize(text))\n",
"# text = f.readlines()\n",
"# doc = []\n",
"# for line in text:\n",
"# parsed_hex = []\n",
"# if \"\\\\x\" in line:\n",
"# line = parse_input(line)\n",
"# tokenized_hex = tokenize_hex(line)\n",
"# for token in tokenized_hex:\n",
"# if is_hex(token):\n",
"# parsed_hex.append(parse_hex(token))\n",
"\n",
"# # limiting the header to 70 bytes\n",
"# # doc.append(header_lim(parsed_hex))\n",
"# doc.append((parsed_hex))\n",
"\n",
"# elif any(x in line for x in [\"GET\",\"HTTP\"]):\n",
"# line = parse_input(line)\n",
"# # tokenized_hex = tokenize_hex(line)\n",
"# tokenized_hex = tokenize_ascii(line)\n",
"# for token in tokenized_hex:\n",
"# if is_hex(token):\n",
"# if not any(substring in token for substring in [\" \",\"'\",\"\\r\",\"\\n\",\"\",\"r\",\"n\"]):\n",
"# parsed_hex.append(parse_hex(token))\n",
"# doc.append(header_lim(parsed_hex))\n",
"\n",
"\n",
"# return doc\n",
"\n",
"\n",
"\n",
"# def msg_to_bytes():\n",
"# \"\"\" Breaking Messages into bytes \n",
"# Returns a list of Messages\n",
"# \"\"\"\n",
"# # f = open(\"/content/drive/My Drive/DSO Presentation/dataset/tcp_icmp_udp.txt\", \"r\")\n",
"# f = open(\"/content/drive/My Drive/DSO Presentation/dataset/TCP_ICMP_UDP_HTTP.txt\", \"r\")\n",
"# # print(sent_tokenize(text))\n",
"# text = f.readlines()\n",
"# doc = []\n",
"# for line in text:\n",
"# parsed_hex = []\n",
"# if \"\\\\x\" in line:\n",
"# line = parse_input(line)\n",
"# tokenized_hex = tokenize_hex(line)\n",
"# for token in tokenized_hex:\n",
"# if is_hex(token):\n",
"# parsed_hex.append(parse_hex(token))\n",
"\n",
"# # limiting the header to 70 bytes\n",
"# # doc.append(header_lim(parsed_hex))\n",
"# doc.append((parsed_hex))\n",
"\n",
"# elif any(x in line for x in [\"GET\",\"HTTP\"]):\n",
"# line = parse_input(line)\n",
"# # tokenized_hex = tokenize_hex(line)\n",
"# # tokenized_hex = tokenize_ascii(line)\n",
"# for token in line:\n",
"# if is_hex(token):\n",
"# if not any(substring in token for substring in [\" \",\"'\",\"\\r\",\"\\n\",\"\",\"r\",\"n\"]):\n",
"# parsed_hex.append(parse_hex(token))\n",
"# doc.append(header_lim(parsed_hex))\n",
"\n",
"\n",
"# return doc\n",
"\n",
"def msg_to_bytes(path=\"/content/drive/My Drive/DSO Presentation/dataset/TCP_ICMP_UDP_HTTP.txt\"):\n",
"    \"\"\"Read captured messages from a text file and break each one into tokens.\n",
"\n",
"    Lines containing a literal backslash-x are treated as escaped byte strings:\n",
"    they are split on backslashes and kept at full length. Lines containing\n",
"    ``GET`` or ``HTTP`` are split into single characters and truncated with\n",
"    ``header_lim``. All other lines are skipped.\n",
"\n",
"    Args:\n",
"        path: Text file with one message per line. Defaults to the dataset\n",
"            location this notebook has always read from.\n",
"\n",
"    Returns:\n",
"        A list with one token list per recognised line, in file order.\n",
"    \"\"\"\n",
"    # Characters dropped from HTTP lines. The previous filter also listed \"\",\n",
"    # which is a substring of every token, so no HTTP token was ever kept.\n",
"    skipped_chars = (\" \", \"'\", \"\\r\", \"\\n\", \"r\", \"n\")\n",
"    doc = []\n",
"    # The context manager closes the file even if parsing raises.\n",
"    with open(path, \"r\") as f:\n",
"        for line in f:\n",
"            parsed_hex = []\n",
"            if \"\\\\x\" in line:\n",
"                line = parse_input(line)\n",
"                for token in tokenize_hex(line):\n",
"                    if is_hex(token):\n",
"                        parsed_hex.append(parse_hex(token))\n",
"                # Hex messages are kept whole (no header limit applied).\n",
"                doc.append(parsed_hex)\n",
"            elif any(marker in line for marker in (\"GET\", \"HTTP\")):\n",
"                line = parse_input(line)\n",
"                # Iterating a string yields single characters, so a plain\n",
"                # membership test is enough to filter them.\n",
"                for token in line:\n",
"                    if is_hex(token) and token not in skipped_chars:\n",
"                        parsed_hex.append(parse_hex(token))\n",
"                doc.append(header_lim(parsed_hex))\n",
"    return doc\n",
"\n",
"\n",
"def n_gram(docs):\n",
"    \"\"\"Apply a gensim ``Phrases`` bigram model to every document.\n",
"\n",
"    The model is fitted on ``docs`` with ``min_count=10`` and each document is\n",
"    then passed through it. Only bigrams are built; no trigram pass is made.\n",
"\n",
"    Returns:\n",
"        A new list holding the transformed documents.\n",
"    \"\"\"\n",
"    phrase_model = Phrases(docs, min_count=10)\n",
"    transformed = []\n",
"    for tokens in docs:\n",
"        transformed.append(phrase_model[tokens])\n",
"    return transformed\n",
"\n",
"\n",
"def filter_tokens(dictionary):\n",
"    \"\"\"Prune tokens from ``dictionary`` by document frequency.\n",
"\n",
"    Calls ``dictionary.filter_extremes`` with ``no_below=10`` and\n",
"    ``no_above=0.2`` and returns the same dictionary object.\n",
"    \"\"\"\n",
"    thresholds = {\"no_below\": 10, \"no_above\": 0.2}\n",
"    dictionary.filter_extremes(**thresholds)\n",
"    return dictionary\n",
"\n",
"\n",
"def create_dict(docs):\n",
"    \"\"\"Build a gensim ``Dictionary`` from the tokenised documents.\"\"\"\n",
"    return Dictionary(docs)\n",
"\n",
"\n",
"def create_corpus(docs):\n",
"    \"\"\"Convert tokenised documents into a TF-IDF weighted bag-of-words corpus.\n",
"\n",
"    A fresh ``Dictionary`` is built from ``docs``, each document is turned\n",
"    into its ``doc2bow`` representation (token id with frequency), and the\n",
"    result is weighted by ``tf_idf``.\n",
"    \"\"\"\n",
"    vocabulary = Dictionary(docs)\n",
"    bow_corpus = []\n",
"    for tokens in docs:\n",
"        bow_corpus.append(vocabulary.doc2bow(tokens))\n",
"    return tf_idf(bow_corpus)\n",
"def tf_idf(corpus):\n",
"    \"\"\"Fit a ``TfidfModel`` on ``corpus`` and return the corpus wrapped by it.\"\"\"\n",
"    model = TfidfModel(corpus)\n",
"    weighted = model[corpus]\n",
"    return weighted\n",
"\n",
"def similarity_matrix(corpus, dictionary):\n",
"    \"\"\"Print the similarity of every document against the whole corpus.\n",
"\n",
"    Args:\n",
"        corpus: Corpus to index and to query with.\n",
"        dictionary: Its length is used as the feature count of the index.\n",
"\n",
"    Returns:\n",
"        The gensim ``Similarity`` index that was built, so callers can reuse it.\n",
"    \"\"\"\n",
"    index_temp = get_tmpfile(\"index\")\n",
"    index = Similarity(index_temp, corpus, num_features=len(dictionary))  # create index\n",
"    for sims in index[corpus]:\n",
"        # ``pprint`` is imported as a module, so the function must be qualified.\n",
"        pprint.pprint(sims)\n",
"    return index\n",
"def visualise_LDA(lda_model, corpus, dictionary):\n",
"    \"\"\"Render the LDA model with pyLDAvis and save it as ``LDA_Visualisation.html``.\n",
"\n",
"    Raises:\n",
"        ImportError: If pyLDAvis is not installed.\n",
"    \"\"\"\n",
"    # Imported here because the notebook-level import is commented out; without\n",
"    # it the calls below fail with a NameError.\n",
"    # NOTE(review): newer pyLDAvis releases expose this module as\n",
"    # pyLDAvis.gensim_models - confirm against the installed version.\n",
"    import pyLDAvis\n",
"    import pyLDAvis.gensim\n",
"\n",
"    warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n",
"    visualisation = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)\n",
"    pyLDAvis.save_html(visualisation, 'LDA_Visualisation.html')\n",
"\n",
"def majority_element(arr):\n",
"    \"\"\"Return the Boyer-Moore majority-vote candidate of ``arr``.\n",
"\n",
"    The candidate is the majority value whenever one exists; no second pass\n",
"    verifies it. An empty input yields ``None``.\n",
"    \"\"\"\n",
"    candidate = None\n",
"    votes = 0\n",
"    for item in arr:\n",
"        if votes == 0:\n",
"            candidate = item\n",
"            votes = 1\n",
"            continue\n",
"        votes += 1 if item == candidate else -1\n",
"    return candidate\n",
"\n",
"def write_result(lda_model, avg_topic_coherence, topic_dist):\n",
"    \"\"\"Write the topic distribution and average coherence to ``result4.txt``.\n",
"\n",
"    ``lda_model`` is accepted but not used. An existing file is overwritten.\n",
"    \"\"\"\n",
"    with open(\"result4.txt\", \"w\") as report:\n",
"        report.write(str(topic_dist) + \"\\n\")\n",
"        coherence_line = 'Average topic coherence: %.4f.' % avg_topic_coherence\n",
"        report.write(coherence_line + \"\\n\")\n",
"\n",
"### CHANGE here #####\n",
"\n",
"def normalise_pred(arr, true_dict, pred_dict):\n",
"    \"\"\"Pick the label of a cluster by its normalised frequency.\n",
"\n",
"    For every label in ``arr`` that also appears in ``true_dict`` the ratio\n",
"    ``pred_dict[label] / true_dict[label]`` is computed. The label with the\n",
"    highest ratio wins; ties go to the label seen first.\n",
"\n",
"    Args:\n",
"        arr: Labels of the members of one cluster.\n",
"        true_dict: Label -> count used as the denominator.\n",
"        pred_dict: Label -> count used as the numerator; must contain every\n",
"            label of ``arr`` that is in ``true_dict``.\n",
"\n",
"    Returns:\n",
"        The winning label.\n",
"\n",
"    Raises:\n",
"        ValueError: If no label in ``arr`` is present in ``true_dict``.\n",
"    \"\"\"\n",
"    fraction_array = []\n",
"    # Labels are recorded alongside their fractions. Indexing ``arr`` with a\n",
"    # position in ``fraction_array`` picks the wrong label as soon as one\n",
"    # entry of ``arr`` has been skipped.\n",
"    scored_labels = []\n",
"    for label in arr:\n",
"        if label in true_dict:\n",
"            fraction_array.append(pred_dict[label] / true_dict[label])\n",
"            scored_labels.append(label)\n",
"        else:\n",
"            print(\"no similarities for {}\".format(label))\n",
"    print(fraction_array)\n",
"    if not fraction_array:\n",
"        raise ValueError(\"no label in arr is present in true_dict\")\n",
"    # max() returns the first maximal position, which keeps the tie-breaking rule.\n",
"    best = max(range(len(fraction_array)), key=fraction_array.__getitem__)\n",
"    return scored_labels[best]\n",
"\n",
"\n",
"def count_element(array):\n",
"    \"\"\"Count how often each distinct element occurs.\n",
"\n",
"    Returns:\n",
"        A plain ``dict`` mapping element -> count, keyed in order of first\n",
"        appearance.\n",
"    \"\"\"\n",
"    # One Counter pass; converted so callers keep getting a plain dict\n",
"    # (a missing key raises KeyError instead of counting as 0).\n",
"    return dict(Counter(array))\n",
"\n"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "zM4Xhrkh8Bdk",
"colab_type": "text"
},
"source": [
"# Setup for hierarchical clustering"
]
},
{
"cell_type": "code",
"metadata": {
"id": "_iZUpvrU7-L2",
"colab_type": "code",
"colab": {}
},
"source": [
"import pickle\n",
"\n",
"# Which top-level cluster (value of `pred_labels`) to break down further.\n",
"CLUSTER_ID = 4  # earlier runs used 0 and 2\n",
"\n",
"# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.\n",
"with open(\"/content/drive/My Drive/DSO Presentation/Models/docs.txt\", \"rb\") as fp:\n",
"    docs_ori = pickle.load(fp)\n",
"\n",
"with open(\"/content/drive/My Drive/DSO Presentation/Models/y_true.txt\", \"rb\") as fp:\n",
"    y_ori = pickle.load(fp)\n",
"\n",
"# with open(\"/content/drive/My Drive/DSO Presentation/Models/y_pred.txt\", \"rb\") as fp:\n",
"#     y_pred = pickle.load(fp)\n",
"\n",
"Y_labels = pd.read_csv(r'/content/drive/My Drive/DSO Presentation/Models/labels.csv')\n",
"# X_dist = pd.read_csv (r'/content/drive/My Drive/DSO Presentation/Models/topic_dist.csv')\n",
"\n",
"# Rows that the previous clustering assigned to CLUSTER_ID.\n",
"# (The variable keeps its historical name even though the id is configurable.)\n",
"cluster_zero = Y_labels.loc[Y_labels['pred_labels'] == CLUSTER_ID]\n",
"\n",
"# Counting values of cluster\n",
"# print(cluster_zero['true_labels'].value_counts())\n",
"\n",
"# Row labels of the sub-cluster; they are used as positions into y_ori and docs_ori.\n",
"sub_index = cluster_zero.index.values.tolist()\n",
"\n",
"# Message types and documents of the sub-cluster\n",
"msg_type = [y_ori[index] for index in sub_index]\n",
"docs = [docs_ori[index] for index in sub_index]"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "njQa79US_nbu",
"colab_type": "text"
},
"source": [
"# LDA"
]
},
{
"cell_type": "code",
"metadata": {
"id": "3KEY5clkqlP4",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 104
},
"outputId": "e310ea97-7dee-4efa-d2cb-13e66a763548"
},
"source": [
"# Cluster the message types using Latent Dirichlet Allocation.\n",
"true_dict = count_element(msg_type)\n",
"# NOTE: this overwrites `docs`; re-running the cell feeds already-merged\n",
"# documents back into n_gram, so re-run the setup cell first.\n",
"docs = n_gram(docs)\n",
"dictionary = create_dict(docs)\n",
"corpus = create_corpus(docs)\n",
"\n",
"# Set training parameters.\n",
"num_topics = 5\n",
"chunksize = 1 # how many documents are processed at a time\n",
"passes = 50 # how often we train the model on the entire corpus.\n",
"iterations = 1000\n",
"eval_every = 1 # For logging\n",
"minimum_probability = 0.0\n",
"n_clusters = 5\n",
"SEED = 42 # fixed so that Restart & Run All reproduces the same topics\n",
"\n",
"\n",
"# Make a index to word dictionary.\n",
"temp = dictionary[0] # id2token is filled lazily; one lookup populates it\n",
"id2word = dictionary.id2token\n",
"\n",
"# Train the model on the corpus.\n",
"lda_model = LdaModel(\n",
"    corpus=corpus,\n",
"    id2word=id2word,\n",
"    chunksize=chunksize,\n",
"    alpha='auto',\n",
"    eta='auto',\n",
"    iterations=iterations,\n",
"    num_topics=num_topics,\n",
"    passes=passes,\n",
"    eval_every=eval_every,\n",
"    random_state=SEED,\n",
")\n",
"\n",
"temp_file = datapath(\"model\")\n",
"lda_model.save(temp_file) # saving the model in \"tempfile\"\n",
"\n",
"top_topics = lda_model.top_topics(corpus)\n",
"# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.\n",
"avg_topic_coherence = sum(coherence for _, coherence in top_topics) / num_topics\n",
"\n",
"# Get topic distribution and forms a list\n",
"topic_dist = [lda_model.get_document_topics(item, minimum_probability=minimum_probability) for item in corpus]"
],
"execution_count": 25,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/gensim/models/phrases.py:598: UserWarning: For a faster implementation, use the gensim.models.phrases.Phraser class\n",
" warnings.warn(\"For a faster implementation, use the gensim.models.phrases.Phraser class\")\n",
"/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:253: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n",
" 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-Eg4FizB2plH",
"colab_type": "text"
},
"source": [
"# K Means"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4puKE1D_tISJ",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 390
},
"outputId": "06b732f5-1b6c-439d-8a15-e8cdcb284ca5"
},
"source": [
"from sklearn.cluster import KMeans\n",
"from sklearn.decomposition import PCA\n",
"\n",
"topic_dist = [lda_model.get_document_topics(item, minimum_probability=0.0) for item in corpus]\n",
"\n",
"# One row per document, one column per topic. Each entry of topic_dist is a\n",
"# list of (topic_id, probability) pairs; only the probability is kept.\n",
"# Building the frame directly avoids writing through the rows handed out by\n",
"# DataFrame.iterrows(), which is not guaranteed to update the frame.\n",
"X = pd.DataFrame([[probability for _, probability in doc_topics] for doc_topics in topic_dist])\n",
"\n",
"# Setting Parameters\n",
"n_init = 10\n",
"\n",
"# Using PCA with Kmeans\n",
"# PCA first to reduce dimensionality for visualisation\n",
"pca = PCA(n_components=2)\n",
"PC = pca.fit_transform(X)\n",
"\n",
"# Applying Kmeans to get labels (cluster no); `kmeans` holds one label per document.\n",
"# random_state is fixed so that re-running the cell reproduces the clusters.\n",
"kmeans = KMeans(n_clusters=n_clusters, n_init=n_init, random_state=42).fit_predict(PC)\n",
"\n",
"# # Using Kmeans only\n",
"# kmeans = KMeans(n_clusters=num_topics, n_init=10).fit_predict(X)\n",
"\n",
"\n",
"# Dataframe with labels\n",
"Y = pd.DataFrame()\n",
"Y[\"true_labels\"] = msg_type\n",
"cluster_predicted = kmeans.tolist()\n",
"Y[\"pred_labels\"] = cluster_predicted\n",
"\n",
"# True labels of the members of each predicted cluster, in order of first appearance\n",
"clustered_labels = {}\n",
"for true_label, cluster_id in zip(msg_type, cluster_predicted):\n",
"    clustered_labels.setdefault(cluster_id, []).append(true_label)\n",
"\n",
"y_pred = []\n",
"y_true = []\n",
"for members in clustered_labels.values():\n",
"    # Labelling the predicted cluster: every member gets the cluster's label.\n",
"    # FIXME (from the original author): this labelling step is known to contain a mistake.\n",
"    pred_dict = count_element(members)\n",
"    maj = normalise_pred(members, true_dict, pred_dict)\n",
"    # maj = majority_element(members)\n",
"    y_pred.extend([maj] * len(members))\n",
"    y_true.extend(members)\n",
"\n",
"fig, ax = plt.subplots()\n",
"\n",
"# Plotting Kmeans: one scatter series per cluster\n",
"for cluster_id in np.unique(kmeans):\n",
"    in_cluster = kmeans == cluster_id\n",
"    ax.scatter(PC[in_cluster, 0], PC[in_cluster, 1], label=cluster_id)\n",
"\n",
"ax.set_xlabel(\"PC 1\")\n",
"ax.set_ylabel(\"PC 2\")\n",
"ax.set_title(\"K-Means clusters of LDA topic distributions (PCA projection)\")\n",
"ax.legend();\n",
"\n"
],
"execution_count": 26,
"outputs": [
{
"output_type": "stream",
"text": [
"[0.5714285714285714, 0.5714285714285714, 0.5714285714285714, 0.5714285714285714, 0.5, 0.5, 0.5, 0.5, 0.5]\n",
"[0.14285714285714285, 0.5, 0.5, 0.5, 0.5, 0.5]\n",
"[0.2857142857142857, 0.2857142857142857, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]\n",
"[0.5, 0.5, 0.5, 0.5, 0.5, 0.5]\n",
"[1.0]\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<matplotlib.legend.Legend at 0x7f53a5968a90>"
]
},
"metadata": {
"tags": []
},
"execution_count": 26
},
{
"output_type": "display_data",
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXIAAAEQCAYAAACtGP9YAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAbf0lEQVR4nO3dfXBVVZrv8e9jEkkKbHJRqLzRA+iIIB15SWsjXd4GWnDEjI5OU1r2VHu1yn+cMdgzTElZbWewbkk11Q2xnKkpauyanm4bi3EQDFFoWrjXub5OJIj4gi/oFAmhyGgFhU40hOf+cRKGxLwd9j5nn73P71NFHc86OWs/WNSPzVprr2XujoiIxNcFURcgIiLBKMhFRGJOQS4iEnMKchGRmFOQi4jEnIJcRCTmIgtyM/ulmR03s4Mh9LXYzPaf86vbzG4Jo04RkVxnUa0jN7PrgJPAv7j7nBD7nQR8CFS5+x/C6ldEJFdFdkfu7i8Cn53bZmaXmtlOM3vDzP7dzK44j67/HHheIS4i+SLXxsg3AX/l7guAvwH+4Tz6uB3YHGpVIiI5rDDqAvqZ2QTgWuBfzay/eVzfZ7cCa4f4Wpu7Lz+nj3LgW8CuzFYrIpI7cibISf3roNPd5w7+wN23AlvH0MdK4Bl37wm7OBGRXJUzQyvu/jnwsZn9AMBSrkqzmzvQsIqI5Jkolx9uBl4BZppZq5ndA9wJ3GNmbwJvAzen0d80YCrwf8OvVkQkd0W2/FBERMKRM0MrIiJyfiKZ7Lzkkkt82rRpUVxaRCS23njjjf9y98mD2yMJ8mnTptHc3BzFpUVEYsvM/nOodg2tiIjEnIJcRCTmFOQiIjGXS092iohkVE9PD62trXR3d0ddyoiKi4upqqqiqKhoTD+vIBeRvNHa2spFF13EtGnTOGdPp5zi7nz66ae0trYyffr0MX1HQyuSNScaG/lgyVLenTWbD5Ys5URjY9QlSZ7p7u7m4osvztkQBzAzLr744rT+1aA7csmKE42NtP/kYbzvD+fpo0dp/8nDAEysrY2yNMkzuRzi/dKtUXfkQR3YAhvmQH1p6vXAlqgryknHN2w8G+L9vLub4xs2RlSRSHIoyIM4sAUa74cTRwBPvTberzAfwun29rTaRZJs586dzJw5k8suu4x169YF7k9BHsQLa6Gna2BbT1eqXQYoLC9Pq10kqXp7e7nvvvt4/vnneeedd9i8eTPvvPNOoD4V5EGcaE2vPY9NeWAVVlw8oM2Ki5nywKqIKhIZ3baWNhat28P0B5tYtG4P21raAvf5+uuvc9lllzFjxgwuvPBCbr/9drZv3x6oTwV5EBOr0mvPYxNrayl/ZC2FFRVgRmFFBeWPrNVEp+SsbS1trNn6Fm2dXTjQ1tnFmq1vBQ7ztrY2pk6devZ9VVUVbW3B+tSqlSCWPpwaEz93eKWoJNUuXzOxtlbBLbGxftchunp6B7R19fSyftchbplXGVFVQ9MdeRDVK6H2MZg4FbDUa+1jqXYRibWjnV1ptY9VZWUlR44cOfu+tbWVyspgfzHojjyo6pUKbpEEqigtoW2I0K4oLQnU77e//W0++OADPv74YyorK3nqqaf47W9/G6hP3ZGLiAxh9fKZlBQVDGgrKSpg9fKZgfotLCzk8ccfZ/ny5cyaNYuVK1dy5ZVXBusz0LdFsuhEYyPHN2zkdHs7heXlTHlglcbcJWP6x8HX7zrE0c4uKkpLWL18Zijj4zfeeCM33nhj4H76KcglFvSIv0ThlnmVOTexORQNrUgs6BF/keEFDnIzKzaz183sTTN728z+LozCRM6lR/xFhhfGHfmXwBJ3vwqYC9xgZt8JoV+Rs/SIv8jwAge5p5zse1vU98uD9ityLj3iLzK8UMbIzazAzPYDx4Hd7v7aED9zr5k1m1lzR0dHGJeVPKJH/EWGF8qqFXfvBeaaWSnwjJnNcfeDg35mE7AJoK
amRnfskjY94i9Jcffdd7Njxw6mTJnCwYMHR//CKEJdteLuncBe4IYw+xURSZK77rqLnTt3htZfGKtWJvfdiWNmJcD1wHtB+xURiVyGTgC77rrrmDRpUih9QThDK+XAr8ysgNRfDFvcfUcI/YqIRKf/BLD+3U37TwCDnNtfKXCQu/sBYF4ItYiI5I6RTgDLsSDXk50iIkOJ0QlgCnIRkaHE6AQwBbmIyFCWPpw68etcIZ0Adscdd7Bw4UIOHTpEVVUVTzzxRKD+tPuhiMhQ+sfBX1ibGk6ZWJUK8RDGxzdv3hy4j3MpyEVEhhOTE8A0tCIiEnMKchGRmFOQi4jEnIJcRCTmFOQiIjGnIBcRyaIjR46wePFiZs+ezZVXXklDQ0PgPrX8UEQkiwoLC/n5z3/O/Pnz+eKLL1iwYAHXX389s2fPPu8+dUcuIjKMpsNNLHt6GdW/qmbZ08toOtwUuM/y8nLmz58PwEUXXcSsWbNoa2sL1KfuyEVEhtB0uIn6l+vp7u0GoP1UO/Uv1wOwYsaKUK7xySef0NLSwjXXXBOoH92Ri4gMoWFfw9kQ79fd203DvuBj2gAnT57ktttuY+PGjXzjG98I1JeCXERkCMdOHUurPR09PT3cdttt3Hnnndx6662B+1OQy6hONDbywZKlvDtrNh8sWcqJxsaoSxLJuLLxZWm1j5W7c8899zBr1ix+/OMfB+qrn4JcRnSisZH2nzzM6aNHwZ3TR4/S/pOHFeaSeHXz6yguKB7QVlxQTN38ukD9vvTSS/z6179mz549zJ07l7lz5/Lcc88F6lOTnTKi4xs24t0Dxwm9u5vjGzYysbY2oqpEMq9/QrNhXwPHTh2jbHwZdfPrAk90fve738XdwyjxLAW5jOh0e3ta7SJJsmLGitBWqGSShlZkRIXl5Wm1i0j2KchlRFMeWIUVDxwntOJipjywKqKK0qfJWkk6Da3IiPrHwY9v2Mjp9nYKy8uZ8sCq2IyP90/W9o/z90/WArH5PYiMRkEuo5pYWxvb0NNkreSDwEMrZjbVzPaa2Ttm9raZBVubIxIiTdZKPghjjPw08NfuPhv4DnCfmZ3/Nl4iIdJkreSa7u5urr76aq666iquvPJKfvrTnwbuM3CQu3u7u+/r++8vgHeByqD9ioQhCZO1kizjxo1jz549vPnmm+zfv5+dO3fy6quvBuoz1FUrZjYNmAe8NsRn95pZs5k1d3R0hHlZkWFNrK2l/JG1FFZUgBmFFRWUP7JW4+MyJplY8WRmTJgwAUjtudLT04OZBeoztMlOM5sA/Buwyt0/H/y5u28CNgHU1NSE+1iTyAjiPFkr0cnkiqfe3l4WLFjAhx9+yH333Zcb29iaWRGpEH/S3beG0aeISJRGWvEUVEFBAfv376e1tZXXX3+dgwcPBuovjFUrBjwBvOvuvwjan4hILsjGiqfS0lIWL17Mzp07A/UTxh35IuAvgCVmtr/v140h9CsiEplMrXjq6Oigs7MTgK6uLnbv3s0VV1wRqM/AY+Tu/v+AYCP1IiI5ZsoDqwaMkUM4K57a29v50Y9+RG9vL2fOnGHlypXcdNNNgfrUk50iIkPI1PYU1dXVtLS0hFHiWQpyEZFhxGXFk3Y/lKzSToQi4dMduWSNdiIUyQzdkUvWZHJdrkg+U5BL1mgnQpHMUJBL1mgnQpHMUJBL1mgnQpH/1tvby7x58wKvIQdNdkoWxf3YOJEwNTQ0MGvWLD7//Gt7DKZNQS5ZFZd1uSIA7792jFe2f8TJz75kwqRxLLz5Ui6/pixwv62trTQ1NfHQQw/xi18E36JKQysiIkN4/7Vj7H3yPU5+9iUAJz/7kr1Pvsf7rx0L3PeqVav42c9+xgUXhBPBCnKJhB4Mklz3yvaPOP3VmQFtp786wyvbPwrU744dO5gyZQoLFiwI1M+5NLQiWacHgyQO+u/Ex9o+Vi+99BLPPvsszz33HN3d3Xz++ef88Ic/5De/+c1596k7csk6PR
gkcTBh0ri02sfq0UcfpbW1lU8++YSnnnqKJUuWBApxUJBH68AW2DAH6ktTrwe2RF1RVujBIImDhTdfSuGFAyOy8MILWHjzpRFVNDwFeVQObIHG++HEEcBTr43350WYB30wSOPrkg2XX1PG4juvOHsHPmHSOBbfeUUoq1b6fe9732PHjh2B+9EYeVReWAs9XQPberpS7dUro6kpS4Js2K/xdcmmy68pCzW4M0V35FE50Zpee4JMrK2l/JG1FFZUgBmFFRWUP7J2TEGs8XWRr9MdeVQmVvUNqwzRngfO98Egja9LUO5O6sz43OXuaf287sijsvRhKCoZ2FZUkmqXYWnjLQmiuLiYTz/9NO2gzCZ359NPP6V40L5EI9EdeVT6x8FfWJsaTplYlQrxhI+PB5WpA3ElP1RVVdHa2kpHR0fUpYyouLiYqqqx/+tcQR6l6pUK7jRp4y0JoqioiOnTp0ddRugU5BI72nhLZCCNkYuIxFwoQW5mvzSz42Z2MIz+RERk7MK6I/9n4IaQ+hIRkTSEEuTu/iLwWRh9iYhIerI2Rm5m95pZs5k15/rSHxGROMlakLv7JnevcfeayZMnZ+uyIiKJp1UrIiIxpyAXEYm5sJYfbgZeAWaaWauZ3RNGvyK5QPufS64L5clOd78jjH5Eco32P5c40NCKyAi0/7nEgYJcZATa/1ziQEEuMgLtfy5xoCAXGcGUB1Zhgzb41/7nkmu0ja3ICLT/ucSBglxkFNr/XHKdhlZERGJOQS4iEnMaWhHJM9ta2li/6xBHO7uoKC1h9fKZ3DKvMuqyJAAFuUge2dbSxpqtb9HV0wtAW2cXa7a+BaAwjzENrYjkkfW7Dp0N8X5dPb2s33UooookDApykTxytLMrrXaJBwW5SB6pKC1Jq13iIVFB3nS4iWVPL6P6V9Use3oZTYeboi5JJKesXj6TkqKCAW0lRQWsXj4zoookDImZ7Gw63ET9y/V096Z2qms/1U79y/UArJixIsLKRHJH/4SmVq0ki7l71i9aU1Pjzc3Nofa57OlltJ/6+o505ePL+d2f/y7Ua4mIRMHM3nD3msHtiRlaOXbqWFrtIiJJkZggLxtfllZ7bB3YAhvmQH1p6vXAlqgrEpGIJSbI6+bXUVwwcLvR4oJi6ubXRVRRBhzYAo33w4kjgKdeG+9XmIvkucQE+YoZK6i/tp7y8eUYRvn4cuqvrU/WROcLa6Fn0Hrfnq5Uu4jkrcSsWoFUmCcquAc70Zpeu4jkhcTckeeFiVXptYtIXlCQx8nSh6Fo0BN4RSWpdhHJWwryOKleCbWPwcSpgKVeax9LtYtI3gpljNzMbgAagALgn9x9XRj9yhCqVyq4RWSAwHfkZlYA/D3wJ8Bs4A4zmx20XxERGZswhlauBj5098Pu/hXwFHBzCP2KiMgYhBHklcCRc9639rWJiEgWZG2y08zuNbNmM2vu6OjI1mVFRBIvjCBvA6ae876qr20Ad9/k7jXuXjN58uQQLisiIhBOkP8H8MdmNt3MLgRuB54Nod+s0qEUIhJXgZcfuvtpM/tLYBep5Ye/dPe3A1eWRTqUQkTiLJQxcnd/zt0vd/dL3f1/h9FnNjXsazgb4v26e7tp2NcQUUUiImOnJzvRoRQiEm8KcvLoUAoRSSQFOXlyKIWIJFai9iM/X/0Tmg37Gjh26hhl48uom1+niU4RiQUFeZ/EH0ohIokVq6EVrfUWEfm62NyRa623iMjQYnNHrrXeIiJDi02Qa623iMjQYhPkWustIjK02AS51nqLJNu2ljYWrdvD9AebWLRuD9tavraJqgwjNpOdWustkrKtpY31uw5xtLOLitISVi+fyS3z4n2Wy7aWNtZsfYuunl4A2jq7WLP1LYDY/96ywdw96xetqanx5ubmrF9XJO4GBx5ASVEBj976rVgH3qJ1e2jr7Ppae2VpCS89uCSCinKTmb3h7jWD22MztCIisH7XoQEhDtDV08v6XYciqigcR4cI8Z
HaZSAFuUiMJDXwKkpL0mqXgRTkIjGS1MBbvXwmJUUFA9pKigpYvXxmRBXFi4JcJEaSGni3zKvk0Vu/RWVpCUZqbDzu4/7ZFJtVK2PRdLhJq1ok0fqDLWmrViD1e0vC7yMKiQly7cUi+UKBJ4MlZmhFe7GISL5KTJBrLxYRyVeJCfKM7sVyYAtsmAP1panXA1uC9ykiEpLEBHnG9mI5sAUa74cTRwBPvTberzAXkZyRmCBfMWMF9dfWUz6+HMMoH19O/bX1wSc6X1gLPYMetujpSrWLiOSAxKxagQydu3miNb12EZEsC3RHbmY/MLO3zeyMmX1tI5dEmFiVXruISJYFHVo5CNwKvBhCLecto4cyL30YigY9/lxUkmoXEckBgYZW3P1dADMLp5rzkPEHgapXpl5fWJsaTplYlQrx/nYRkYhlbYzczO4F7gX45je/GVq/Iz0IFNp4efVKBbeI5KxRg9zMfg8MtRj7IXffPtYLufsmYBOkDpYYc4Wj0INAIpLvRg1yd/9+Ngo5X2Xjy2g/1T5ku4hIrnj/tWO8sv0jTn72JRMmjWPhzZdy+TXh5FTs15GP9CBQRidBRUTG6P3XjrH3yfc4+dmXAJz87Ev2Pvke778WzshB0OWHf2ZmrcBCoMnMdoVSVRqGexAIoP7letpPteP42UlQhbmIZNsr2z/i9FdnBrSd/uoMr2z/KJT+g65aeQZ4JpRKAhjqQaBlTy/L/CSoiMgY9N+Jj7U9XbEfWhmOJkFFJFdMmDQurfZ0JTbIM7oboohIGhbefCmFFw6M28ILL2DhzZeG0n9ig3ysuyFqQlREMu3ya8pYfOcVZ+/AJ0wax+I7rwht1UoiNs0a6azOkc7w1PFwIpItl19TFlpwDxb7IB8tjEcK5Kw8FSoikmGxH1oJclanJkRFJAliH+RBwlgToiKSBLEP8iBhnLHj4UREsij2QR4kjDN2PFw/HdosIlkQ+8nOsaxOGe37GZnY7D+0uf+8z/5Dm0Fb4opIqMw9tB1lx6ympsabm5uzft2s2jAnFd6DTZwKDxzMfj0iEntm9oa7f+1YzdgPreQsHdosIlmiIM8UHdosIlmiIM8UHdosIlmiIM+U6pVQ+1hqTBxLvdY+polOEQld7Fet5DQd2iwiWaA7chGRmNMduYiEaltLG+t3HeJoZxcVpSWsXj6TW+ZVRl1WoinIRSQ021raWLP1Lbp6egFo6+xizda3ABTmGaShFREJzfpdh86GeL+unl7W7zoUUUX5QUEuIqE52tmVVruEQ0EuIqGpKC1Jq13CoSAXySPbWtpYtG4P0x9sYtG6PWxraQu1/9XLZ1JSVDCgraSogNXLZ4Z6HRlIk50ieSIbE5H9/WjVSnYFCnIzWw/UAl8BHwH/y907wyhMRMI10kRkmEF7y7xKBXeWBR1a2Q3Mcfdq4H1gTfCSRCQTNBGZXIGC3N1/5+6n+96+CmhrP5EcpYnI5ApzsvNu4PnhPjSze82s2cyaOzo6QrysiIyFJiKTa9QxcjP7PTDUScYPufv2vp95CDgNPDlcP+6+CdgEqROCzqtaETlvmohMrlGD3N2/P9LnZnYXcBOw1KM4Ny5XHdgCL6xNnQg0sSq1D7l2QpSIaSIymYKuWrkB+Fvgf7r7H8IpKQF08LKIZFHQMfLHgYuA3Wa238z+MYSa4u+Ftf8d4v16ulLtIiIhC3RH7u6XhVVIoujgZRHJIj2inwk6eFlEskhBngk6eFlEsig2Qd50uIllTy+j+lfVLHt6GU2Hm6IuaXg6eFlEsigWm2Y1HW6i/uV6unu7AWg/1U79y/UArJixIsLKRqCDl0UkS2JxR96wr+FsiPfr7u2mYV9DRBWJiOSOWAT5sVPH0moXEcknsQjysvFD7RAwfLuISD6JRZDXza+juKB4QFtxQTF18+siqkhEJHfEYrKzf0KzYV8Dx04do2x8GXXz63J3olNEJItiEeSQCnMFt0j2bWtp046JOS42QS4i2ZeNcz
4luFiMkYtINEY651Nyh4JcRIalcz7jQUEuIsPSOZ/xoCAXkWHpnM940GSniAxL53zGg4JcREakcz5zn4ZWRERiTkEuIhJzCnIRkZhTkIuIxJyCXEQk5szds39Rsw7gP7NwqUuA/8rCdcIUt5pVb+bFrWbVmzl/5O6TBzdGEuTZYmbN7l4TdR3piFvNqjfz4laz6s0+Da2IiMScglxEJOaSHuSboi7gPMStZtWbeXGrWfVmWaLHyEVE8kHS78hFRBJPQS4iEnOJD3IzW29m75nZATN7xsxKo65pJGb2AzN728zOmFnOLokysxvM7JCZfWhmD0Zdz2jM7JdmdtzMDkZdy1iY2VQz22tm7/T9eaiLuqbRmFmxmb1uZm/21fx3Udc0FmZWYGYtZrYj6lrOV+KDHNgNzHH3auB9YE3E9YzmIHAr8GLUhQzHzAqAvwf+BJgN3GFms6OtalT/DNwQdRFpOA38tbvPBr4D3BeD/8dfAkvc/SpgLnCDmX0n4prGog54N+oigkh8kLv779z9dN/bV4GqKOsZjbu/6+65frLt1cCH7n7Y3b8CngJujrimEbn7i8BnUdcxVu7e7u77+v77C1JBk9ObgnvKyb63RX2/cno1hZlVASuAf4q6liASH+SD3A08H3URCVAJHDnnfSs5HjJxZmbTgHnAa9FWMrq+YYr9wHFgt7vnes0bgb8FzkRdSBCJOCHIzH4PlA3x0UPuvr3vZx4i9c/VJ7NZ21DGUq8IgJlNAP4NWOXun0ddz2jcvReY2zcX9YyZzXH3nJyXMLObgOPu/oaZfS/qeoJIRJC7+/dH+tzM7gJuApZ6DiycH63eGGgDpp7zvqqvTUJkZkWkQvxJd98adT3pcPdOM9tLal4iJ4McWAT8qZndCBQD3zCz37j7DyOuK22JH1oxsxtI/dPpT939D1HXkxD/AfyxmU03swuB24FnI64pUczMgCeAd939F1HXMxZmNrl/VZiZlQDXA+9FW9Xw3H2Nu1e5+zRSf4b3xDHEIQ+CHHgcuAjYbWb7zewfoy5oJGb2Z2bWCiwEmsxsV9Q1DdY3efyXwC5Sk3Bb3P3taKsamZltBl4BZppZq5ndE3VNo1gE/AWwpO/P7f6+O8dcVg7sNbMDpP6y3+3usV3SFyd6RF9EJOby4Y5cRCTRFOQiIjGnIBcRiTkFuYhIzCnIRUQCCntTNjPrPWe10qhLe7VqRUQkIDO7DjgJ/Iu7zwmhv5PuPmGsP687chGRgIbalM3MLjWznWb2hpn9u5ldkanrK8hFRDJjE/BX7r4A+BvgH9L4brGZNZvZq2Z2y2g/nIi9VkREcknfZmfXAv+a2m0BgHF9n90KrB3ia23uvrzvv//I3dvMbAawx8zecvePhrueglxEJHwXAJ3uPnfwB30boI24CZq7t/W9Hjaz/0NqG+Nhg1xDKyIiIevbcvhjM/sBpDZBM7OrxvJdM/sfZtZ/934JqX133hnpOwpyEZGAhtmU7U7gHjN7E3ibsZ+iNQto7vveXmCdu48Y5Fp+KCISc7ojFxGJOQW5iEjMKchFRGJOQS4iEnMKchGRmFOQi4jEnIJcRCTm/j/bKz4HUm+zwgAAAABJRU5ErkJggg==\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Cj2A9B8l7TWe",
"colab_type": "text"
},
"source": [
"K-Means elbow method (choosing the number of clusters)"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4fsXwYID7SdJ",
"colab_type": "code",
"colab": {}
},
"source": [
"# sse = {}\n",
"# for k in range(1, 10):\n",
"# kmeans = KMeans(n_clusters=k, max_iter=1000).fit(X)\n",
"# X[\"clusters\"] = kmeans.labels_\n",
"# #print(data[\"clusters\"])\n",
"# sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center\n",
"# plt.figure()\n",
"# plt.plot(list(sse.keys()), list(sse.values()))\n",
"# plt.xlabel(\"Number of cluster\")\n",
"# plt.ylabel(\"SSE\")\n",
"# plt.show()"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "TVjCRwA02cby",
"colab_type": "text"
},
"source": [
"# Metrics"
]
},
{
"cell_type": "code",
"metadata": {
"id": "uDTAiEVxWXFJ",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 408
},
"outputId": "34f1ac3e-3a27-407e-9525-90b3039dc3dc"
},
"source": [
"\n",
"# Finding the unique values in the truth. This will tell us the number of message types\n",
"unique_types_true = np.unique(np.array(y_true))\n",
"unique_clusters_true = len(unique_types_true) # number of unique message types\n",
"cluster_no = len(np.unique(np.array(y_pred))) # number of distinct labels assigned to clusters\n",
"accuracy = accuracy_score(y_true, y_pred) # Calculating accuracy score\n",
"\n",
"print(\"Number of Message types : {}\".format(unique_clusters_true))\n",
"# n_clusters is the K-Means setting; this line used to repeat the message-type count.\n",
"print(\"Number of Clusters : {}\".format(n_clusters))\n",
"print(\"Number of Clusters predicted : {}\".format(cluster_no))\n",
"print(\"Percentage Accuracy in predicted cluster : {:.2%} \".format(accuracy))\n",
"\n",
"\n",
"# Sorted so that the confusion-matrix axes come out in the same order on every run\n",
"class_labels = sorted(set(y_true))\n",
"cm = confusion_matrix(y_true, y_pred, labels=class_labels) # Creating a confusion matrix from y_true and y_pred\n",
"\n",
"# Calculating precision and recall\n",
"# Micro averaging pools all samples; with one label per sample, precision,\n",
"# recall and F-score then all equal the accuracy printed above.\n",
"metric_score_micro = precision_recall_fscore_support(y_true, y_pred, average=\"micro\")\n",
"print(\"Precision Score is {:.2f}\".format(metric_score_micro[0]))\n",
"print(\"Recall Score is {:.2f}\".format(metric_score_micro[1]))\n",
"print(\"F Score is {:.2f}\".format(metric_score_micro[2]))\n",
"\n",
"plt.imshow(cm, cmap=plt.cm.Blues, interpolation='nearest')\n",
"plt.colorbar()\n",
"plt.title('Confusion Matrix without Normalization')\n",
"plt.xlabel('Predicted')\n",
"plt.ylabel('Actual')\n",
"tick_marks = np.arange(len(class_labels)) # one tick per class\n",
"\n",
"# tick_marks\n",
"plt.xticks(tick_marks, class_labels, fontsize=6)\n",
"plt.yticks(tick_marks, class_labels, fontsize=7)\n",
"\n",
"# plotting text value inside cells; white text on the darker half of the colour range\n",
"thresh = cm.max() / 2.\n",
"for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
"    plt.text(j, i, format(cm[i, j], 'd'), horizontalalignment='center',\n",
"             color='white' if cm[i, j] > thresh else 'black')\n",
"\n",
"plt.show() # Plots the confusion matrix\n"
],
"execution_count": 28,
"outputs": [
{
"output_type": "stream",
"text": [
"Number of Message types : 4\n",
"Number of Clusters : 4\n",
"Number of Clusters predicted : 4\n",
"Percentage Accuracy in predicted cluster : 73.33% \n",
"Precision Score is 0.73\n",
"Recall Score is 0.73\n",
"F Score is 0.73\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAT0AAAERCAYAAAAJ9XDQAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3deZxVdf3H8dd7GHEDFcWNARPEVDDNwHLJpZ8bIGT209yDtChTUdNM27RyKe2XWpRGrj8XILRUcAHzlxJmIiKa4kYhymKKW0ooOH5+f5wzeBmHmXtn7rnL3Pfz8TiPuWe53/M5d8793O/5nnO+RxGBmVmtqCt3AGZmpeSkZ2Y1xUnPzGqKk56Z1RQnPTOrKU56ZlZTai7pSVpX0mRJb0ma1IFyjpE0rZixlYOkuyWNzLD870q6qpX5oyTNyGr91UzS1pJCUn06nsn/StJTkvYtdrmVqmKTnqSjJc2S9I6kJek//LNFKPowYHNgk4g4vL2FRMRNEXFgEeJZjaR90x39j82m75xOvz/Pcs6TdGNby0XE0Ii4vp3htikiLoyIr6YxrfYlzpqkFyTt38r8ps/6N82mz5A0KvMAC1SM/5Wk6ySd36zcgRFxf4eCqyIVmfQkfQu4DLiQJEFtBfwGOKQIxX8MeC4i3i9CWVl5Fdhd0iY500YCzxVrBUpU5P+/xJYBx0nauqMFlSqZWwdFREUNwIbAO8DhrSyzNklSXJwOlwFrp/P2BRYCZwCvAEuAr6TzfgSsAFam6zgBOA+4MafsrYEA6tPxUcA/gbeB+cAxOdNn5LxvD+AR4K307x458+4HfgI8mJYzDei5hm1riv9K4KR0WhdgEfBD4P6cZS8HXgL+DTwK7JVOH9JsOx/PieOCNI7lQP902lfT+VcAt+aU/zPgPkAtxLkAGJS+Pib9zAam4ycAt6WvV32+wIvpcu+kw+5NnyPwc+CN9DMemrOeXsAdwOvAPOBrOfOuA85v/tmlr28APki38x3grFY+618B1+ZMnwGMSl/XAd9Pt/cV4H+BDZvtKyek2zY93Z4HgUuBN0n2nT3S6S+lZYzMWdfBwGPp//Al4LxW9sXc/9XjOZ/jO+ly+6bzJgEvk+yL03P+L6NJ9okV6Xsmp9NfAPbvyHermoZK/KXfHVgH+GMry3wP2A34JLAz8GmSHbPJFiTJs4Fkh/y1pB4RcS5J7XFiRHSLiKtbC0TS+sAvSb6E3Ul23jktLLcxcGe67CbAL4A7m9XUjga+AmwGdAXObG3dJF+uL6evDwKeJNkJcz1C8hlsDNwMTJK0TkTc02w7d855z3EkO393ki9yrjOAT6TtbHuRfHYjI93jm3mA5EsAsA/Jl3vvnPEHWnhP0/yN0rgeSsc/AzwL9AQuBq6WpHTeBJIvWi+SpokLJf1XC2WvJiKOI0lEI9J1XdzK4hcA/y1puxbmjUqHzwH9gG7A2GbL7APsQPJ/atqeJ0j2hZvTbdiV5EfmWGCspG7psstI/s8bkSTAEyV9IY/t2zndrm7At0g+v9np7LuBbUn2tdnATel7xqWvL07fO6KFotv13Wor3kpSiUlvE2BptH74eQzw44h4JSJeJanBHZczf2U6f2VE3EXyq9bSDp2PD4AdJa0bEUsi4qkWljkYeD4iboiI9yNiPPAMkLtTXRsRz0XEcuD3JDvVGkXEX4GN0y/il0mSYPNlboyI19J1/g/Jr3Rb23ldRDyVvmdls/L+Q/I5/gK4ETglIhauoZwHSL7sAHsBF+WMrynprcmCiPhdRDQC1wNbAptL6gPsCXwnIt6NiDnAVXz4Y1AUEfEySc36xy3MPgb4RUT8MyLeAc4Bjmx2KHteRCxL/7cA8yPi2nR7JgJ9SPbH9yJiGklNq3+67vsj4u8R8UFEPAGM58PPsU1pO/f5wOcj4t9pmddExNsR8R5JTXtnSRvmWWQpv1tlUYlJ7zWgZxvtI71YvZ
ayIJ22qoxmSfM/JL/QBYmIZcARwDeAJZLulLR9HvE0xdSQM/5yO+K5ATiZpJbxkZqvpDMlPZ2eiX6T5Be4ZxtlvtTazIh4mKTWJpLkvCYPAHtJ2pLk8Pv3wJ5p29iGtFAjbsWqzyZNvJB8Pr2A1yPi7Zxlm3+uxfIz4CBJOzeb3tK+Vk/S1tyk+Wf6r5zXywEiovm0bgCSPiPpz5JelfQWyb7W1v+Q9L19SD73kRHxXDqti6SfSvqHpH+THLqSb5mU6LtVTpWY9B4C3gNaq+IvJjkh0WQrPnrol69lwHo541vkzoyIqRFxAEnt4xngd3nE0xTTonbG1OQG4JvAXTnJAID08PMs4EtAj4jYiKQNp+mwcE3d57TarY6kk0hqjIvT8lsuJGIeyQ5/CjA9rWW8THLoPCMiPih03S1YTFLb7Z4zLfdzbfV/V8j6IuI1kvarn7QQQ/N97X1WT2wd6aroZpI2yz4RsSFJjVOtvyW59Aq4DbgsIu7OmXU0yQm//Ul+fLZuekuesRbzu1WRKi7pRcRbJA32v5b0BUnrSVpL0lBJTe0y44HvS9pUUs90+TYvz1iDOcDekrZKDwHOaZohaXNJh6Rte++RVOVb+jLfBXw8vcymXtIRwABgSjtjAiAi5pMc6nyvhdndSb58rwL1kn4IbJAz/1/A1oWcoZX0cZJDpWNJDmnOktTaYfgDJDXRpkPZ+5uNN/cqyefXL594IuIl4K/ARZLWkbQTSTtS0/96DjBM0saStgBOa1bEv/JdV+oXJO22O+RMGw+cLqlv2g7X1FZarLP/3Ulqs+9K+jRJ0srHNcAzLbRVdifZV18j+UG4sNn8tj6TYn63KlLFJT2AtH3qWyQNqK+SHD6cTPLLBskXcxZJY/HfSRprz/9oSXmt616SdpcnSM6A5iaqujSOxSRnD/cBTmyhjNeA4SQnAl4jqSENj4il7YmpWdkzIqKlX9qpwD0kl7EsAN5l9cOspguvX5M0mzakzQk3Aj+LiMcj4nngu8ANktZew9seIPmSTV/DePNt+Q/p2WNJb0rara24gKNIaiuLSQ7xz42IP6XzbiA5i/kCyRnxic3eexHJF/hNSW2dOCKtrV5McmKoyTXpeqaTnFl+l6R2WyzfBH4s6W2SBNNak0KuI4FD0+tYm4a9SNp+F5DUhucCf2v2vquBAelnchsfVbTvVqVSyyfmzMw6p4qs6ZmZZcVJz8yqgqRrJL0i6cmcaZdIekbSE5L+KGmjtspx0jOzanEdyd1Gue4FdoyInUjat89p/qbmnPTMrCpExHSSE4q506blnEn/G9C7rXKq9gZp1a8b6tq97QWrzC47bFXuEMwAWLDgBZYuXdrmNYOt6bLBxyLeX972gkAsf/UpkrPjTcalt87l63g+egb/I6o36XXtztrbfancYRTdgw83v63TrDz2/MzgDpcR7y/P+3v67pxfvxsR7VqppO+RXLd6U1vLVm3SM7NqIMi4B7O078PhwH5r6BxjNU56ZpYdAXVdsiteGkJyM8A+zW/VXBOfyDCzbEn5DW0Wo/Ek9+ZvJ2mhpBNIuvnqDtwraY6kK9sqxzU9M8tQ8Q5vI+KoFia32idmS5z0zCxbedTiSslJz8yyIzI/kVEoJz0zy1B+7XWl5KRnZtnK8OxtezjpmVmGsr9Or1BOemaWHeHDWzOrMa7pmVnt8OGtmdUSAV18IsPMaonb9Mysdvjw1sxqjWt6ZlZTXNMzs5qRZ7dRpVRZKbgCXHnuMSy47yJmTfruqmkXnvYF5vzh+8yceA4T/+drbNht3TJGWBzTpt7DTgO3Y+D2/bnk4p+WO5yi6azbBVW8bXVd8htKFU7J1lQlbpj8Nw456derTbvvb88w6PAL+fQRF/H8glf49vEHlim64mhsbOS0MSdx++S7eeyJuUyaMJ6n584td1gd1lm3C6p529ITGfkMJeKk18yDs//B62+t3uv0fX97hsbGDwCY+ff5NGze5vOEK9ojM2eyzT
b96duvH127duXwI45kyuTbyx1Wh3XW7YIq37Yi9ZxcLE56BfryIbsz9cFq+IVds8WLF9G7d59V4w0NvVm0aFEZIyqOzrpdUMXb1tSfXgXV9HwiowBnnXAQjY0fMOGuR8odilmV8HV6VevYEZ9h2N47MvTrvyx3KB3Wq1cDCxe+tGp80aKFNDQ0lDGi4uis2wVVvm0V1p9eSVKwpFGShqevt5d0v6QHJF0u6TpJn0rnzZN0haRbJK1fitjyccAeO/CtUftz2Gm/Zfm7K8sdTocN3nVX5s17nhfmz2fFihVMmjiBg4d/vtxhdVhn3S6o8m2rsDa9ctb0JkXEWElrA+OBLwJzIuJESd8FtgceLXVQ1180ir0GbUvPjbox756f8JMr7+LbXzmQtbvWM+WKkwGY+fcXGHPBhFKHVjT19fVcevlYRhx8EI2NjYwcdTwDBg4sd1gd1lm3C6p42+TD24+IiPckvZeO7izpV8ByYE7zZSWNBkYDsFa3TOIZec51H5l2/W0PZbKuchoydBhDhg4rdxhF11m3C6p42yrs4uRSJb03gE3T15sBf2iakdb01k5HH4+IU9ZUSESMA8YB1K23WWQTqpkVk2o06U0FfitpZ6AncBtwiqRtgQ2B80sUh5mVUNJbfA0mvYh4FxjZbPItLSx3WCniMbMSkVBdDSY9M6tdlVbTq6zTKmbW6UjKa8ijnGskvSLpyZxpG0u6V9Lz6d8ebZXjpGdmmSpW0gOuA4Y0m3Y2cF9EbAvcl463yknPzLKjAoY2RMR04PVmkw8Brk9fXw98oa1y3KZnZpkRedfi2mvziFiSvn4Z2LytNzjpmVmm6uryPqDsKWlWzvi49NrcvERESGrz+l0nPTPLVAE1vaURMbjA4v8lacuIWCJpS+CVtt7gNj0zy04R2/TW4A4+vAZ4JNBmz6qu6ZlZporVpidpPLAvyWHwQuBc4KfA7yWdACwAvtRWOU56ZpaZYp7IiIij1jBrv0LKcdIzs0z5NjQzqx2qvNvQnPTMLFNOemZWU5z0zKxmlOCOjII56ZlZtior5znpmVmGVNBtaCXhpGdmmfLhrZnVlsrKeU56ZpYt1/TMrGYU0CtyyTjpmVmmnPSKZJcdtuLBh8eWO4yi+82D/yx3CFagb+7Zr9whVDTfe2tmNcU1PTOrHe5wwMxqiYAKy3lOemaWJZ+9NbMaU+cTGWZWM+TDWzOrIcI1PTOrMa7pmVlN8YkMM6sdbtMzs1oi5E5Ezay2uKZnZjXFbXpmVjvcpmdmtSS597aysl5ltTCaWacj5TfkV5ZOl/SUpCcljZe0TqHxOOmZWabq6pTX0BZJDcAYYHBE7Ah0AY4sNB4f3ppZdorfn149sK6klcB6wOJCC3BNz8wy09SfXp6Htz0lzcoZRueWFRGLgJ8DLwJLgLciYlqhMbmmZ2YZKqg/vaURMXiNJUk9gEOAvsCbwCRJx0bEjYVE5KTXhmlT7+HMb51KY2Mjo47/Kt8+6+xyh1QU5x+xN2uvtz51dV2o69KF08fdXu6QiqKzbhdU775YxKPb/YH5EfFqUq7+AOwBOOkVS2NjI6eNOYk7776Xht69+exuuzJ8+OfZYcCAcodWFCdeehPdNtq43GEUXWfcrqrdF1XUrqVeBHaTtB6wHNgPmFVoIW7Ta8UjM2eyzTb96duvH127duXwI45kyuTOU3Ow6lGt+2LTdXr5DG2JiIeBW4DZwN9J8te4QmNyTa8VixcvonfvPqvGGxp6M3Pmw2WMqHgkMe7bo5BgtxFHsfuIo8odUlF01u2q5n2xmGdvI+Jc4NyOlOGkV6NO/tVENtx0C95+Yym/PXMkm221Ddvs/Olyh9VhnXW7qlmF3ZCRbdKTNAp4HTiApKbbFbgZGEVyuvlUSZ8AHgF6AmOB94BNgSsi4t4s42tLr14NLFz40qrxRYsW0tDQUMaIimfDTbcAoHuPnnziswfy4tOPd4rk0Fm3q5r3xVq8Da0vcFdEnBwRo4
G/ptPXkbQucAxwT87yZwAnA58vQWytGrzrrsyb9zwvzJ/PihUrmDRxAgcPL3tYHfbe8v/w7n/eWfX62Vl/Ycu+Hy9zVB3XWbcLqnhfzPMavVLmxVIc3p4KrPqpjYgVaeb/A3A0sD7JNTdNfgasQ3IR4mrSixVHA/TZaqvsIk7V19dz6eVjGXHwQTQ2NjJy1PEMGDgw8/Vm7Z03lnLtD04E4IPGRj613wi2/8w+ZY6q4zrrdkH17otJJ6KVVdMrRdK7HBgETAWQ1DWd/iBwH/A94Nic5b8TEe+0VFBEjCM9WzNo0ODIKuBcQ4YOY8jQYaVYVcls0msrzrz6znKHUXSddbuaVOu+WFdhh7elSHrzgRGSRpDcIDwhZ95nI2KlpGNbfquZVbsKy3nZJr2IuC59eUezWQ80W25U+nIUZtZpqPgdDnSYL1kxs0xVWJOek56ZZasWT2SYWY0SyRncSuKkZ2aZqrCKnpOemWUoz84ESslJz8wyVWE5z0nPzLIjavPiZDOrYT57a2Y1o9SdCeTDSc/MMuXDWzOrKZWV8lpJepJ+BayxJ5OIGJNJRGbWqVTTJSsFP2XIzCxXcva23FGsbo1JLyKuL2UgZtYJqQo7EZW0KfAdYABJj8YARMR/ZRiXmXUSlXZ4m88zMm4CniZ51sWPgBdIHuRjZtaqpsPbfIZSySfpbRIRVwMrI+KBiDgecC3PzPJSrId9F0s+l6ysTP8ukXQwsBjYOLuQzKwzqayD2/yS3vmSNiR5NOOvgA2A0zONysw6BQm6VNuJjIiYkr58C/hctuGYWWdTaScy8jl7ey0tXKSctu2ZmbWqmDlP0kbAVcCOJHnp+Ih4qJAy8jm8nZLzeh3gUJJ2PTOzVgkV+97by4F7IuKw9Bna6xVaQD6Ht7fmjksaD8wodEVmVoOK2MtKem5hb9JHxUbECmBFoeW0p8OBbYHN2vG+onrv/Q+Y/8qycodRdEd9sne5Q8jMg/OXljsEK4MC2vR6Ssq9/XVcRIzLGe8LvApcK2ln4FHg1IgoKBHk06b3Nqu36b1McoeGmVmrBHTJP+ktjYjBrcyvBz4FnBIRD0u6HDgb+EEhMeVzeNu9kALNzHIV8YqVhcDCiHg4Hb+FJOkVFk9bC0i6L59pZmYtKdZtaBHxMvCSpO3SSfsBcwuNp7X+9NYhOTPSU1IPPrywegOgodAVmVntSbqLL+rZ21OAm9Izt/8EvlJoAa0d3n4dOA3oRdJg2BT5v4Gxha7IzGpTMW/IiIg5QGvtfm1qrT+9y4HLJZ0SEb/qyErMrHZV2A0ZefWy8kF6FTQAknpI+maGMZlZJyGgXsprKJV8kt7XIuLNppGIeAP4WnYhmVln0vQYyLaGUsnn4uQukhQRASCpC9A127DMrDOQin4bWoflk/TuASZK+m06/nXg7uxCMrPOpMJyXl5J7zvAaOAb6fgTwBaZRWRmnUqFdaeX1x0ZH0h6GNgG+BLQE7i19XeZmaW3oVVY1mvt4uSPA0elw1JgIkBEuCNRM8tPiR/6k4/WanrPAH8BhkfEPABJ7ibezAqiCntKRmuXrHwRWAL8WdLvJO1H5T3jw8wqWFU9AjIibouII4HtgT+T3JK2maQrJB1YqgDNrLpVTdJrEhHLIuLmiBgB9AYew/3pmVmeqvG5t6ukd2OMSwczs1Ylj4AsdxSra0938WZmeau0OzIqLAdXliWLFjLqsKEM33cQIz43mBuu+nW5QyqK008azSf69+Zzu+9S7lAy0djYyLePPJCLxny53KEU1bSp97DTwO0YuH1/Lrn4p+UOJy9VdSLDoL6+nrPOvYgp9z/KhMl/5ubrfse8554ud1gddsTRx3HTLZPLHUZm7rr5Khr6blvuMIqqsbGR08acxO2T7+axJ+YyacJ4np5bcKfBZVFpHQ446bVi0823YMAnPgnA+t2602/b7Xjl5SVljqrjdttzL3r06FHuMDLx2r8WM3vGfex36FHlDqWoHpk5k2226U
/ffv3o2rUrhx9xJFMm317usPIg6vIcSsVJL0+LXlrA008+zk67dKjTVsvYtZecy7Gnfp+6us61ay9evIjevfusGm9o6M2iRYvKGFF+hGt6VWnZsnc49WvHcM6Pfka37huUOxxbg0en38uGG/dkmwE7lTsUayKor1NeQ6n47G0bVq5cyWlfO4bhhx7BAcMOKXc41opn5sxi1gPTeGzG/7FixXssX/Y2v/zeKYy5oPqfdtCrVwMLF760anzRooU0NFT+87maanqVpGRJT9K6wKUkDxrqQfLotn7As8DawGXAz4F/AH2AH0TEE6WKryURwQ/O+Cb9+m/HqK+fUs5QLA/HjDmHY8acA8BTs/7KHf97ZadIeACDd92VefOe54X58+nV0MCkiRO47oabyx1WXirtkpWSJb2IWA58Q9K+wI7Ax4AxEfE0QPpIt2URcbKkPYADSPruW0XSaJK+/diyoQ9Zm/3IQ9xx63g+vsNADj1gdwBOO/s89tnvoMzXnaUTTziOh2ZM5/XXljJoQD/OOPsHHP3lgp+kZyVUX1/PpZePZcTBB9HY2MjIUcczYODAcoeVlwrLeWU9vP1YU8IDiIgVktaXdBmwPvDD5m+IiFV3g+y486ci6wAHfXoP5i56J+vVlNwVV99Q7hAyN3DwHgwcvEe5wyiqIUOHMWTosHKHURBReScOypn0FkjaLiKehdVqeqeVMSYzKybV8OFtCy4CLpb0bhpH52h8MbNVkjsyajzpRcT9wP3p6FebzT6spMGYWeYqK+X5khUzy1iFVfQqro3RzDqV/PrSy7c/PUldJD0maUp7I3JNz8wyk8HZ21OBp4F23xrlmp6ZZapOymtoi6TewMHAVR2JxzU9M8uOKKQr+J6SZuWMj0uvzW1yGXAW0L0jITnpmVlmCjy8XRoRLXZjJGk48EpEPJre1dVuTnpmlqkiPfRnT+DzkoYB6wAbSLoxIo4ttCC36ZlZppTn0JqIOCciekfE1sCRwP+1J+GBa3pmliEBXSrsQj0nPTPLVLFzXrO7ugrmpGdmGRKqsBvRnPTMLFMVdnTrpGdm2UkuWamsrOekZ2bZKfGTzvLhpGdmmar5/vTMrHYknYiWO4rVOemZWaZ89tbMakqFHd066ZlZtlzTM7Oa4TY9M6steXYQWkpOemaWqcpKeVWc9Naur6PvZuuXOwwrwPAde5U7hEy8sWxFuUPIxPsfRIfL8HNvzazmVFbKc9Izs6xVWNZz0jOzTPnw1sxqSmWlPCc9M8tahWU9Jz0zy0zy0J/KynpOemaWHfenZ2a1psJynpOemWVJxXrYd9E46ZlZpios5znpmVl2hA9vzazWVFjWc9Izs0z5khUzqymV1qZXV+4AzKwTS6/Ty2dosyipj6Q/S5or6SlJp7YnJNf0zCxTRTy8fR84IyJmS+oOPCrp3oiYW0ghTnpmlhlRvMPbiFgCLElfvy3paaABcNIzs8qRRZOepK2BXYCHC32vk56ZZSv/rNdT0qyc8XERMe4jxUndgFuB0yLi34WG4xMZbZg29R52GrgdA7fvzyUX/7Tc4RSNt6u6nH7SaD7Rvzef232XcodSsLr0iWhtDcDSiBicM7SU8NYiSXg3RcQf2hVPxzanc2tsbOS0MSdx++S7eeyJuUyaMJ6n5xbUfFCRvF3V54ijj+OmWyaXO4x2UZ5Dm+UkN/FeDTwdEb9obzxOeq14ZOZMttmmP3379aNr164cfsSRTJl8e7nD6jBvV/XZbc+96NGjR7nDaJ9iZT3YEzgO+C9Jc9JhWKHhuE2vFYsXL6J37z6rxhsaejNzZsHtphXH22WlUsxORCNiBkU4L+KkZ2bZcSei1aVXrwYWLnxp1fiiRQtpaGgoY0TF4e2yUqqwnJddm56kUZKGp6+3l7RS0pWS/pH+PbHZ+LGS7pY0VtLtknbKKrZ8Dd51V+bNe54X5s9nxYoVTJo4gYOHf77cYXWYt8tKJ+lENJ+hVEpZ07sgIs6TdEtEfK
NpYu64pC9ExMmS9gAOAJ4oYXwfUV9fz6WXj2XEwQfR2NjIyFHHM2DgwHKGVBTerupz4gnH8dCM6bz+2lIGDejHGWf/gKO//JVyh5WXSju8VURkU7A0CvgS8CKwEfBMTtI7LGe5VeOS7gaeBdYHfpjedpJb5mhgNECfrbYa9Nw/FmQSu1kh3li2otwhZGLIvrvz+GOPdihl7fTJQXHHnx7Ma9m+m677aEQM7sj68pF1Te83ETFF0vbAkXksvywiTlvTzPRixXEAgwYNziZbm1lxVVhNzycyzCxTNdOJaERcl/P6GeC89PVhzZY7rKXXZtY5VFqbnmt6ZpYdQZ2TnpnVlsrKek56ZpaZYnYiWixOemaWqQrLeU56ZpYt1/TMrKaU8hazfDjpmVmmKivlOemZWYbyfaZtKTnpmVmmauaODDMzoOKOb530zCxTFZbznPTMLEurHu9YMZz0zCwzlXhHhh8BaWY1xTU9M8tUpdX0nPTMLFO+ZMXMaocvTjazWlKJJzKc9MwsUz68NbOaUmk1PV+yYmaZUp5DXmVJQyQ9K2mepLPbE4+Tnpllq0hZT1IX4NfAUGAAcJSkAYWG46RnZpkRUCflNeTh08C8iPhnRKwAJgCHFBpT1bbpzZ796NJ119KCEq2uJ7C0ROsqpc66XdB5t62U2/WxjhYwe/ajU9ddSz3zXHwdSbNyxsdFxLic8QbgpZzxhcBnCo2papNeRGxaqnVJmhURg0u1vlLprNsFnXfbqm27ImJIuWNozoe3ZlYtFgF9csZ7p9MK4qRnZtXiEWBbSX0ldQWOBO4otJCqPbwtsXFtL1KVOut2Qefdts66XW2KiPclnQxMBboA10TEU4WWo4goenBmZpXKh7dmVlOc9MysprhNL4ek84DLIuJNSZcBuwC3AluSXAm+PzAQ6AaMbU97QjlIGgX8A9gHWA48BwwCukbEdyVdCKwA7geOA94CpkTE/5Ul4AJIWgs4CRgC/IVkn34LWB+4HLgTmAYsj4hflCvOQqT/rzkRMUfSfcBk4FBgCrAusAfwAPAoMBJ4DOgbESeVJ+Lq4qTXusci4peSugPfBZ4FbgLWBnYHqiLppc6MiFVXr0salP7dotlyk4GHgFOAik96EROnXDsAAAMqSURBVLESuEzSRiSJ/a2IuLtpvqTHIuICSVeVLcj2OUbSvkBjRFwmaeuIuARW/Thfkf44D4uIn0v6jaSu6Z0K1gonvdWtANZaw7ymMz5HA+8Cl5UkouJp6YzVBOD3JDWl/06njQA+BVRbkmjSfDt3kfQ9YHw5gumAm9Ka3tZtLLeppDNJaoZOeHlw0lvdbcAZkl4HHgdGSRrD6oe3N0fEnDLG2F6Xpl/+d0kObwFeBL4OLMlZbnJE3Fbq4IrkXuBkSdvz4eHtYxFxQXnDytSrEfHzcgdRTXzJipnVFJ+9NbOa4qRnZjXFSc/MaoqTnpnVFCc9W42kRklzJD0paZKk9TpQ1nWSDktfX9Va196S9pW0RzvW8YKUdyeVZk569hHLI+KTEbEjyXWL38idKaldlzlFxFcjYm4ri+xLcqeBWaac9Kw1fwH6p7Wwv0i6A5grqYukSyQ9IukJSV8HUGJs+rSqPwGbNRUk6X5Jg9PXQyTNlvS4pPvSC3C/AZye1jL3krSppFvTdTwiac/0vZtImibpqfQuiwp7wKBVOl+cbC1Ka3RDgXvSSZ8CdoyI+ZJGk9zutauktYEHJU0juVd5O5InVW0OzAWuaVbupsDvgL3TsjaOiNclXQm803ShraSbgUsjYoakrUj6UNsBOBeYERE/lnQwcEKmH4R1Ok561ty6kpruOPkLcDXJYefMiJifTj8Q2KmpvQ7YENgW2BsYHxGNwGJJLd27uxswvamsiHh9DXHsDwzQh0/J2kBSt3QdX0zfe6ekN9q5nVajnPSsueUR8cncCWniWZY7CTglIqY2W25YEeOoA3aLiHdbiMWs3dymZ+0xFTgx7dYJSR+XtD4wHT
gibfPbEvhcC+/9G7C3pL7pezdOp78NdM9ZbhpJTy+kyzUl4ukknT4gaSjQo2hbZTXBSc/a4yqS9rrZkp4Efkty1PBH4Pl03v+SdFG1moh4FRgN/EHS48DEdNZk4NCmExnAGGBweqJkLh+eRf4RSdJ8iuQw98WMttE6KXc4YGY1xTU9M6spTnpmVlOc9MyspjjpmVlNcdIzs5ripGdmNcVJz8xqyv8DKXCk5kWtliEAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x288 with 2 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "g2YAsoRWtW9S",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 87
},
"outputId": "fc6a4772-77ea-49c7-a17e-a733d330c5fe"
},
"source": [
" print(\"y pred :\" + str(y_pred))\n",
" print(\"y true : \" + str(y_true))\n",
" print(clustered_labels)"
],
"execution_count": 29,
"outputs": [
{
"output_type": "stream",
"text": [
"y pred :['TCP', 'TCP', 'TCP', 'TCP', 'TCP', 'TCP', 'TCP', 'TCP', 'TCP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'HTTP']\n",
"y true : ['TCP', 'TCP', 'TCP', 'TCP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'TCP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'TCP', 'TCP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'HTTP']\n",
"{2: ['TCP', 'TCP', 'TCP', 'TCP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'ICMP'], 1: ['TCP', 'ICMP', 'ICMP', 'ICMP', 'ICMP', 'ICMP'], 3: ['TCP', 'TCP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP'], 0: ['UDP', 'UDP', 'UDP', 'UDP', 'UDP', 'UDP'], 4: ['HTTP']}\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PgW-LgTPSCfV",
"colab_type": "text"
},
"source": [
"# Script to get Message Type Resolution\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "30_-4DxJsfys",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 403
},
"outputId": "4e8550bb-ce25-4b19-a711-b399ecc4b88f"
},
"source": [
"for cluster, msg_type in sorted(clustered_labels.items()):\n",
" values, counts = np.unique(msg_type, return_counts=True)\n",
" print(\"\\n\\nCluster {} : {} / {} \".format(cluster,len(msg_type),len(y_true)))\n",
" for i in range(len(values)):\n",
" print(\"{} : {} / {} = {:.2%}\".format(values[i],counts[i],len(msg_type),counts[i]/len(msg_type)))\n",
"\n",
"\n",
"\n"
],
"execution_count": 30,
"outputs": [
{
"output_type": "stream",
"text": [
"\n",
"\n",
"Cluster 0 : 6 / 30 \n",
"UDP : 6 / 6 = 100.00%\n",
"\n",
"\n",
"Cluster 1 : 6 / 30 \n",
"ICMP : 5 / 6 = 83.33%\n",
"TCP : 1 / 6 = 16.67%\n",
"\n",
"\n",
"Cluster 2 : 9 / 30 \n",
"ICMP : 5 / 9 = 55.56%\n",
"TCP : 4 / 9 = 44.44%\n",
"\n",
"\n",
"Cluster 3 : 8 / 30 \n",
"TCP : 2 / 8 = 25.00%\n",
"UDP : 6 / 8 = 75.00%\n",
"\n",
"\n",
"Cluster 4 : 1 / 30 \n",
"HTTP : 1 / 1 = 100.00%\n"
],
"name": "stdout"
}
]
}
]
}
@yongkangc
Copy link
Author

Message clustering (done during an internship) using LDA and K-means

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment