LowriWilliams · October 26, 2020 15:46
diff --git a/LDA_keywords.ipynb b/LDA_keywords.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>topic</th>\n",
       "      <th>relevance_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>01 update</th>\n",
       "      <td>4</td>\n",
       "      <td>1.198091</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>02 day</th>\n",
       "      <td>1</td>\n",
       "      <td>1.198399</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>02 update</th>\n",
       "      <td>0</td>\n",
       "      <td>1.188615</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>03 day</th>\n",
       "      <td>2</td>\n",
       "      <td>1.199999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>04 update</th>\n",
       "      <td>4</td>\n",
       "      <td>1.199999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zyx zy</th>\n",
       "      <td>0</td>\n",
       "      <td>5.199787</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zyxel forum</th>\n",
       "      <td>3</td>\n",
       "      <td>1.199998</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zz aint</th>\n",
       "      <td>3</td>\n",
       "      <td>1.199998</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zzzs rate</th>\n",
       "      <td>0</td>\n",
       "      <td>1.195343</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzi case</th>\n",
       "      <td>2</td>\n",
       "      <td>1.199992</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1358778 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      topic  relevance_score\n",
       "01 update                                 4         1.198091\n",
       "02 day                                    1         1.198399\n",
       "02 update                                 0         1.188615\n",
       "03 day                                    2         1.199999\n",
       "04 update                                 4         1.199999\n",
       "...                                     ...              ...\n",
       "zyx zy                                    0         5.199787\n",
       "zyxel forum                               3         1.199998\n",
       "zz aint                                   3         1.199998\n",
       "zzzs rate                                 0         1.195343\n",
       "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzi case      2         1.199992\n",
       "\n",
       "[1358778 rows x 2 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# index names\n",
    "docnames = ['Doc' + str(i) for i in range(len(data))]\n",
    "\n",
    "# Make the pandas dataframe\n",
    "df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=docnames)\n",
    "\n",
    "# Get dominant topic for each document\n",
    "dominant_topic = np.argmax(df_document_topic.values, axis=1)\n",
    "df_document_topic['dominant_topic'] = dominant_topic\n",
    "\n",
    "# Topic-Keyword Matrix\n",
    "df_topic_keywords = pd.DataFrame(lda_model.components_)\n",
    "\n",
    "# Assign Column and Index\n",
    "df_topic_keywords.columns = vectorizer.get_feature_names()\n",
    "df_topic_keywords.index = topicnames\n",
    "\n",
    "df_topic_no = pd.DataFrame(df_topic_keywords.idxmax())\n",
    "df_scores = pd.DataFrame(df_topic_keywords.max())\n",
    "\n",
    "tmp = pd.merge(df_topic_no, df_scores, left_index=True, right_index=True)\n",
    "tmp.columns = ['topic', 'relevance_score']\n",
    "\n",
    "display(tmp)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 44,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>topic</th>\n",
	" <th>relevance_score</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>01 update</th>\n",
	" <td>4</td>\n",
	" <td>1.198091</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>02 day</th>\n",
	" <td>1</td>\n",
	" <td>1.198399</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>02 update</th>\n",
	" <td>0</td>\n",
	" <td>1.188615</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>03 day</th>\n",
	" <td>2</td>\n",
	" <td>1.199999</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>04 update</th>\n",
	" <td>4</td>\n",
	" <td>1.199999</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>zyx zy</th>\n",
	" <td>0</td>\n",
	" <td>5.199787</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>zyxel forum</th>\n",
	" <td>3</td>\n",
	" <td>1.199998</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>zz aint</th>\n",
	" <td>3</td>\n",
	" <td>1.199998</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>zzzs rate</th>\n",
	" <td>0</td>\n",
	" <td>1.195343</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzi case</th>\n",
	" <td>2</td>\n",
	" <td>1.199992</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>1358778 rows × 2 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" topic relevance_score\n",
	"01 update 4 1.198091\n",
	"02 day 1 1.198399\n",
	"02 update 0 1.188615\n",
	"03 day 2 1.199999\n",
	"04 update 4 1.199999\n",
	"... ... ...\n",
	"zyx zy 0 5.199787\n",
	"zyxel forum 3 1.199998\n",
	"zz aint 3 1.199998\n",
	"zzzs rate 0 1.195343\n",
	"zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzi case 2 1.199992\n",
	"\n",
	"[1358778 rows x 2 columns]"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	}
	],
	"source": [
	"# index names\n",
	"docnames = ['Doc' + str(i) for i in range(len(data))]\n",
	"\n",
	"# Make the pandas dataframe\n",
	"df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=docnames)\n",
	"\n",
	"# Get dominant topic for each document\n",
	"dominant_topic = np.argmax(df_document_topic.values, axis=1)\n",
	"df_document_topic['dominant_topic'] = dominant_topic\n",
	"\n",
	"# Topic-Keyword Matrix\n",
	"df_topic_keywords = pd.DataFrame(lda_model.components_)\n",
	"\n",
	"# Assign Column and Index\n",
	"df_topic_keywords.columns = vectorizer.get_feature_names()\n",
	"df_topic_keywords.index = topicnames\n",
	"\n",
	"df_topic_no = pd.DataFrame(df_topic_keywords.idxmax())\n",
	"df_scores = pd.DataFrame(df_topic_keywords.max())\n",
	"\n",
	"tmp = pd.merge(df_topic_no, df_scores, left_index=True, right_index=True)\n",
	"tmp.columns = ['topic', 'relevance_score']\n",
	"\n",
	"display(tmp)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}