Created
October 26, 2020 15:46
-
-
Save LowriWilliams/10af096be994221dd87ca1c0cd0477bc to your computer and use it in GitHub Desktop.
aspect_sa/LDA_keywords
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 44, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>topic</th>\n", | |
| " <th>relevance_score</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>01 update</th>\n", | |
| " <td>4</td>\n", | |
| " <td>1.198091</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>02 day</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.198399</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>02 update</th>\n", | |
| " <td>0</td>\n", | |
| " <td>1.188615</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>03 day</th>\n", | |
| " <td>2</td>\n", | |
| " <td>1.199999</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>04 update</th>\n", | |
| " <td>4</td>\n", | |
| " <td>1.199999</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zyx zy</th>\n", | |
| " <td>0</td>\n", | |
| " <td>5.199787</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zyxel forum</th>\n", | |
| " <td>3</td>\n", | |
| " <td>1.199998</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zz aint</th>\n", | |
| " <td>3</td>\n", | |
| " <td>1.199998</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zzzs rate</th>\n", | |
| " <td>0</td>\n", | |
| " <td>1.195343</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzi case</th>\n", | |
| " <td>2</td>\n", | |
| " <td>1.199992</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>1358778 rows × 2 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " topic relevance_score\n", | |
| "01 update 4 1.198091\n", | |
| "02 day 1 1.198399\n", | |
| "02 update 0 1.188615\n", | |
| "03 day 2 1.199999\n", | |
| "04 update 4 1.199999\n", | |
| "... ... ...\n", | |
| "zyx zy 0 5.199787\n", | |
| "zyxel forum 3 1.199998\n", | |
| "zz aint 3 1.199998\n", | |
| "zzzs rate 0 1.195343\n", | |
| "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzi case 2 1.199992\n", | |
| "\n", | |
| "[1358778 rows x 2 columns]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "# index names\n", | |
| "docnames = ['Doc' + str(i) for i in range(len(data))]\n", | |
| "\n", | |
| "# Make the pandas dataframe\n", | |
| "df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=docnames)\n", | |
| "\n", | |
| "# Get dominant topic for each document\n", | |
| "dominant_topic = np.argmax(df_document_topic.values, axis=1)\n", | |
| "df_document_topic['dominant_topic'] = dominant_topic\n", | |
| "\n", | |
| "# Topic-Keyword Matrix\n", | |
| "df_topic_keywords = pd.DataFrame(lda_model.components_)\n", | |
| "\n", | |
| "# Assign Column and Index\n", | |
| "df_topic_keywords.columns = vectorizer.get_feature_names()\n", | |
| "df_topic_keywords.index = topicnames\n", | |
| "\n", | |
| "df_topic_no = pd.DataFrame(df_topic_keywords.idxmax())\n", | |
| "df_scores = pd.DataFrame(df_topic_keywords.max())\n", | |
| "\n", | |
| "tmp = pd.merge(df_topic_no, df_scores, left_index=True, right_index=True)\n", | |
| "tmp.columns = ['topic', 'relevance_score']\n", | |
| "\n", | |
| "display(tmp)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment