Created
April 18, 2015 01:52
-
-
Save xccds/f2c870f08fe41e0fe164 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "kernelspec": {"display_name": "Python 2", "name": "python2", "language": "python"},
  "language_info": {"mimetype": "text/x-python", "nbconvert_exporter": "python", "version": "2.7.9", "name": "python", "file_extension": ".py", "pygments_lexer": "ipython2", "codemirror_mode": {"version": 2, "name": "ipython"}}
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Topic modelling of book titles with NMF\n",
    "\n",
    "Reads the titles in `txtdm.txt`, lower-cases them, strips punctuation and stop words, builds a bag-of-words count matrix with `CountVectorizer`, and factorises that matrix into `n_topics` topics with `NMF`. The last cell prints the highest-weighted words of each topic."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {"collapsed": false, "trusted": true},
   "outputs": [],
   "source": [
    "# print_function makes a bare `print()` emit a blank line on the Python 2\n",
    "# kernel instead of printing the empty tuple `()`.\n",
    "from __future__ import print_function\n",
    "\n",
    "import re\n",
    "\n",
    "import numpy as np\n",
    "from sklearn.decomposition import NMF\n",
    "from sklearn.feature_extraction.text import CountVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {"collapsed": false, "trusted": true},
   "outputs": [],
   "source": [
    "# Regex alternation of the punctuation characters deleted from every token.\n",
    "ignore = \",|:|!|'\"\n",
    "# Lower-cased words dropped from the titles before vectorising.\n",
    "stopwords = ['and','edition','for','in','little','of','the','to']\n",
    "\n",
    "n_topics = 2     # number of NMF components\n",
    "n_top_words = 5  # words printed per topic"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the titles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {"collapsed": false, "trusted": true},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "The Neatest Little Guide to Stock Market Investing\r\nInvesting For Dummies, 4th Edition\r\nThe Little Book of Common Sense Investing: The Only Way to Guarantee Your Fair Share of Stock Market Returns\r\nThe Little Book of Value Investing\r\nValue Investing: From Graham to Buffett and Beyond\r\nRich Dad's Guide to Investing: What the Rich Invest in, That the Poor and the Middle Class Do Not!\r\nInvesting in Real Estate, 5th Edition\r\nStock Investing For Dummies\r\nRich Dad's Advisors: The ABC's of Real Estate Investing: The Secrets of Finding Hidden Profits Most Investors Miss\r\n"
    }
   ],
   "source": [
    "!head -n 10 txtdm.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {"collapsed": false, "trusted": true},
   "outputs": [],
   "source": [
    "# Split every line (one title per line) on whitespace; the context manager\n",
    "# closes the file as soon as the titles are read.\n",
    "with open('txtdm.txt') as title_file:\n",
    "    title_tokens = [line.split() for line in title_file]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clean the tokens\n",
    "\n",
    "Each step produces a new variable from the previous one, so any cell can be re-run without corrupting its own input."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {"collapsed": false, "trusted": true},
   "outputs": [],
   "source": [
    "# Lower-case every token and delete the characters matched by `ignore`.\n",
    "clean_tokens = [[re.sub(ignore, '', word.lower()) for word in title]\n",
    "                for title in title_tokens]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {"collapsed": false, "trusted": true},
   "outputs": [],
   "source": [
    "kept_tokens = [[word for word in title if word not in stopwords]\n",
    "               for title in clean_tokens]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {"collapsed": false, "trusted": true},
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 7,
     "metadata": {},
     "data": {
      "text/plain": "['neatest guide stock market investing',\n 'investing dummies 4th',\n 'book common sense investing only way guarantee your fair share stock market returns',\n 'book value investing',\n 'value investing from graham buffett beyond',\n 'rich dads guide investing what rich invest that poor middle class do not',\n 'investing real estate 5th',\n 'stock investing dummies',\n 'rich dads advisors abcs real estate investing secrets finding hidden profits most investors miss']"
     }
    }
   ],
   "source": [
    "# One space-joined string per title: the documents handed to CountVectorizer.\n",
    "txt = [' '.join(title) for title in kept_tokens]\n",
    "txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Bag-of-words matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {"collapsed": false, "trusted": true},
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 8,
     "metadata": {},
     "data": {
      "text/plain": "<9x44 sparse matrix of type '<type 'numpy.int64'>'\n\twith 63 stored elements in Compressed Sparse Row format>"
     }
    }
   ],
   "source": [
    "model = CountVectorizer()\n",
    "xvec = model.fit_transform(txt)\n",
    "xvec"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## NMF topics\n",
    "\n",
    "`fit_transform` returns one row per title and one column per topic; `components_` holds one row per topic and one column per vocabulary word."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {"collapsed": false, "trusted": true},
   "outputs": [],
   "source": [
    "# NOTE(review): `sparseness` is accepted by the scikit-learn release this\n",
    "# notebook was run with; newer releases appear to have dropped it -- confirm\n",
    "# the installed version before re-running.\n",
    "nmf = NMF(n_components=n_topics,\n",
    "          sparseness='data', init='nndsvd', random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {"collapsed": false, "trusted": true},
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 10,
     "metadata": {},
     "data": {
      "text/plain": "array([[ 0.06206478,  0.32759923],\n       [ 0.07006666,  0.14309777],\n       [-0.        ,  0.91268535],\n       [ 0.04919433,  0.21403733],\n       [ 0.06947109,  0.18135713],\n       [ 0.80019019, -0.        ],\n       [ 0.1820973 ,  0.08738539],\n       [ 0.04344757,  0.23748179],\n       [ 0.74776168, -0.        ]])"
     }
    }
   ],
   "source": [
    "nmf.fit_transform(xvec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {"collapsed": false, "trusted": true},
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 11,
     "metadata": {},
     "data": {
      "text/plain": "array([[ 0.05,  0.13,  0.55,  0.55,  0.04,  0.  ,  0.04,  0.59,  0.  ,\n         1.15,  0.59,  0.07,  0.69,  0.  ,  0.55,  0.04,  0.04,  0.  ,\n         0.62,  0.55,  0.59,  1.4 ,  0.55,  0.  ,  0.59,  0.55,  0.55,\n         0.03,  0.59,  0.  ,  0.59,  0.55,  0.69,  0.  ,  1.74,  0.55,\n         0.  ,  0.  ,  0.01,  0.59,  0.07,  0.  ,  0.59,  0.  ],\n       [ 0.12,  0.06,  0.  ,  0.  ,  0.15,  0.94,  0.15,  0.  ,  0.76,\n         0.  ,  0.  ,  0.31,  0.03,  0.76,  0.  ,  0.15,  0.15,  0.76,\n         0.23,  0.  ,  0.  ,  1.66,  0.  ,  1.03,  0.  ,  0.  ,  0.  ,\n         0.27,  0.  ,  0.76,  0.  ,  0.  ,  0.03,  0.76,  0.  ,  0.  ,\n         0.76,  0.76,  1.23,  0.  ,  0.32,  0.76,  0.  ,  0.76]])"
     }
    }
   ],
   "source": [
    "np.round(nmf.components_, 2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Top words per topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {"collapsed": false, "trusted": true},
   "outputs": [],
   "source": [
    "feature_names = model.get_feature_names()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {"collapsed": false, "trusted": true},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "Topic #0:\nrich investing dads estate real\n\nTopic #1:\ninvesting stock market book returns\n\n"
    }
   ],
   "source": [
    "for topic_idx, topic in enumerate(nmf.components_):\n",
    "    # argsort is ascending, so take the last n_top_words indices in reverse\n",
    "    # to list the heaviest words first.\n",
    "    top_idx = topic.argsort()[:-n_top_words - 1:-1]\n",
    "    print(\"Topic #%d:\" % topic_idx)\n",
    "    print(\" \".join(feature_names[i] for i in top_idx))\n",
    "    print()"
   ]
  }
 ]
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment