Created
February 8, 2017 05:19
-
-
Save hiroto-takatoshi/be0ced688e10afab5834d90067b6c11d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import lda\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Parse the LDA-C formatted corpus inside a context manager so the file\n", | |
"# handle is closed as soon as the matrix has been built (the original\n", | |
"# bare open() call left the handle open for the life of the kernel).\n", | |
"with open('ap.dat') as corpus_file:\n", | |
"    X = lda.utils.ldac2dtm(corpus_file, offset=0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Read the whitespace-separated vocabulary and freeze it as a tuple.\n", | |
"with open('vocab.txt') as vocab_file:\n", | |
"    tokens = vocab_file.read().split()\n", | |
"vocab = tuple(tokens)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Configure the LDA model: 20 topics, 1500 iterations, fixed random_state.\n", | |
"model = lda.LDA(\n", | |
"    n_topics=20,\n", | |
"    n_iter=1500,\n", | |
"    random_state=1,\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"INFO:lda:n_documents: 2246\n", | |
"INFO:lda:vocab_size: 10473\n", | |
"INFO:lda:n_words: 435838\n", | |
"INFO:lda:n_topics: 20\n", | |
"INFO:lda:n_iter: 1500\n", | |
"INFO:lda:<0> log likelihood: -5476651\n", | |
"INFO:lda:<10> log likelihood: -4077671\n", | |
"INFO:lda:<20> log likelihood: -3901140\n", | |
"INFO:lda:<30> log likelihood: -3837499\n", | |
"INFO:lda:<40> log likelihood: -3801351\n", | |
"INFO:lda:<50> log likelihood: -3779830\n", | |
"INFO:lda:<60> log likelihood: -3763033\n", | |
"INFO:lda:<70> log likelihood: -3751735\n", | |
"INFO:lda:<80> log likelihood: -3742309\n", | |
"INFO:lda:<90> log likelihood: -3733977\n", | |
"INFO:lda:<100> log likelihood: -3727111\n", | |
"INFO:lda:<110> log likelihood: -3723015\n", | |
"INFO:lda:<120> log likelihood: -3718689\n", | |
"INFO:lda:<130> log likelihood: -3716380\n", | |
"INFO:lda:<140> log likelihood: -3712261\n", | |
"INFO:lda:<150> log likelihood: -3710904\n", | |
"INFO:lda:<160> log likelihood: -3707895\n", | |
"INFO:lda:<170> log likelihood: -3705384\n", | |
"INFO:lda:<180> log likelihood: -3704104\n", | |
"INFO:lda:<190> log likelihood: -3702793\n", | |
"INFO:lda:<200> log likelihood: -3701623\n", | |
"INFO:lda:<210> log likelihood: -3698983\n", | |
"INFO:lda:<220> log likelihood: -3697639\n", | |
"INFO:lda:<230> log likelihood: -3697154\n", | |
"INFO:lda:<240> log likelihood: -3694176\n", | |
"INFO:lda:<250> log likelihood: -3693392\n", | |
"INFO:lda:<260> log likelihood: -3692639\n", | |
"INFO:lda:<270> log likelihood: -3690817\n", | |
"INFO:lda:<280> log likelihood: -3690774\n", | |
"INFO:lda:<290> log likelihood: -3690850\n", | |
"INFO:lda:<300> log likelihood: -3690055\n", | |
"INFO:lda:<310> log likelihood: -3689924\n", | |
"INFO:lda:<320> log likelihood: -3688729\n", | |
"INFO:lda:<330> log likelihood: -3687494\n", | |
"INFO:lda:<340> log likelihood: -3687316\n", | |
"INFO:lda:<350> log likelihood: -3687768\n", | |
"INFO:lda:<360> log likelihood: -3686429\n", | |
"INFO:lda:<370> log likelihood: -3685422\n", | |
"INFO:lda:<380> log likelihood: -3684581\n", | |
"INFO:lda:<390> log likelihood: -3683734\n", | |
"INFO:lda:<400> log likelihood: -3683986\n", | |
"INFO:lda:<410> log likelihood: -3683651\n", | |
"INFO:lda:<420> log likelihood: -3683762\n", | |
"INFO:lda:<430> log likelihood: -3683547\n", | |
"INFO:lda:<440> log likelihood: -3682437\n", | |
"INFO:lda:<450> log likelihood: -3682080\n", | |
"INFO:lda:<460> log likelihood: -3681822\n", | |
"INFO:lda:<470> log likelihood: -3681101\n", | |
"INFO:lda:<480> log likelihood: -3682356\n", | |
"INFO:lda:<490> log likelihood: -3681683\n", | |
"INFO:lda:<500> log likelihood: -3680661\n", | |
"INFO:lda:<510> log likelihood: -3680511\n", | |
"INFO:lda:<520> log likelihood: -3680945\n", | |
"INFO:lda:<530> log likelihood: -3679211\n", | |
"INFO:lda:<540> log likelihood: -3678517\n", | |
"INFO:lda:<550> log likelihood: -3678854\n", | |
"INFO:lda:<560> log likelihood: -3677041\n", | |
"INFO:lda:<570> log likelihood: -3677213\n", | |
"INFO:lda:<580> log likelihood: -3678838\n", | |
"INFO:lda:<590> log likelihood: -3678379\n", | |
"INFO:lda:<600> log likelihood: -3677847\n", | |
"INFO:lda:<610> log likelihood: -3678484\n", | |
"INFO:lda:<620> log likelihood: -3678668\n", | |
"INFO:lda:<630> log likelihood: -3677680\n", | |
"INFO:lda:<640> log likelihood: -3676747\n", | |
"INFO:lda:<650> log likelihood: -3677636\n", | |
"INFO:lda:<660> log likelihood: -3677485\n", | |
"INFO:lda:<670> log likelihood: -3678316\n", | |
"INFO:lda:<680> log likelihood: -3678462\n", | |
"INFO:lda:<690> log likelihood: -3677225\n", | |
"INFO:lda:<700> log likelihood: -3676163\n", | |
"INFO:lda:<710> log likelihood: -3676840\n", | |
"INFO:lda:<720> log likelihood: -3676684\n", | |
"INFO:lda:<730> log likelihood: -3676653\n", | |
"INFO:lda:<740> log likelihood: -3677127\n", | |
"INFO:lda:<750> log likelihood: -3678371\n", | |
"INFO:lda:<760> log likelihood: -3676893\n", | |
"INFO:lda:<770> log likelihood: -3676615\n", | |
"INFO:lda:<780> log likelihood: -3675608\n", | |
"INFO:lda:<790> log likelihood: -3674729\n", | |
"INFO:lda:<800> log likelihood: -3675266\n", | |
"INFO:lda:<810> log likelihood: -3675882\n", | |
"INFO:lda:<820> log likelihood: -3675530\n", | |
"INFO:lda:<830> log likelihood: -3675181\n", | |
"INFO:lda:<840> log likelihood: -3675366\n", | |
"INFO:lda:<850> log likelihood: -3673611\n", | |
"INFO:lda:<860> log likelihood: -3675897\n", | |
"INFO:lda:<870> log likelihood: -3675115\n", | |
"INFO:lda:<880> log likelihood: -3674581\n", | |
"INFO:lda:<890> log likelihood: -3673408\n", | |
"INFO:lda:<900> log likelihood: -3674789\n", | |
"INFO:lda:<910> log likelihood: -3675349\n", | |
"INFO:lda:<920> log likelihood: -3674951\n", | |
"INFO:lda:<930> log likelihood: -3675805\n", | |
"INFO:lda:<940> log likelihood: -3674962\n", | |
"INFO:lda:<950> log likelihood: -3674557\n", | |
"INFO:lda:<960> log likelihood: -3674487\n", | |
"INFO:lda:<970> log likelihood: -3674774\n", | |
"INFO:lda:<980> log likelihood: -3674333\n", | |
"INFO:lda:<990> log likelihood: -3673754\n", | |
"INFO:lda:<1000> log likelihood: -3674767\n", | |
"INFO:lda:<1010> log likelihood: -3673660\n", | |
"INFO:lda:<1020> log likelihood: -3674027\n", | |
"INFO:lda:<1030> log likelihood: -3674486\n", | |
"INFO:lda:<1040> log likelihood: -3674413\n", | |
"INFO:lda:<1050> log likelihood: -3673640\n", | |
"INFO:lda:<1060> log likelihood: -3673161\n", | |
"INFO:lda:<1070> log likelihood: -3674066\n", | |
"INFO:lda:<1080> log likelihood: -3674446\n", | |
"INFO:lda:<1090> log likelihood: -3674491\n", | |
"INFO:lda:<1100> log likelihood: -3674487\n", | |
"INFO:lda:<1110> log likelihood: -3675207\n", | |
"INFO:lda:<1120> log likelihood: -3674765\n", | |
"INFO:lda:<1130> log likelihood: -3674330\n", | |
"INFO:lda:<1140> log likelihood: -3674069\n", | |
"INFO:lda:<1150> log likelihood: -3674506\n", | |
"INFO:lda:<1160> log likelihood: -3673248\n", | |
"INFO:lda:<1170> log likelihood: -3672928\n", | |
"INFO:lda:<1180> log likelihood: -3673012\n", | |
"INFO:lda:<1190> log likelihood: -3675123\n", | |
"INFO:lda:<1200> log likelihood: -3673590\n", | |
"INFO:lda:<1210> log likelihood: -3673205\n", | |
"INFO:lda:<1220> log likelihood: -3672103\n", | |
"INFO:lda:<1230> log likelihood: -3672740\n", | |
"INFO:lda:<1240> log likelihood: -3673131\n", | |
"INFO:lda:<1250> log likelihood: -3671889\n", | |
"INFO:lda:<1260> log likelihood: -3672175\n", | |
"INFO:lda:<1270> log likelihood: -3673819\n", | |
"INFO:lda:<1280> log likelihood: -3673232\n", | |
"INFO:lda:<1290> log likelihood: -3674267\n", | |
"INFO:lda:<1300> log likelihood: -3674006\n", | |
"INFO:lda:<1310> log likelihood: -3673166\n", | |
"INFO:lda:<1320> log likelihood: -3672681\n", | |
"INFO:lda:<1330> log likelihood: -3671634\n", | |
"INFO:lda:<1340> log likelihood: -3673190\n", | |
"INFO:lda:<1350> log likelihood: -3673596\n", | |
"INFO:lda:<1360> log likelihood: -3673502\n", | |
"INFO:lda:<1370> log likelihood: -3673227\n", | |
"INFO:lda:<1380> log likelihood: -3672041\n", | |
"INFO:lda:<1390> log likelihood: -3673048\n", | |
"INFO:lda:<1400> log likelihood: -3673159\n", | |
"INFO:lda:<1410> log likelihood: -3671847\n", | |
"INFO:lda:<1420> log likelihood: -3672968\n", | |
"INFO:lda:<1430> log likelihood: -3672145\n", | |
"INFO:lda:<1440> log likelihood: -3671737\n", | |
"INFO:lda:<1450> log likelihood: -3671631\n", | |
"INFO:lda:<1460> log likelihood: -3671606\n", | |
"INFO:lda:<1470> log likelihood: -3672180\n", | |
"INFO:lda:<1480> log likelihood: -3672690\n", | |
"INFO:lda:<1490> log likelihood: -3671788\n", | |
"INFO:lda:<1499> log likelihood: -3670611\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"<lda.lda.LDA at 0x2b448d0e780>" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"model.fit(X)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Topic 0: soviet gorbachev union west german east germany moscow\n", | |
"Topic 1: united states iraq foreign israel bush war countries\n", | |
"Topic 2: bush dukakis campaign president democratic republican jackson presidential\n", | |
"Topic 3: house committee congress bill senate budget tax federal\n", | |
"Topic 4: court case attorney judge charges trial federal prison\n", | |
"Topic 5: city water miles area state fire new southern\n", | |
"Topic 6: market stock dollar trading late exchange new index\n", | |
"Topic 7: air flight plane space force navy defense aircraft\n", | |
"Topic 8: school students news university new women president college\n", | |
"Topic 9: south government africa president military united rebels african\n", | |
"Topic 10: police people killed two man army three city\n", | |
"Topic 11: year new show years john film york i\n", | |
"Topic 12: children mrs family hospital ms medical i wife\n", | |
"Topic 13: oil prices cents farmers food futures trade cent\n", | |
"Topic 14: study aids system environmental computer health program new\n", | |
"Topic 15: company million new inc corp billion bank co\n", | |
"Topic 16: i people time years dont think get say\n", | |
"Topic 17: percent year million billion last sales rate increase\n", | |
"Topic 18: party government political minister opposition people elections new\n", | |
"Topic 19: workers union employees strike new labor contract jobs\n" | |
] | |
} | |
], | |
"source": [ | |
"# Print the n_top_words highest-weighted vocabulary entries of each topic.\n", | |
"# The vocabulary array is identical for every topic, so build it once\n", | |
"# instead of re-creating it on each loop iteration.\n", | |
"vocab_arr = np.array(vocab)\n", | |
"topic_word = model.topic_word_\n", | |
"n_top_words = 8\n", | |
"for i, topic_dist in enumerate(topic_word):\n", | |
"    # argsort is ascending: reverse it, then keep the first n_top_words indices.\n", | |
"    top_idx = np.argsort(topic_dist)[::-1][:n_top_words]\n", | |
"    topic_words = vocab_arr[top_idx]\n", | |
"    print('Topic {}: {}'.format(i, ' '.join(topic_words)))\n" | |
] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [conda root]", | |
"language": "python", | |
"name": "conda-root-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment