Last active
July 12, 2016 07:19
-
-
Save devashishd12/584c2cfd586f0a56c8f4a1dc38b067c3 to your computer and use it in GitHub Desktop.
Benchmark testing for coherence measures in gensim
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import re\n", | |
"\n", | |
"from sklearn.datasets import fetch_20newsgroups\n", | |
"from scipy.stats import pearsonr\n", | |
"from datetime import datetime\n", | |
"\n", | |
"from gensim.models import CoherenceModel\n", | |
"from gensim.corpora.dictionary import Dictionary" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Load the complete 20 Newsgroups corpus: subset='all' selects the training\n", | |
"# and test splits together.\n", | |
"dataset = fetch_20newsgroups(\n", | |
"    subset='all',\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# The raw posts: a list with one string per document. The Bunch returned by\n", | |
"# fetch_20newsgroups exposes its 'data' entry as an attribute as well.\n", | |
"documents = dataset.data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Turn every document into a list of lower-cased word tokens; punctuation is\n", | |
"# dropped because only runs of \\w characters are kept.\n", | |
"#\n", | |
"# Only re.UNICODE is passed. OR-ing it with re.LOCALE is contradictory\n", | |
"# (locale-dependent vs. Unicode-aware \\w) and Python 3.6+ rejects LOCALE on\n", | |
"# str patterns with a ValueError, so tokenisation must not depend on it.\n", | |
"# The pattern is compiled once instead of being looked up for every document.\n", | |
"word_pattern = re.compile(r'\\w+', flags=re.UNICODE)\n", | |
"texts = [word_pattern.findall(document.lower()) for document in documents]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Build the gensim vocabulary from the tokenised documents.\n", | |
"dictionary = Dictionary(documents=texts)\n", | |
"# Convert each tokenised document with Dictionary.doc2bow (bag-of-words form).\n", | |
"corpus = list(map(dictionary.doc2bow, texts))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"18846\n", | |
"Dictionary(173771 unique tokens: [u'3ds2scn', u'25599', u'diagnositic', u'9l2t', u'l1tbk']...)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Sanity check: number of documents, then the dictionary's own summary.\n", | |
"print(len(documents))\n", | |
"print(dictionary)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Read the topics file: each line is one topic, split on whitespace into words.\n", | |
"# Every topic is wrapped in its own one-element list because the coherence\n", | |
"# cells below pass the topics to CoherenceModel(topics=...) one at a time.\n", | |
"# The `with` block closes the file handle as soon as reading is done.\n", | |
"topics = []  # list of 100 topics\n", | |
"with open('/home/devashish/datasets/20NG/topics20NG.txt') as topics_file:\n", | |
"    for line in topics_file:\n", | |
"        topics.append([line.split()])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Read the gold file: one float per line. Later cells pair these ratings with\n", | |
"# the topics by line index, so the order of the file is significant.\n", | |
"# The `with` block closes the file handle as soon as reading is done.\n", | |
"human_scores = []\n", | |
"with open('/home/devashish/datasets/20NG/gold20NG.txt') as gold_file:\n", | |
"    for line in gold_file:\n", | |
"        human_scores.append(float(line.strip()))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0\n", | |
"1\n", | |
"2\n", | |
"3\n", | |
"4\n", | |
"5\n", | |
"6\n", | |
"7\n", | |
"8\n", | |
"9\n", | |
"10\n", | |
"11\n", | |
"12\n", | |
"13\n", | |
"14\n", | |
"15\n", | |
"16\n", | |
"17\n", | |
"18\n", | |
"19\n", | |
"20\n", | |
"21\n", | |
"22\n", | |
"23\n", | |
"24\n", | |
"25\n", | |
"26\n", | |
"27\n", | |
"28\n", | |
"29\n", | |
"30\n", | |
"31\n", | |
"32\n", | |
"33\n", | |
"34\n", | |
"35\n", | |
"36\n", | |
"37\n", | |
"38\n", | |
"39\n", | |
"40\n", | |
"41\n", | |
"42\n", | |
"43\n", | |
"44\n", | |
"45\n", | |
"46\n", | |
"47\n", | |
"48\n", | |
"49\n", | |
"50\n", | |
"51\n", | |
"52\n", | |
"53\n", | |
"54\n", | |
"55\n", | |
"56\n", | |
"57\n", | |
"58\n", | |
"59\n", | |
"60\n", | |
"61\n", | |
"62\n", | |
"63\n", | |
"64\n", | |
"65\n", | |
"66\n", | |
"67\n", | |
"68\n", | |
"69\n", | |
"70\n", | |
"71\n", | |
"72\n", | |
"73\n", | |
"74\n", | |
"75\n", | |
"76\n", | |
"77\n", | |
"78\n", | |
"79\n", | |
"80\n", | |
"81\n", | |
"82\n", | |
"83\n", | |
"84\n", | |
"85\n", | |
"86\n", | |
"87\n", | |
"88\n", | |
"89\n", | |
"90\n", | |
"91\n", | |
"92\n", | |
"93\n", | |
"94\n", | |
"95\n", | |
"96\n", | |
"97\n", | |
"98\n", | |
"99\n", | |
"Time taken: 0:06:30.513886\n" | |
] | |
} | |
], | |
"source": [ | |
"# Score every topic with the u_mass measure. The index of any topic that\n", | |
"# raises KeyError is collected in `flags` instead of receiving a score.\n", | |
"u_mass = []\n", | |
"flags = []\n", | |
"start = datetime.now()\n", | |
"for topic_id, topic in enumerate(topics):\n", | |
"    print(topic_id)  # progress indicator only\n", | |
"    try:\n", | |
"        coherence_model = CoherenceModel(\n", | |
"            topics=topic, corpus=corpus, dictionary=dictionary, coherence='u_mass'\n", | |
"        )\n", | |
"        score = coherence_model.get_coherence()\n", | |
"    except KeyError:\n", | |
"        flags.append(topic_id)\n", | |
"    else:\n", | |
"        u_mass.append(score)\n", | |
"end = datetime.now()\n", | |
"print(\"Time taken: %s\" % (end - start))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0\n", | |
"1\n", | |
"2\n", | |
"3\n", | |
"4\n", | |
"5\n", | |
"6\n", | |
"7\n", | |
"8\n", | |
"9\n", | |
"10\n", | |
"11\n", | |
"12\n", | |
"13\n", | |
"14\n", | |
"15\n", | |
"16\n", | |
"17\n", | |
"18\n", | |
"19\n", | |
"20\n", | |
"21\n", | |
"22\n", | |
"23\n", | |
"24\n", | |
"25\n", | |
"26\n", | |
"27\n", | |
"28\n", | |
"29\n", | |
"30\n", | |
"31\n", | |
"32\n", | |
"33\n", | |
"34\n", | |
"35\n", | |
"36\n", | |
"37\n", | |
"38\n", | |
"39\n", | |
"40\n", | |
"41\n", | |
"42\n", | |
"43\n", | |
"44\n", | |
"45\n", | |
"46\n", | |
"47\n", | |
"48\n", | |
"49\n", | |
"50\n", | |
"51\n", | |
"52\n", | |
"53\n", | |
"54\n", | |
"55\n", | |
"56\n", | |
"57\n", | |
"58\n", | |
"59\n", | |
"60\n", | |
"61\n", | |
"62\n", | |
"63\n", | |
"64\n", | |
"65\n", | |
"66\n", | |
"67\n", | |
"68\n", | |
"69\n", | |
"70\n", | |
"71\n", | |
"72\n", | |
"73\n", | |
"74\n", | |
"75\n", | |
"76\n", | |
"77\n", | |
"78\n", | |
"79\n", | |
"80\n", | |
"81\n", | |
"82\n", | |
"83\n", | |
"84\n", | |
"85\n", | |
"86\n", | |
"87\n", | |
"88\n", | |
"89\n", | |
"90\n", | |
"91\n", | |
"92\n", | |
"93\n", | |
"94\n", | |
"95\n", | |
"96\n", | |
"97\n", | |
"98\n", | |
"99\n", | |
"Time taken: 1:29:53.612739\n" | |
] | |
} | |
], | |
"source": [ | |
"# Score every topic with the c_v measure.\n", | |
"#\n", | |
"# Topics that raise KeyError are recorded instead of being silently dropped.\n", | |
"# The human scores are later filtered with `flags` (filled by the u_mass cell),\n", | |
"# so c_v is index-aligned with them only if it skipped exactly the same topics.\n", | |
"# A mismatch would make the correlation meaningless, so it is reported loudly.\n", | |
"start = datetime.now()\n", | |
"c_v = []\n", | |
"c_v_flags = []  # indices of topics c_v could not score\n", | |
"for n, topic in enumerate(topics):\n", | |
"    print(n)  # progress indicator only\n", | |
"    try:\n", | |
"        cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary, coherence='c_v')\n", | |
"        c_v.append(cm.get_coherence())\n", | |
"    except KeyError:\n", | |
"        c_v_flags.append(n)\n", | |
"end = datetime.now()\n", | |
"print(\"Time taken: %s\" % (end - start))\n", | |
"if c_v_flags != flags:\n", | |
"    raise ValueError(\n", | |
"        'c_v skipped topics %s but u_mass skipped %s; the c_v scores are not '\n", | |
"        'index-aligned with the filtered human scores' % (c_v_flags, flags)\n", | |
"    )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Keep, in order, the human rating of every topic whose index was not flagged.\n", | |
"final_scores = [\n", | |
"    rating for index, rating in enumerate(human_scores) if index not in flags\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"99 99 99\n" | |
] | |
} | |
], | |
"source": [ | |
"# The three lists must have the same length before they can be correlated.\n", | |
"# One topic contains word(s) missing from the dictionary, probably because of\n", | |
"# some difference in preprocessing.\n", | |
"print('%d %d %d' % (len(u_mass), len(c_v), len(final_scores)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.465570184541\n", | |
"0.45335681789\n" | |
] | |
} | |
], | |
"source": [ | |
"# Pearson correlation (first element of pearsonr's result) between each\n", | |
"# automatic measure and the human ratings: u_mass first, then c_v.\n", | |
"for measure_scores in (u_mass, c_v):\n", | |
"    correlation = pearsonr(measure_scores, final_scores)[0]\n", | |
"    print(correlation)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment