Created
July 20, 2013 20:03
-
-
Save vchahun/6046246 to your computer and use it in GitHub Desktop.
Unsupervised word classes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "word-classes" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "import sys\nsys.path.append('/home/vchahune/tools/ruslem/env/lib/python2.7/site-packages/')", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from pymorphy2.tagger import Morph\ntagger = Morph.load('/home/vchahune/tools/ruslem/dict')", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "import re\nimport io\nfrom collections import Counter", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from IPython.display import HTML", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 22 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# One empty Counter per class label 'A', 'B' and 'C'.\nconfusion = dict((label, Counter()) for label in 'ABC')", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# For every 'class:word' line of the types file, look up the tagger's class for the word\n# and count it under the class given on that line.\nwith io.open('/usr0/home/cdyer/projects/cpyp/lozenge/3-types.sorted.txt', encoding='utf8') as f:\n    for line in f:\n        # Strip only the line terminator; slicing off the last character would eat a real\n        # character when the final line has no trailing newline.\n        line = line.rstrip('\\n')\n        if not line:\n            continue  # tolerate blank lines (e.g. a trailing one)\n        # Split on the first ':' only so a ':' inside the word cannot break unpacking.\n        inferred_cls, word = line.split(':', 1)\n        # Remove spaces and the characters ^ > < before tagging; use the first parse returned.\n        true_cls = tagger.parse(re.sub('[ \\^><]', '', word))[0][1].cls\n        confusion[inferred_cls][true_cls] += 1", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# Row labels are fixed; column labels are every key seen in any of the counters, sorted.\nrows = 'ABC'\ncols = sorted(set().union(*confusion.values()))", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# Table header: an empty corner cell followed by one <th> per column label.\nheader = ''.join('<th>{}</th>'.format(label) for label in [''] + cols)\n\n# One <tr> per row label, holding the count for every column label in order.\ntable_rows = '\\n'.join(\n    '<tr><th>{}</th>{}</tr>'.format(\n        cls,\n        ''.join('<td>{}</td>'.format(confusion[cls][tag]) for tag in cols)\n    )\n    for cls in rows\n)\n\n# The HTML object is the cell's last expression, so the notebook renders it.\nHTML(\"\"\"\n<table>\n<tr>{}</tr>\n{}\n</table>\n\"\"\".format(header, table_rows))", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": "\n<table>\n<tr><th></th><th>ADJF</th><th>ADJS</th><th>ADVB</th><th>COMP</th><th>CONJ</th><th>GRND</th><th>INFN</th><th>INTJ</th><th>NOUN</th><th>NPRO</th><th>NUMR</th><th>PRCL</th><th>PRED</th><th>PREP</th><th>PRTF</th><th>PRTS</th><th>VERB</th></tr>\n<tr><th>A</th><td>13147</td><td>554</td><td>432</td><td>69</td><td>10</td><td>1027</td><td>1902</td><td>2</td><td>3761</td><td>10</td><td>11</td><td>2</td><td>4</td><td>3</td><td>7137</td><td>436</td><td>8936</td></tr>\n<tr><th>B</th><td>11101</td><td>908</td><td>333</td><td>48</td><td>7</td><td>29</td><td>12</td><td>2</td><td>18967</td><td>3</td><td>4</td><td>3</td><td>3</td><td>5</td><td>2192</td><td>1155</td><td>223</td></tr>\n<tr><th>C</th><td>7673</td><td>490</td><td>503</td><td>75</td><td>31</td><td>828</td><td>2349</td><td>6</td><td>19477</td><td>30</td><td>103</td><td>8</td><td>10</td><td>24</td><td>3908</td><td>743</td><td>7531</td></tr>\n</table>\n", | |
"output_type": "pyout", | |
"prompt_number": 25, | |
"text": "<IPython.core.display.HTML at 0x41fb390>" | |
} | |
], | |
"prompt_number": 25 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment