Last active
July 30, 2016 14:42
-
-
Save hideaki-t/81a94ef1e0895a97e7cc4fdf9250141a to your computer and use it in GitHub Desktop.
SQLite spellfix1 with Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import sqlite3" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"con = sqlite3.connect(':memory:')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(None,)]" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.enable_load_extension(True)\n", | |
"con.execute('select load_extension(\"spellfix.so\")').fetchall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[]" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.executescript('''\n", | |
"CREATE VIRTUAL TABLE demo USING spellfix1;\n", | |
"CREATE VIRTUAL TABLE words USING fts4(word);\n", | |
"''').fetchall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<sqlite3.Cursor at 0x7f30840d01f0>" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.executemany(\"INSERT INTO words VALUES(?)\",\n", | |
" [['あいうえお'], ['あえいおう'], ['かきくけこ'],\n", | |
" ['AIUEO'], ['KAKIKUKEKO'],\n", | |
" ['kennesaw'], ['kenosha'], ['kenesaw'], ['kenaga'], ['keanak']])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[]" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.execute('INSERT INTO demo(word) SELECT word FROM words').fetchall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('あいうえお', 1, 4, 0, 35, 5),\n", | |
" ('あえいおう', 1, 4, 0, 35, 5),\n", | |
" ('かきくけこ', 1, 4, 0, 35, 5),\n", | |
" ('AIUEO', 1, 107, 0, 138, 5),\n", | |
" ('keanak', 1, 198, 0, 229, 6),\n", | |
" ('kenosha', 1, 220, 0, 251, 7),\n", | |
" ('kenaga', 1, 220, 0, 251, 6),\n", | |
" ('kenesaw', 1, 245, 0, 276, 7),\n", | |
" ('kennesaw', 1, 247, 0, 278, 8),\n", | |
" ('KAKIKUKEKO', 1, 320, 0, 351, 10)]" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.execute(\"SELECT * FROM demo WHERE word MATCH 'あいう'\").fetchall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(1, 1, 0, 'あいうえお', '?????', ''),\n", | |
" (2, 1, 0, 'あえいおう', '?????', ''),\n", | |
" (3, 1, 0, 'かきくけこ', '?????', ''),\n", | |
" (4, 1, 0, 'AIUEO', 'aiueo', 'A'),\n", | |
" (5, 1, 0, 'KAKIKUKEKO', 'kakikukeko', 'CACACACACA'),\n", | |
" (6, 1, 0, 'kennesaw', 'kennesaw', 'CAMACAB'),\n", | |
" (7, 1, 0, 'kenosha', 'kenosha', 'CAMACA'),\n", | |
" (8, 1, 0, 'kenesaw', 'kenesaw', 'CAMACAB'),\n", | |
" (9, 1, 0, 'kenaga', 'kenaga', 'CAMACA'),\n", | |
" (10, 1, 0, 'keanak', 'keanak', 'CAMAC')]" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.execute('SELECT * FROM demo_vocab').fetchall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import igo\n", | |
"import csv\n", | |
"from io import StringIO\n", | |
"import unicodedata\n", | |
"tagger = igo.tagger.Tagger()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def maketrans(s):\n", | |
" # Unicode names look like 'KATAKANA LETTER KA'; the third word is the romanized syllable.\n", | |
" return str.maketrans({c:unicodedata.name(c).split()[2] for c in s})\n", | |
"\n", | |
"trans = maketrans('アイウエオカキクケコ')\n", | |
"def my_spellfix1_translit(w):\n", | |
" lines = StringIO()\n", | |
" for m in tagger.parse(w):\n", | |
" print(\"{},{}\".format(m.surface, m.feature), file=lines)\n", | |
" lines.seek(0)\n", | |
" reading = ''.join(x[8] if len(x) > 9 else x[0] for x in csv.reader(lines) if x)\n", | |
" return reading.translate(trans)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"con.create_function('spellfix1_translit', 1, my_spellfix1_translit)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[]" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.execute('UPDATE demo_vocab SET k1 = lower(spellfix1_translit(word)), k2=spellfix1_phonehash(lower(spellfix1_translit(word)))').fetchall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(1, 1, 0, 'あいうえお', 'aiueo', 'A'),\n", | |
" (2, 1, 0, 'あえいおう', 'aeiou', 'A'),\n", | |
" (3, 1, 0, 'かきくけこ', 'kakikukeko', 'CACACACACA'),\n", | |
" (4, 1, 0, 'AIUEO', 'aiueo', 'A'),\n", | |
" (5, 1, 0, 'KAKIKUKEKO', 'kakikukeko', 'CACACACACA'),\n", | |
" (6, 1, 0, 'kennesaw', 'kennesaw', 'CAMACAB'),\n", | |
" (7, 1, 0, 'kenosha', 'kenosha', 'CAMACA'),\n", | |
" (8, 1, 0, 'kenesaw', 'kenesaw', 'CAMACAB'),\n", | |
" (9, 1, 0, 'kenaga', 'kenaga', 'CAMACA'),\n", | |
" (10, 1, 0, 'keanak', 'keanak', 'CAMAC')]" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.execute('SELECT * FROM demo_vocab').fetchall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('あいうえお', 1, 87, 0, 118, 5),\n", | |
" ('あえいおう', 1, 87, 0, 118, 5),\n", | |
" ('AIUEO', 1, 87, 0, 118, 5),\n", | |
" ('keanak', 1, 178, 0, 209, 6),\n", | |
" ('kenosha', 1, 200, 0, 231, 7),\n", | |
" ('kenaga', 1, 200, 0, 231, 6),\n", | |
" ('kenesaw', 1, 225, 0, 256, 7),\n", | |
" ('kennesaw', 1, 227, 0, 258, 8),\n", | |
" ('かきくけこ', 1, 300, 0, 331, 5),\n", | |
" ('KAKIKUKEKO', 1, 300, 0, 331, 10)]" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.execute(\"SELECT * FROM demo WHERE word MATCH 'ア'\").fetchall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('あいうえお', 1, 87, 0, 118, 5),\n", | |
" ('あえいおう', 1, 87, 0, 118, 5),\n", | |
" ('AIUEO', 1, 87, 0, 118, 5),\n", | |
" ('keanak', 1, 178, 0, 209, 6),\n", | |
" ('kenosha', 1, 200, 0, 231, 7),\n", | |
" ('kenaga', 1, 200, 0, 231, 6),\n", | |
" ('kenesaw', 1, 225, 0, 256, 7),\n", | |
" ('kennesaw', 1, 227, 0, 258, 8),\n", | |
" ('かきくけこ', 1, 300, 0, 331, 5),\n", | |
" ('KAKIKUKEKO', 1, 300, 0, 331, 10)]" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.execute(\"SELECT * FROM demo WHERE word MATCH 'あ'\").fetchall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('あいうえお', 1, 12, 0, 43, 5),\n", | |
" ('あえいおう', 1, 12, 0, 43, 5),\n", | |
" ('AIUEO', 1, 12, 0, 43, 5)]" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.execute(\"SELECT * FROM demo WHERE word MATCH 'A'\").fetchall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('あえいおう', 1, 51, 0, 82, 5),\n", | |
" ('あいうえお', 1, 52, 0, 83, 5),\n", | |
" ('AIUEO', 1, 52, 0, 83, 5)]" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.execute(\"SELECT * FROM demo WHERE word MATCH 'e'\").fetchall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('keanak', 1, 158, 0, 189, 6),\n", | |
" ('kenosha', 1, 180, 0, 211, 7),\n", | |
" ('kenaga', 1, 180, 0, 211, 6),\n", | |
" ('kenesaw', 1, 205, 0, 236, 7),\n", | |
" ('kennesaw', 1, 207, 0, 238, 8),\n", | |
" ('かきくけこ', 1, 240, 0, 271, 5),\n", | |
" ('KAKIKUKEKO', 1, 240, 0, 271, 10)]" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.execute(\"SELECT * FROM demo WHERE word MATCH 'ca'\").fetchall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('keanak', 1, 117, 0, 148, 6),\n", | |
" ('kenosha', 1, 140, 0, 171, 7),\n", | |
" ('kenaga', 1, 140, 0, 171, 6),\n", | |
" ('kenesaw', 1, 165, 0, 196, 7),\n", | |
" ('kennesaw', 1, 167, 0, 198, 8),\n", | |
" ('かきくけこ', 1, 200, 0, 231, 5),\n", | |
" ('KAKIKUKEKO', 1, 200, 0, 231, 10)]" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.execute(\"SELECT * FROM demo WHERE word MATCH 'ka'\").fetchall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment