Skip to content

Instantly share code, notes, and snippets.

@hideaki-t
Last active July 30, 2016 14:42
Show Gist options
  • Save hideaki-t/81a94ef1e0895a97e7cc4fdf9250141a to your computer and use it in GitHub Desktop.
Save hideaki-t/81a94ef1e0895a97e7cc4fdf9250141a to your computer and use it in GitHub Desktop.
SQLite spellfix1 with Python
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sqlite3"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"con = sqlite3.connect(':memory:')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(None,)]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.enable_load_extension(True)\n",
"con.execute('select load_extension(\"spellfix.so\")').fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.executescript('''\n",
"CREATE VIRTUAL TABLE demo USING spellfix1;\n",
"CREATE VIRTUAL TABLE words USING fts4(word);\n",
"''').fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<sqlite3.Cursor at 0x7f30840d01f0>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.executemany(\"INSERT INTO words VALUES(?)\",\n",
" [['あいうえお'], ['あえいおう'], ['かきくけこ'],\n",
" ['AIUEO'], ['KAKIKUKEKO'],\n",
" ['kennesaw'], ['kenosha'], ['kenesaw'], ['kenaga'], ['keanak']])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute('INSERT INTO demo(word) SELECT word FROM words').fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('あいうえお', 1, 4, 0, 35, 5),\n",
" ('あえいおう', 1, 4, 0, 35, 5),\n",
" ('かきくけこ', 1, 4, 0, 35, 5),\n",
" ('AIUEO', 1, 107, 0, 138, 5),\n",
" ('keanak', 1, 198, 0, 229, 6),\n",
" ('kenosha', 1, 220, 0, 251, 7),\n",
" ('kenaga', 1, 220, 0, 251, 6),\n",
" ('kenesaw', 1, 245, 0, 276, 7),\n",
" ('kennesaw', 1, 247, 0, 278, 8),\n",
" ('KAKIKUKEKO', 1, 320, 0, 351, 10)]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute(\"SELECT * FROM demo WHERE word MATCH 'あいう'\").fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 1, 0, 'あいうえお', '?????', ''),\n",
" (2, 1, 0, 'あえいおう', '?????', ''),\n",
" (3, 1, 0, 'かきくけこ', '?????', ''),\n",
" (4, 1, 0, 'AIUEO', 'aiueo', 'A'),\n",
" (5, 1, 0, 'KAKIKUKEKO', 'kakikukeko', 'CACACACACA'),\n",
" (6, 1, 0, 'kennesaw', 'kennesaw', 'CAMACAB'),\n",
" (7, 1, 0, 'kenosha', 'kenosha', 'CAMACA'),\n",
" (8, 1, 0, 'kenesaw', 'kenesaw', 'CAMACAB'),\n",
" (9, 1, 0, 'kenaga', 'kenaga', 'CAMACA'),\n",
" (10, 1, 0, 'keanak', 'keanak', 'CAMAC')]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute('SELECT * FROM demo_vocab').fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import igo\n",
"import csv\n",
"from io import StringIO\n",
"import unicodedata\n",
"tagger = igo.tagger.Tagger()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def maketrans(s):\n",
" # KATAKANA LETTER [A]\n",
" return str.maketrans({c:unicodedata.name(c).split()[2] for c in s})\n",
"\n",
"trans = maketrans('アイウエオカキクケコ')\n",
"def my_spellfix1_translit(w):\n",
" lines = StringIO()\n",
" for m in tagger.parse(w):\n",
" print(\"{},{}\".format(m.surface, m.feature), file=lines)\n",
" lines.seek(0)\n",
" reading = ''.join(x[8] if len(x) > 9 else x[0] for x in csv.reader(lines) if x)\n",
" return reading.translate(trans)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"con.create_function('spellfix1_translit', 1, my_spellfix1_translit)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute('UPDATE demo_vocab SET k1 = lower(spellfix1_translit(word)), k2=spellfix1_phonehash(lower(spellfix1_translit(word)))').fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 1, 0, 'あいうえお', 'aiueo', 'A'),\n",
" (2, 1, 0, 'あえいおう', 'aeiou', 'A'),\n",
" (3, 1, 0, 'かきくけこ', 'kakikukeko', 'CACACACACA'),\n",
" (4, 1, 0, 'AIUEO', 'aiueo', 'A'),\n",
" (5, 1, 0, 'KAKIKUKEKO', 'kakikukeko', 'CACACACACA'),\n",
" (6, 1, 0, 'kennesaw', 'kennesaw', 'CAMACAB'),\n",
" (7, 1, 0, 'kenosha', 'kenosha', 'CAMACA'),\n",
" (8, 1, 0, 'kenesaw', 'kenesaw', 'CAMACAB'),\n",
" (9, 1, 0, 'kenaga', 'kenaga', 'CAMACA'),\n",
" (10, 1, 0, 'keanak', 'keanak', 'CAMAC')]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute('SELECT * FROM demo_vocab').fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('あいうえお', 1, 87, 0, 118, 5),\n",
" ('あえいおう', 1, 87, 0, 118, 5),\n",
" ('AIUEO', 1, 87, 0, 118, 5),\n",
" ('keanak', 1, 178, 0, 209, 6),\n",
" ('kenosha', 1, 200, 0, 231, 7),\n",
" ('kenaga', 1, 200, 0, 231, 6),\n",
" ('kenesaw', 1, 225, 0, 256, 7),\n",
" ('kennesaw', 1, 227, 0, 258, 8),\n",
" ('かきくけこ', 1, 300, 0, 331, 5),\n",
" ('KAKIKUKEKO', 1, 300, 0, 331, 10)]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute(\"SELECT * FROM demo WHERE word MATCH 'ア'\").fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('あいうえお', 1, 87, 0, 118, 5),\n",
" ('あえいおう', 1, 87, 0, 118, 5),\n",
" ('AIUEO', 1, 87, 0, 118, 5),\n",
" ('keanak', 1, 178, 0, 209, 6),\n",
" ('kenosha', 1, 200, 0, 231, 7),\n",
" ('kenaga', 1, 200, 0, 231, 6),\n",
" ('kenesaw', 1, 225, 0, 256, 7),\n",
" ('kennesaw', 1, 227, 0, 258, 8),\n",
" ('かきくけこ', 1, 300, 0, 331, 5),\n",
" ('KAKIKUKEKO', 1, 300, 0, 331, 10)]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute(\"SELECT * FROM demo WHERE word MATCH 'あ'\").fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('あいうえお', 1, 12, 0, 43, 5),\n",
" ('あえいおう', 1, 12, 0, 43, 5),\n",
" ('AIUEO', 1, 12, 0, 43, 5)]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute(\"SELECT * FROM demo WHERE word MATCH 'A'\").fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('あえいおう', 1, 51, 0, 82, 5),\n",
" ('あいうえお', 1, 52, 0, 83, 5),\n",
" ('AIUEO', 1, 52, 0, 83, 5)]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute(\"SELECT * FROM demo WHERE word MATCH 'e'\").fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('keanak', 1, 158, 0, 189, 6),\n",
" ('kenosha', 1, 180, 0, 211, 7),\n",
" ('kenaga', 1, 180, 0, 211, 6),\n",
" ('kenesaw', 1, 205, 0, 236, 7),\n",
" ('kennesaw', 1, 207, 0, 238, 8),\n",
" ('かきくけこ', 1, 240, 0, 271, 5),\n",
" ('KAKIKUKEKO', 1, 240, 0, 271, 10)]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute(\"SELECT * FROM demo WHERE word MATCH 'ca'\").fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('keanak', 1, 117, 0, 148, 6),\n",
" ('kenosha', 1, 140, 0, 171, 7),\n",
" ('kenaga', 1, 140, 0, 171, 6),\n",
" ('kenesaw', 1, 165, 0, 196, 7),\n",
" ('kennesaw', 1, 167, 0, 198, 8),\n",
" ('かきくけこ', 1, 200, 0, 231, 5),\n",
" ('KAKIKUKEKO', 1, 200, 0, 231, 10)]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute(\"SELECT * FROM demo WHERE word MATCH 'ka'\").fetchall()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment