Created
December 27, 2018 09:41
-
-
Save yagays/824449ec689b9da7751c0738e8d57d2a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pickle\n", | |
"\n", | |
"from gensim.models import KeyedVectors\n", | |
"import marisa_trie" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"model = KeyedVectors.load(\"jawiki.all_vectors.200d.bin\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 語彙数" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1463528" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(model.vocab)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## dictでid2wordを保存する" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"id2word = {i:word for i, word in enumerate(model.vocab)}\n", | |
"word2id = {v: k for k, v in id2word.items()}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with open(\"id2word.pkl\", \"wb\") as f:\n", | |
" pickle.dump(id2word, f)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## marisa-trieでid2wordを保存する" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"trie = marisa_trie.Trie(model.vocab)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"trie.save('id2word.marisa')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# 比較" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 比較1: ファイルサイズ" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" 44M\tid2word.pkl\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!du -h id2word.pkl" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"7.8M\tid2word.marisa\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!du -h id2word.marisa" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 比較2: ファイルの読み込み速度" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"363 ms ± 3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"with open(\"id2word.pkl\", \"rb\") as f:\n", | |
" id2word2 = pickle.load(f)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1.6 ms ± 72.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"trie2 = marisa_trie.Trie()\n", | |
"trie2.load('id2word.marisa')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 比較3: id -> wordの速度" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"33 ns ± 0.788 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit id2word[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"362 ns ± 8.78 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit trie.restore_key(0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"46 ns ± 0.961 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit id2word[1463527]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2.39 µs ± 61.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit trie.restore_key(1463527)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 比較4: word -> idの速度" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"30.9 ns ± 0.788 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit word2id[\"[\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"126 ns ± 1.64 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit trie[\"[\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"44.3 ns ± 2.32 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit word2id[\"[昼夜開講]\"] # word_id: 1463527" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2.09 µs ± 145 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit trie[\"━━━━━━━━━━━━━━━━━━━━━━━━┛\"] # word_id:1463527" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment