Last active
November 20, 2019 09:17
-
-
Save vochicong/e9ec244e7b4caa4fdc27f5afe231c814 to your computer and use it in GitHub Desktop.
Japanese NLP with janome/spaCy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# 日本語NLP@janome/spaCy/Python\n", | |
"\n", | |
"## Installation\n", | |
"\n", | |
"```bash:install-japanese-nlp-spacy.sh\n", | |
"virtualenv .env\n", | |
"source .env/bin/activate\n", | |
"pip install -U janome jupyter \n", | |
"pip install -U Cython git+https://github.com/explosion/spaCy\n", | |
"pip freeze > .env/requirements.txt\n", | |
"```\n", | |
"\n", | |
"`pip install spacy`で入るリリース版は少し古く、日本語サポート`spacy.ja`がまだ含まれていなかったので、上記のように`pip install -U Cython git+https://github.com/explosion/spaCy`でGitHubの最新版を取得してビルドしましょう。Cloud9ではビルドに約30分かかりました。\n", | |
"\n", | |
"## Run Jupyter Notebook on Cloud9 IDE\n", | |
"\n", | |
"spaCyが無事インストールできたら、Jupyter Notebookを起動しましょう。\n", | |
"Cloud9の場合は、環境変数`$PORT`と`$IP`を指定して次のように起動します:\n", | |
"\n", | |
"```sh:start-jupyter-c9.sh\n", | |
"jupyter notebook --port $PORT --ip $IP --no-browser\n", | |
"```\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from janome.tokenizer import Tokenizer\n", | |
"tokenizer = Tokenizer()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def tokenize(text):\n", | |
" for token in tokenizer.tokenize(text):\n", | |
" print(token)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"text1 = 'すもももももももものうち。'\n", | |
"text2 = '庭には鶏が2羽いる。'\n", | |
"text3 = 'にわにはニワトリがにわいる。'\n", | |
"texts = text1 + text2 + text3" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"すもも\t名詞,一般,*,*,*,*,すもも,スモモ,スモモ\n", | |
"も\t助詞,係助詞,*,*,*,*,も,モ,モ\n", | |
"もも\t名詞,一般,*,*,*,*,もも,モモ,モモ\n", | |
"も\t助詞,係助詞,*,*,*,*,も,モ,モ\n", | |
"もも\t名詞,一般,*,*,*,*,もも,モモ,モモ\n", | |
"の\t助詞,連体化,*,*,*,*,の,ノ,ノ\n", | |
"うち\t名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ\n", | |
"。\t記号,句点,*,*,*,*,。,。,。\n" | |
] | |
} | |
], | |
"source": [ | |
"tokenize(text1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"庭\t名詞,一般,*,*,*,*,庭,ニワ,ニワ\n", | |
"に\t助詞,格助詞,一般,*,*,*,に,ニ,ニ\n", | |
"は\t助詞,係助詞,*,*,*,*,は,ハ,ワ\n", | |
"鶏\t名詞,一般,*,*,*,*,鶏,ニワトリ,ニワトリ\n", | |
"が\t助詞,格助詞,一般,*,*,*,が,ガ,ガ\n", | |
"2\t名詞,数,*,*,*,*,*,*,*\n", | |
"羽\t名詞,接尾,助数詞,*,*,*,羽,ワ,ワ\n", | |
"いる\t動詞,自立,*,*,一段,基本形,いる,イル,イル\n", | |
"。\t記号,句点,*,*,*,*,。,。,。\n" | |
] | |
} | |
], | |
"source": [ | |
"tokenize(text2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"に\t助詞,格助詞,一般,*,*,*,に,ニ,ニ\n", | |
"わに\t名詞,一般,*,*,*,*,わに,ワニ,ワニ\n", | |
"は\t助詞,係助詞,*,*,*,*,は,ハ,ワ\n", | |
"ニワトリ\t名詞,一般,*,*,*,*,ニワトリ,ニワトリ,ニワトリ\n", | |
"が\t助詞,格助詞,一般,*,*,*,が,ガ,ガ\n", | |
"に\t助詞,格助詞,一般,*,*,*,に,ニ,ニ\n", | |
"わい\t動詞,自立,*,*,五段・カ行イ音便,連用タ接続,わく,ワイ,ワイ\n", | |
"る\t動詞,非自立,*,*,一段,基本形,る,ル,ル\n", | |
"。\t記号,句点,*,*,*,*,。,。,。\n" | |
] | |
} | |
], | |
"source": [ | |
"tokenize(text3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"に\t助詞,格助詞,一般,*,*,*,に,ニ,ニ\n", | |
"わに\t名詞,一般,*,*,*,*,わに,ワニ,ワニ\n", | |
"はにわ\t名詞,一般,*,*,*,*,はにわ,ハニワ,ハニワ\n", | |
"にわとり\t名詞,一般,*,*,*,*,にわとり,ニワトリ,ニワトリ\n", | |
"が\t助詞,格助詞,一般,*,*,*,が,ガ,ガ\n", | |
"いる\t動詞,自立,*,*,一段,基本形,いる,イル,イル\n" | |
] | |
} | |
], | |
"source": [ | |
"tokenize('にわにはにわにわとりがいる')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from spacy.ja import Japanese\n", | |
"parser = Japanese()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def print_token(token):\n", | |
" print(\"==========================\")\n", | |
" print(\"value:\",token.orth_)\n", | |
" print(\"lemma:\",token.lemma_) # lemma is the root of a word\n", | |
" print(\"shape:\",token.shape_) # shape is capitalization and punctuation\n", | |
"\n", | |
"def spacy_parse(text):\n", | |
" tokens = parser(text)\n", | |
" tokens_orth = [token.orth_ for token in tokens]\n", | |
" print(tokens_orth)\n", | |
" for token in tokens:\n", | |
" print_token(token)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち', '。']\n", | |
"==========================\n", | |
"value: すもも\n", | |
"lemma: \n", | |
"shape: xxx\n", | |
"==========================\n", | |
"value: も\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: もも\n", | |
"lemma: \n", | |
"shape: xx\n", | |
"==========================\n", | |
"value: も\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: もも\n", | |
"lemma: \n", | |
"shape: xx\n", | |
"==========================\n", | |
"value: の\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: うち\n", | |
"lemma: \n", | |
"shape: xx\n", | |
"==========================\n", | |
"value: 。\n", | |
"lemma: \n", | |
"shape: 。\n" | |
] | |
} | |
], | |
"source": [ | |
"spacy_parse(text1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['庭', 'に', 'は', '鶏', 'が', '2', '羽', 'いる', '。']\n", | |
"==========================\n", | |
"value: 庭\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: に\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: は\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: 鶏\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: が\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: 2\n", | |
"lemma: \n", | |
"shape: d\n", | |
"==========================\n", | |
"value: 羽\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: いる\n", | |
"lemma: \n", | |
"shape: xx\n", | |
"==========================\n", | |
"value: 。\n", | |
"lemma: \n", | |
"shape: 。\n" | |
] | |
} | |
], | |
"source": [ | |
"spacy_parse(text2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['に', 'わに', 'は', 'ニワトリ', 'が', 'に', 'わい', 'る', '。']\n", | |
"==========================\n", | |
"value: に\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: わに\n", | |
"lemma: \n", | |
"shape: xx\n", | |
"==========================\n", | |
"value: は\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: ニワトリ\n", | |
"lemma: \n", | |
"shape: xxxx\n", | |
"==========================\n", | |
"value: が\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: に\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: わい\n", | |
"lemma: \n", | |
"shape: xx\n", | |
"==========================\n", | |
"value: る\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: 。\n", | |
"lemma: \n", | |
"shape: 。\n" | |
] | |
} | |
], | |
"source": [ | |
"spacy_parse(text3) #イマイチ" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち', '。', '庭', 'に', 'は', '鶏', 'が', '2', '羽', 'いる', '。', 'に', 'わに', 'は', 'ニワトリ', 'が', 'に', 'わい', 'る', '。']\n", | |
"==========================\n", | |
"value: すもも\n", | |
"lemma: \n", | |
"shape: xxx\n", | |
"==========================\n", | |
"value: も\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: もも\n", | |
"lemma: \n", | |
"shape: xx\n", | |
"==========================\n", | |
"value: も\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: もも\n", | |
"lemma: \n", | |
"shape: xx\n", | |
"==========================\n", | |
"value: の\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: うち\n", | |
"lemma: \n", | |
"shape: xx\n", | |
"==========================\n", | |
"value: 。\n", | |
"lemma: \n", | |
"shape: 。\n", | |
"==========================\n", | |
"value: 庭\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: に\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: は\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: 鶏\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: が\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: 2\n", | |
"lemma: \n", | |
"shape: d\n", | |
"==========================\n", | |
"value: 羽\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: いる\n", | |
"lemma: \n", | |
"shape: xx\n", | |
"==========================\n", | |
"value: 。\n", | |
"lemma: \n", | |
"shape: 。\n", | |
"==========================\n", | |
"value: に\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: わに\n", | |
"lemma: \n", | |
"shape: xx\n", | |
"==========================\n", | |
"value: は\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: ニワトリ\n", | |
"lemma: \n", | |
"shape: xxxx\n", | |
"==========================\n", | |
"value: が\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: に\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: わい\n", | |
"lemma: \n", | |
"shape: xx\n", | |
"==========================\n", | |
"value: る\n", | |
"lemma: \n", | |
"shape: x\n", | |
"==========================\n", | |
"value: 。\n", | |
"lemma: \n", | |
"shape: 。\n" | |
] | |
} | |
], | |
"source": [ | |
"spacy_parse(texts)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## NLTK?\n", | |
"\n", | |
"NLTKも有力な選択肢のようですが、まだ試していません。インストールは次のとおりです:\n", | |
"\n", | |
"```sh:install-nlp-nltk-python.sh\n", | |
"pip install -U nltk requests\n", | |
"```\n", | |
"\n", | |
"## Links\n", | |
"\n", | |
"- [explosion/spaCy, NLP with Python and Cython](https://github.com/explosion/spaCy)\n", | |
"- [mocobeta/janome, Japanese morphological analysis engine](https://github.com/mocobeta/janome)\n", | |
"- [nbviewer, a simple way to share Jupyter Notebooks](https://nbviewer.jupyter.org/)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.4.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
jupyter notebook --port $PORT --ip $IP --no-browser |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment