Last active
November 20, 2019 09:17
-
-
Save vochicong/e9ec244e7b4caa4fdc27f5afe231c814 to your computer and use it in GitHub Desktop.
Japanese NLP with janome/spaCy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import spacy" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'a',\n", | |
" 'about',\n", | |
" 'above',\n", | |
" 'across',\n", | |
" 'after',\n", | |
" 'afterwards',\n", | |
" 'again',\n", | |
" 'against',\n", | |
" 'all',\n", | |
" 'almost',\n", | |
" 'alone',\n", | |
" 'along',\n", | |
" 'already',\n", | |
" 'also',\n", | |
" 'although',\n", | |
" 'always',\n", | |
" 'am',\n", | |
" 'among',\n", | |
" 'amongst',\n", | |
" 'amount',\n", | |
" 'an',\n", | |
" 'and',\n", | |
" 'another',\n", | |
" 'any',\n", | |
" 'anyhow',\n", | |
" 'anyone',\n", | |
" 'anything',\n", | |
" 'anyway',\n", | |
" 'anywhere',\n", | |
" 'are',\n", | |
" 'around',\n", | |
" 'as',\n", | |
" 'at',\n", | |
" 'back',\n", | |
" 'be',\n", | |
" 'became',\n", | |
" 'because',\n", | |
" 'become',\n", | |
" 'becomes',\n", | |
" 'becoming',\n", | |
" 'been',\n", | |
" 'before',\n", | |
" 'beforehand',\n", | |
" 'behind',\n", | |
" 'being',\n", | |
" 'below',\n", | |
" 'beside',\n", | |
" 'besides',\n", | |
" 'between',\n", | |
" 'beyond',\n", | |
" 'both',\n", | |
" 'bottom',\n", | |
" 'but',\n", | |
" 'by',\n", | |
" 'ca',\n", | |
" 'call',\n", | |
" 'can',\n", | |
" 'cannot',\n", | |
" 'could',\n", | |
" 'did',\n", | |
" 'do',\n", | |
" 'does',\n", | |
" 'doing',\n", | |
" 'done',\n", | |
" 'down',\n", | |
" 'due',\n", | |
" 'during',\n", | |
" 'each',\n", | |
" 'eight',\n", | |
" 'either',\n", | |
" 'eleven',\n", | |
" 'else',\n", | |
" 'elsewhere',\n", | |
" 'empty',\n", | |
" 'enough',\n", | |
" 'etc',\n", | |
" 'even',\n", | |
" 'ever',\n", | |
" 'every',\n", | |
" 'everyone',\n", | |
" 'everything',\n", | |
" 'everywhere',\n", | |
" 'except',\n", | |
" 'few',\n", | |
" 'fifteen',\n", | |
" 'fifty',\n", | |
" 'first',\n", | |
" 'five',\n", | |
" 'for',\n", | |
" 'former',\n", | |
" 'formerly',\n", | |
" 'forty',\n", | |
" 'four',\n", | |
" 'from',\n", | |
" 'front',\n", | |
" 'full',\n", | |
" 'further',\n", | |
" 'get',\n", | |
" 'give',\n", | |
" 'go',\n", | |
" 'had',\n", | |
" 'has',\n", | |
" 'have',\n", | |
" 'he',\n", | |
" 'hence',\n", | |
" 'her',\n", | |
" 'here',\n", | |
" 'hereafter',\n", | |
" 'hereby',\n", | |
" 'herein',\n", | |
" 'hereupon',\n", | |
" 'hers',\n", | |
" 'herself',\n", | |
" 'him',\n", | |
" 'himself',\n", | |
" 'his',\n", | |
" 'how',\n", | |
" 'however',\n", | |
" 'hundred',\n", | |
" 'i',\n", | |
" 'if',\n", | |
" 'in',\n", | |
" 'inc',\n", | |
" 'indeed',\n", | |
" 'into',\n", | |
" 'is',\n", | |
" 'it',\n", | |
" 'its',\n", | |
" 'itself',\n", | |
" 'just',\n", | |
" 'keep',\n", | |
" 'last',\n", | |
" 'latter',\n", | |
" 'latterly',\n", | |
" 'least',\n", | |
" 'less',\n", | |
" 'made',\n", | |
" 'make',\n", | |
" 'many',\n", | |
" 'may',\n", | |
" 'me',\n", | |
" 'meanwhile',\n", | |
" 'might',\n", | |
" 'mine',\n", | |
" 'more',\n", | |
" 'moreover',\n", | |
" 'most',\n", | |
" 'mostly',\n", | |
" 'move',\n", | |
" 'much',\n", | |
" 'must',\n", | |
" 'my',\n", | |
" 'myself',\n", | |
" 'name',\n", | |
" 'namely',\n", | |
" 'neither',\n", | |
" 'never',\n", | |
" 'nevertheless',\n", | |
" 'next',\n", | |
" 'nine',\n", | |
" 'no',\n", | |
" 'nobody',\n", | |
" 'none',\n", | |
" 'noone',\n", | |
" 'nor',\n", | |
" 'not',\n", | |
" 'nothing',\n", | |
" 'now',\n", | |
" 'nowhere',\n", | |
" 'of',\n", | |
" 'off',\n", | |
" 'often',\n", | |
" 'on',\n", | |
" 'once',\n", | |
" 'one',\n", | |
" 'only',\n", | |
" 'onto',\n", | |
" 'or',\n", | |
" 'other',\n", | |
" 'others',\n", | |
" 'otherwise',\n", | |
" 'our',\n", | |
" 'ours',\n", | |
" 'ourselves',\n", | |
" 'out',\n", | |
" 'over',\n", | |
" 'own',\n", | |
" 'part',\n", | |
" 'per',\n", | |
" 'perhaps',\n", | |
" 'please',\n", | |
" 'put',\n", | |
" 'quite',\n", | |
" 'rather',\n", | |
" 're',\n", | |
" 'really',\n", | |
" 'regarding',\n", | |
" 'same',\n", | |
" 'say',\n", | |
" 'see',\n", | |
" 'seem',\n", | |
" 'seemed',\n", | |
" 'seeming',\n", | |
" 'seems',\n", | |
" 'serious',\n", | |
" 'several',\n", | |
" 'she',\n", | |
" 'should',\n", | |
" 'show',\n", | |
" 'side',\n", | |
" 'since',\n", | |
" 'six',\n", | |
" 'sixty',\n", | |
" 'so',\n", | |
" 'some',\n", | |
" 'somehow',\n", | |
" 'someone',\n", | |
" 'something',\n", | |
" 'sometime',\n", | |
" 'sometimes',\n", | |
" 'somewhere',\n", | |
" 'still',\n", | |
" 'such',\n", | |
" 'take',\n", | |
" 'ten',\n", | |
" 'than',\n", | |
" 'that',\n", | |
" 'the',\n", | |
" 'their',\n", | |
" 'them',\n", | |
" 'themselves',\n", | |
" 'then',\n", | |
" 'thence',\n", | |
" 'there',\n", | |
" 'thereafter',\n", | |
" 'thereby',\n", | |
" 'therefore',\n", | |
" 'therein',\n", | |
" 'thereupon',\n", | |
" 'these',\n", | |
" 'they',\n", | |
" 'third',\n", | |
" 'this',\n", | |
" 'those',\n", | |
" 'though',\n", | |
" 'three',\n", | |
" 'through',\n", | |
" 'throughout',\n", | |
" 'thru',\n", | |
" 'thus',\n", | |
" 'to',\n", | |
" 'together',\n", | |
" 'too',\n", | |
" 'top',\n", | |
" 'toward',\n", | |
" 'towards',\n", | |
" 'twelve',\n", | |
" 'twenty',\n", | |
" 'two',\n", | |
" 'under',\n", | |
" 'unless',\n", | |
" 'until',\n", | |
" 'up',\n", | |
" 'upon',\n", | |
" 'us',\n", | |
" 'used',\n", | |
" 'using',\n", | |
" 'various',\n", | |
" 'very',\n", | |
" 'via',\n", | |
" 'was',\n", | |
" 'we',\n", | |
" 'well',\n", | |
" 'were',\n", | |
" 'what',\n", | |
" 'whatever',\n", | |
" 'when',\n", | |
" 'whence',\n", | |
" 'whenever',\n", | |
" 'where',\n", | |
" 'whereafter',\n", | |
" 'whereas',\n", | |
" 'whereby',\n", | |
" 'wherein',\n", | |
" 'whereupon',\n", | |
" 'wherever',\n", | |
" 'whether',\n", | |
" 'which',\n", | |
" 'while',\n", | |
" 'whither',\n", | |
" 'who',\n", | |
" 'whoever',\n", | |
" 'whole',\n", | |
" 'whom',\n", | |
" 'whose',\n", | |
" 'why',\n", | |
" 'will',\n", | |
" 'with',\n", | |
" 'within',\n", | |
" 'without',\n", | |
" 'would',\n", | |
" 'yet',\n", | |
" 'you',\n", | |
" 'your',\n", | |
" 'yours',\n", | |
" 'yourself',\n", | |
" 'yourselves'}" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"spacy.en.STOP_WORDS" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'、', '。'}" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"spacy.ja.STOP_WORDS" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def print_token(token):\n", | |
" print(\"==========================\")\n", | |
" print(\"value:\",token.orth_)\n", | |
" print(\"lemma:\",token.lemma_) # lemma is the root of a word\n", | |
" print(\"shape:\",token.shape_) # shape is capitalization and punctuation\n", | |
"\n", | |
"def print_sents(sents):\n", | |
" for sent in sents:\n", | |
" print(\"Sentence:\")\n", | |
" print(sent)\n", | |
" print()\n", | |
"\n", | |
"def parse(text):\n", | |
" tokens = parser(text)\n", | |
" print_sents(tokens.sents)\n", | |
" tokens_orth = [token.orth_ for token in tokens]\n", | |
" print(tokens_orth)\n", | |
" for token in tokens:\n", | |
" print_token(token)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"英語モデルをダウンロード。\n", | |
"\n", | |
"```\n", | |
"$ python -m spacy download en\n", | |
"\n", | |
" Downloading en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz\n", | |
"\n", | |
"Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz\n", | |
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz (52.2MB)\n", | |
" 100% |████████████████████████████████| 52.2MB 411kB/s \n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Sentence:\n", | |
"I'm Mr. Cong.\n", | |
"\n", | |
"Sentence:\n", | |
"Dr. Duc is coming.\n", | |
"\n", | |
"Sentence:\n", | |
"Ph.D. Viet is the man overthere.\n", | |
"\n", | |
"['I', \"'m\", 'Mr.', 'Cong', '.', 'Dr.', 'Duc', 'is', 'coming', '.', 'Ph.D.', 'Viet', 'is', 'the', 'man', 'overthere', '.']\n", | |
"==========================\n", | |
"value: I\n", | |
"lemma: -PRON-\n", | |
"shape: X\n", | |
"==========================\n", | |
"value: 'm\n", | |
"lemma: be\n", | |
"shape: 'x\n", | |
"==========================\n", | |
"value: Mr.\n", | |
"lemma: mr.\n", | |
"shape: Xx.\n", | |
"==========================\n", | |
"value: Cong\n", | |
"lemma: cong\n", | |
"shape: Xxxx\n", | |
"==========================\n", | |
"value: .\n", | |
"lemma: .\n", | |
"shape: .\n", | |
"==========================\n", | |
"value: Dr.\n", | |
"lemma: dr.\n", | |
"shape: Xx.\n", | |
"==========================\n", | |
"value: Duc\n", | |
"lemma: duc\n", | |
"shape: Xxx\n", | |
"==========================\n", | |
"value: is\n", | |
"lemma: be\n", | |
"shape: xx\n", | |
"==========================\n", | |
"value: coming\n", | |
"lemma: come\n", | |
"shape: xxxx\n", | |
"==========================\n", | |
"value: .\n", | |
"lemma: .\n", | |
"shape: .\n", | |
"==========================\n", | |
"value: Ph.D.\n", | |
"lemma: ph.d.\n", | |
"shape: Xx.X.\n", | |
"==========================\n", | |
"value: Viet\n", | |
"lemma: viet\n", | |
"shape: Xxxx\n", | |
"==========================\n", | |
"value: is\n", | |
"lemma: be\n", | |
"shape: xx\n", | |
"==========================\n", | |
"value: the\n", | |
"lemma: the\n", | |
"shape: xxx\n", | |
"==========================\n", | |
"value: man\n", | |
"lemma: man\n", | |
"shape: xxx\n", | |
"==========================\n", | |
"value: overthere\n", | |
"lemma: overthere\n", | |
"shape: xxxx\n", | |
"==========================\n", | |
"value: .\n", | |
"lemma: .\n", | |
"shape: .\n" | |
] | |
} | |
], | |
"source": [ | |
"parser = spacy.en.English()\n", | |
"parse(\"I'm Mr. Cong. Dr. Duc is coming. Ph.D. Viet is the man overthere.\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"日本語モデルはあるのかな\n", | |
"\n", | |
"```\n", | |
"$ python -m spacy download ja\n", | |
"\n", | |
" Compatibility error\n", | |
"\n", | |
" No compatible model found for 'ja' (spaCy v1.8.2).\n", | |
"```\n", | |
"\n", | |
"まだないですね。" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "ValueError", | |
"evalue": "Sentence boundary detection requires the dependency parse, which requires data to be installed. For more info, see the documentation: \nhttps://spacy.io/docs/usage\n", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-6-47928386adc1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspacy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mja\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mJapanese\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"こんいちは。私はコンといいます。ベト博士はあちらにいます。\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;32m<ipython-input-4-04a8f5f4066a>\u001b[0m in \u001b[0;36mparse\u001b[0;34m(text)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mtokens\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mprint_sents\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokens\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msents\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mtokens_orth\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mtoken\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0morth_\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtoken\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtokens\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokens_orth\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m<ipython-input-4-04a8f5f4066a>\u001b[0m in \u001b[0;36mprint_sents\u001b[0;34m(sents)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mprint_sents\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msents\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0msent\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msents\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Sentence:\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/home/ubuntu/workspace/nlp-python/.env/lib/python3.4/site-packages/spacy/tokens/doc.pyx\u001b[0m in \u001b[0;36m__get__ (spacy/tokens/doc.cpp:10140)\u001b[0;34m()\u001b[0m\n\u001b[1;32m 435\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_parsed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 437\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 438\u001b[0m \u001b[0;34m\"Sentence boundary detection requires the dependency parse, which \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0;34m\"requires data to be installed. For more info, see the \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mValueError\u001b[0m: Sentence boundary detection requires the dependency parse, which requires data to be installed. For more info, see the documentation: \nhttps://spacy.io/docs/usage\n" | |
] | |
} | |
], | |
"source": [ | |
"parser = spacy.ja.Japanese()\n", | |
"parse(\"こんいちは。私はコンといいます。ベト博士はあちらにいます。\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"つまり、日本語モデルがないため、spaCyではドキュメントを文ごとに切ることができないのね。\n", | |
"言語モデルを追加する方法:\n", | |
"https://spacy.io/docs/usage/adding-languages" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.4.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
jupyter notebook --port $PORT --ip $IP --no-browser |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment