@vochicong
Last active November 20, 2019 09:17
Japanese NLP with janome/spaCy
```python
import spacy
```
```python
spacy.en.STOP_WORDS
```

Output:

```
{'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against',
 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always',
 'am', 'among', 'amongst', 'amount', 'an', 'and', 'another', 'any', 'anyhow',
 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at',
 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been',
 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides',
 'between', 'beyond', 'both', 'bottom', 'but', 'by', 'ca', 'call', 'can',
 'cannot', 'could', 'did', 'do', 'does', 'doing', 'done', 'down', 'due',
 'during', 'each', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty',
 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything',
 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'first', 'five', 'for',
 'former', 'formerly', 'forty', 'four', 'from', 'front', 'full', 'further',
 'get', 'give', 'go', 'had', 'has', 'have', 'he', 'hence', 'her', 'here',
 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him',
 'himself', 'his', 'how', 'however', 'hundred', 'i', 'if', 'in', 'inc',
 'indeed', 'into', 'is', 'it', 'its', 'itself', 'just', 'keep', 'last',
 'latter', 'latterly', 'least', 'less', 'made', 'make', 'many', 'may', 'me',
 'meanwhile', 'might', 'mine', 'more', 'moreover', 'most', 'mostly', 'move',
 'much', 'must', 'my', 'myself', 'name', 'namely', 'neither', 'never',
 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor',
 'not', 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once',
 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours',
 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please',
 'put', 'quite', 'rather', 're', 'really', 'regarding', 'same', 'say', 'see',
 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should',
 'show', 'side', 'since', 'six', 'sixty', 'so', 'some', 'somehow', 'someone',
 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'take',
 'ten', 'than', 'that', 'the', 'their', 'them', 'themselves', 'then',
 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein',
 'thereupon', 'these', 'they', 'third', 'this', 'those', 'though', 'three',
 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top',
 'toward', 'towards', 'twelve', 'twenty', 'two', 'under', 'unless', 'until',
 'up', 'upon', 'us', 'used', 'using', 'various', 'very', 'via', 'was', 'we',
 'well', 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever',
 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom',
 'whose', 'why', 'will', 'with', 'within', 'without', 'would', 'yet', 'you',
 'your', 'yours', 'yourself', 'yourselves'}
```
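The set can be used directly to filter function words out of a token stream. A minimal sketch, assuming the same spaCy 1.x API as the rest of this notebook (the sample sentence is made up):

```python
import spacy

parser = spacy.en.English()
doc = parser("This is a short example sentence about spaCy.")

# Drop tokens whose lowercase form appears in the stop-word set.
content = [t.orth_ for t in doc if t.orth_.lower() not in spacy.en.STOP_WORDS]
print(content)  # punctuation survives; expect something like ['short', 'example', 'sentence', 'spaCy', '.']
```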
```python
spacy.ja.STOP_WORDS
```

Output:

```
{'、', '。'}
```
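At this version the Japanese set holds only the two punctuation marks above, so a real pipeline would extend it. A hypothetical sketch (the particle list is an illustrative choice, not part of spaCy):

```python
# Extend the nearly empty Japanese stop-word set with a few common particles.
JA_STOP_WORDS = set(spacy.ja.STOP_WORDS) | {"は", "が", "を", "に", "の", "と", "で"}
print(sorted(JA_STOP_WORDS))
```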
```python
def print_token(token):
    print("==========================")
    print("value:", token.orth_)
    print("lemma:", token.lemma_)  # lemma is the root form of a word
    print("shape:", token.shape_)  # shape encodes capitalization and punctuation

def print_sents(sents):
    for sent in sents:
        print("Sentence:")
        print(sent)
        print()

def parse(text):
    # Uses the module-level `parser` assigned in the cells below.
    tokens = parser(text)
    print_sents(tokens.sents)
    tokens_orth = [token.orth_ for token in tokens]
    print(tokens_orth)
    for token in tokens:
        print_token(token)
```
Download the English model.

```
$ python -m spacy download en

    Downloading en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz (52.2MB)
    100% |████████████████████████████████| 52.2MB 411kB/s
```
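Once downloaded, the model can also be loaded by its shortcut name. A minimal sketch, assuming the `spacy.load` API of the spaCy 1.x release used here:

```python
import spacy

nlp = spacy.load('en')  # resolves to the en_core_web_sm model downloaded above
doc = nlp("Dr. Duc is coming.")
print([(t.orth_, t.lemma_, t.pos_) for t in doc])
```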
```python
parser = spacy.en.English()
parse("I'm Mr. Cong. Dr. Duc is coming. Ph.D. Viet is the man overthere.")
```

Output:

```
Sentence:
I'm Mr. Cong.

Sentence:
Dr. Duc is coming.

Sentence:
Ph.D. Viet is the man overthere.

['I', "'m", 'Mr.', 'Cong', '.', 'Dr.', 'Duc', 'is', 'coming', '.', 'Ph.D.', 'Viet', 'is', 'the', 'man', 'overthere', '.']
==========================
value: I
lemma: -PRON-
shape: X
==========================
value: 'm
lemma: be
shape: 'x
==========================
value: Mr.
lemma: mr.
shape: Xx.
==========================
value: Cong
lemma: cong
shape: Xxxx
==========================
value: .
lemma: .
shape: .
==========================
value: Dr.
lemma: dr.
shape: Xx.
==========================
value: Duc
lemma: duc
shape: Xxx
==========================
value: is
lemma: be
shape: xx
==========================
value: coming
lemma: come
shape: xxxx
==========================
value: .
lemma: .
shape: .
==========================
value: Ph.D.
lemma: ph.d.
shape: Xx.X.
==========================
value: Viet
lemma: viet
shape: Xxxx
==========================
value: is
lemma: be
shape: xx
==========================
value: the
lemma: the
shape: xxx
==========================
value: man
lemma: man
shape: xxx
==========================
value: overthere
lemma: overthere
shape: xxxx
==========================
value: .
lemma: .
shape: .
```
Is there a Japanese model?

```
$ python -m spacy download ja

    Compatibility error

    No compatible model found for 'ja' (spaCy v1.8.2).
```

Not yet.
```python
parser = spacy.ja.Japanese()
parse("こんいちは。私はコンといいます。ベト博士はあちらにいます。")
```

Output:

```
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-6-47928386adc1> in <module>()
      1 parser = spacy.ja.Japanese()
----> 2 parse("こんいちは。私はコンといいます。ベト博士はあちらにいます。")

<ipython-input-4-04a8f5f4066a> in parse(text)
     13 def parse(text):
     14     tokens = parser(text)
---> 15     print_sents(tokens.sents)
     16     tokens_orth = [token.orth_ for token in tokens]
     17     print(tokens_orth)

<ipython-input-4-04a8f5f4066a> in print_sents(sents)
      6 
      7 def print_sents(sents):
----> 8     for sent in sents:
      9         print("Sentence:")
     10         print(sent)

/home/ubuntu/workspace/nlp-python/.env/lib/python3.4/site-packages/spacy/tokens/doc.pyx in __get__ (spacy/tokens/doc.cpp:10140)()
    435 
    436         if not self.is_parsed:
--> 437             raise ValueError(
    438                 "Sentence boundary detection requires the dependency parse, which "
    439                 "requires data to be installed. For more info, see the "

ValueError: Sentence boundary detection requires the dependency parse, which requires data to be installed. For more info, see the documentation:
https://spacy.io/docs/usage
```
So spaCy cannot split the document into sentences, because there is no Japanese language model yet. How to add a language model:
https://spacy.io/docs/usage/adding-languages
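Until spaCy ships a Japanese model, janome (the tokenizer named alongside spaCy in this gist's title) can fill the gap. A hedged sketch, not part of the notebook above: naive sentence splitting on the 。 mark plus janome's morphological analysis (assumes `pip install janome`):

```python
from janome.tokenizer import Tokenizer

text = "こんいちは。私はコンといいます。ベト博士はあちらにいます。"

# Naive sentence boundary detection: split on the ideographic full stop.
for sent in (s + "。" for s in text.split("。") if s):
    print("Sentence:", sent)

# Morphological analysis with janome: each token carries its surface form,
# part-of-speech tags, and base (dictionary) form.
tokenizer = Tokenizer()
for token in tokenizer.tokenize(text):
    print(token.surface, token.part_of_speech, token.base_form)
```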
The notebook ran on a Python 3 kernel (Python 3.4.3, nbformat 4).
The command used to launch the notebook server:

```
$ jupyter notebook --port $PORT --ip $IP --no-browser
```