vochicong · November 20, 2019 09:17
diff --git a/01-japanese-nlp-janome-spacy.ipynb b/01-japanese-nlp-janome-spacy.ipynb
diff --git a/02-stopwords-sentences.ipynb b/02-stopwords-sentences.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'a',\n",
       " 'about',\n",
       " 'above',\n",
       " 'across',\n",
       " 'after',\n",
       " 'afterwards',\n",
       " 'again',\n",
       " 'against',\n",
       " 'all',\n",
       " 'almost',\n",
       " 'alone',\n",
       " 'along',\n",
       " 'already',\n",
       " 'also',\n",
       " 'although',\n",
       " 'always',\n",
       " 'am',\n",
       " 'among',\n",
       " 'amongst',\n",
       " 'amount',\n",
       " 'an',\n",
       " 'and',\n",
       " 'another',\n",
       " 'any',\n",
       " 'anyhow',\n",
       " 'anyone',\n",
       " 'anything',\n",
       " 'anyway',\n",
       " 'anywhere',\n",
       " 'are',\n",
       " 'around',\n",
       " 'as',\n",
       " 'at',\n",
       " 'back',\n",
       " 'be',\n",
       " 'became',\n",
       " 'because',\n",
       " 'become',\n",
       " 'becomes',\n",
       " 'becoming',\n",
       " 'been',\n",
       " 'before',\n",
       " 'beforehand',\n",
       " 'behind',\n",
       " 'being',\n",
       " 'below',\n",
       " 'beside',\n",
       " 'besides',\n",
       " 'between',\n",
       " 'beyond',\n",
       " 'both',\n",
       " 'bottom',\n",
       " 'but',\n",
       " 'by',\n",
       " 'ca',\n",
       " 'call',\n",
       " 'can',\n",
       " 'cannot',\n",
       " 'could',\n",
       " 'did',\n",
       " 'do',\n",
       " 'does',\n",
       " 'doing',\n",
       " 'done',\n",
       " 'down',\n",
       " 'due',\n",
       " 'during',\n",
       " 'each',\n",
       " 'eight',\n",
       " 'either',\n",
       " 'eleven',\n",
       " 'else',\n",
       " 'elsewhere',\n",
       " 'empty',\n",
       " 'enough',\n",
       " 'etc',\n",
       " 'even',\n",
       " 'ever',\n",
       " 'every',\n",
       " 'everyone',\n",
       " 'everything',\n",
       " 'everywhere',\n",
       " 'except',\n",
       " 'few',\n",
       " 'fifteen',\n",
       " 'fifty',\n",
       " 'first',\n",
       " 'five',\n",
       " 'for',\n",
       " 'former',\n",
       " 'formerly',\n",
       " 'forty',\n",
       " 'four',\n",
       " 'from',\n",
       " 'front',\n",
       " 'full',\n",
       " 'further',\n",
       " 'get',\n",
       " 'give',\n",
       " 'go',\n",
       " 'had',\n",
       " 'has',\n",
       " 'have',\n",
       " 'he',\n",
       " 'hence',\n",
       " 'her',\n",
       " 'here',\n",
       " 'hereafter',\n",
       " 'hereby',\n",
       " 'herein',\n",
       " 'hereupon',\n",
       " 'hers',\n",
       " 'herself',\n",
       " 'him',\n",
       " 'himself',\n",
       " 'his',\n",
       " 'how',\n",
       " 'however',\n",
       " 'hundred',\n",
       " 'i',\n",
       " 'if',\n",
       " 'in',\n",
       " 'inc',\n",
       " 'indeed',\n",
       " 'into',\n",
       " 'is',\n",
       " 'it',\n",
       " 'its',\n",
       " 'itself',\n",
       " 'just',\n",
       " 'keep',\n",
       " 'last',\n",
       " 'latter',\n",
       " 'latterly',\n",
       " 'least',\n",
       " 'less',\n",
       " 'made',\n",
       " 'make',\n",
       " 'many',\n",
       " 'may',\n",
       " 'me',\n",
       " 'meanwhile',\n",
       " 'might',\n",
       " 'mine',\n",
       " 'more',\n",
       " 'moreover',\n",
       " 'most',\n",
       " 'mostly',\n",
       " 'move',\n",
       " 'much',\n",
       " 'must',\n",
       " 'my',\n",
       " 'myself',\n",
       " 'name',\n",
       " 'namely',\n",
       " 'neither',\n",
       " 'never',\n",
       " 'nevertheless',\n",
       " 'next',\n",
       " 'nine',\n",
       " 'no',\n",
       " 'nobody',\n",
       " 'none',\n",
       " 'noone',\n",
       " 'nor',\n",
       " 'not',\n",
       " 'nothing',\n",
       " 'now',\n",
       " 'nowhere',\n",
       " 'of',\n",
       " 'off',\n",
       " 'often',\n",
       " 'on',\n",
       " 'once',\n",
       " 'one',\n",
       " 'only',\n",
       " 'onto',\n",
       " 'or',\n",
       " 'other',\n",
       " 'others',\n",
       " 'otherwise',\n",
       " 'our',\n",
       " 'ours',\n",
       " 'ourselves',\n",
       " 'out',\n",
       " 'over',\n",
       " 'own',\n",
       " 'part',\n",
       " 'per',\n",
       " 'perhaps',\n",
       " 'please',\n",
       " 'put',\n",
       " 'quite',\n",
       " 'rather',\n",
       " 're',\n",
       " 'really',\n",
       " 'regarding',\n",
       " 'same',\n",
       " 'say',\n",
       " 'see',\n",
       " 'seem',\n",
       " 'seemed',\n",
       " 'seeming',\n",
       " 'seems',\n",
       " 'serious',\n",
       " 'several',\n",
       " 'she',\n",
       " 'should',\n",
       " 'show',\n",
       " 'side',\n",
       " 'since',\n",
       " 'six',\n",
       " 'sixty',\n",
       " 'so',\n",
       " 'some',\n",
       " 'somehow',\n",
       " 'someone',\n",
       " 'something',\n",
       " 'sometime',\n",
       " 'sometimes',\n",
       " 'somewhere',\n",
       " 'still',\n",
       " 'such',\n",
       " 'take',\n",
       " 'ten',\n",
       " 'than',\n",
       " 'that',\n",
       " 'the',\n",
       " 'their',\n",
       " 'them',\n",
       " 'themselves',\n",
       " 'then',\n",
       " 'thence',\n",
       " 'there',\n",
       " 'thereafter',\n",
       " 'thereby',\n",
       " 'therefore',\n",
       " 'therein',\n",
       " 'thereupon',\n",
       " 'these',\n",
       " 'they',\n",
       " 'third',\n",
       " 'this',\n",
       " 'those',\n",
       " 'though',\n",
       " 'three',\n",
       " 'through',\n",
       " 'throughout',\n",
       " 'thru',\n",
       " 'thus',\n",
       " 'to',\n",
       " 'together',\n",
       " 'too',\n",
       " 'top',\n",
       " 'toward',\n",
       " 'towards',\n",
       " 'twelve',\n",
       " 'twenty',\n",
       " 'two',\n",
       " 'under',\n",
       " 'unless',\n",
       " 'until',\n",
       " 'up',\n",
       " 'upon',\n",
       " 'us',\n",
       " 'used',\n",
       " 'using',\n",
       " 'various',\n",
       " 'very',\n",
       " 'via',\n",
       " 'was',\n",
       " 'we',\n",
       " 'well',\n",
       " 'were',\n",
       " 'what',\n",
       " 'whatever',\n",
       " 'when',\n",
       " 'whence',\n",
       " 'whenever',\n",
       " 'where',\n",
       " 'whereafter',\n",
       " 'whereas',\n",
       " 'whereby',\n",
       " 'wherein',\n",
       " 'whereupon',\n",
       " 'wherever',\n",
       " 'whether',\n",
       " 'which',\n",
       " 'while',\n",
       " 'whither',\n",
       " 'who',\n",
       " 'whoever',\n",
       " 'whole',\n",
       " 'whom',\n",
       " 'whose',\n",
       " 'why',\n",
       " 'will',\n",
       " 'with',\n",
       " 'within',\n",
       " 'without',\n",
       " 'would',\n",
       " 'yet',\n",
       " 'you',\n",
       " 'your',\n",
       " 'yours',\n",
       " 'yourself',\n",
       " 'yourselves'}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "spacy.en.STOP_WORDS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'、', '。'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "spacy.ja.STOP_WORDS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_token(token):\n",
    "    print(\"==========================\")\n",
    "    print(\"value:\",token.orth_)\n",
    "    print(\"lemma:\",token.lemma_) # lemma is the root of a word\n",
    "    print(\"shape:\",token.shape_) # shape is capitalization and punctuation\n",
    "\n",
    "def print_sents(sents):\n",
    "    for sent in sents:\n",
    "        print(\"Sentence:\")\n",
    "        print(sent)\n",
    "        print()\n",
    "\n",
    "def parse(text):\n",
    "    tokens = parser(text)\n",
    "    print_sents(tokens.sents)\n",
    "    tokens_orth = [token.orth_ for token in tokens]\n",
    "    print(tokens_orth)\n",
    "    for token in tokens:\n",
    "        print_token(token)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "英語モデルをダウンロード。\n",
    "\n",
    "```\n",
    "$ python -m spacy download en\n",
    "\n",
    "    Downloading en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz\n",
    "\n",
    "Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz\n",
    "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz (52.2MB)\n",
    "    100% |████████████████████████████████| 52.2MB 411kB/s \n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentence:\n",
      "I'm Mr. Cong.\n",
      "\n",
      "Sentence:\n",
      "Dr. Duc is coming.\n",
      "\n",
      "Sentence:\n",
      "Ph.D. Viet is the man overthere.\n",
      "\n",
      "['I', \"'m\", 'Mr.', 'Cong', '.', 'Dr.', 'Duc', 'is', 'coming', '.', 'Ph.D.', 'Viet', 'is', 'the', 'man', 'overthere', '.']\n",
      "==========================\n",
      "value: I\n",
      "lemma: -PRON-\n",
      "shape: X\n",
      "==========================\n",
      "value: 'm\n",
      "lemma: be\n",
      "shape: 'x\n",
      "==========================\n",
      "value: Mr.\n",
      "lemma: mr.\n",
      "shape: Xx.\n",
      "==========================\n",
      "value: Cong\n",
      "lemma: cong\n",
      "shape: Xxxx\n",
      "==========================\n",
      "value: .\n",
      "lemma: .\n",
      "shape: .\n",
      "==========================\n",
      "value: Dr.\n",
      "lemma: dr.\n",
      "shape: Xx.\n",
      "==========================\n",
      "value: Duc\n",
      "lemma: duc\n",
      "shape: Xxx\n",
      "==========================\n",
      "value: is\n",
      "lemma: be\n",
      "shape: xx\n",
      "==========================\n",
      "value: coming\n",
      "lemma: come\n",
      "shape: xxxx\n",
      "==========================\n",
      "value: .\n",
      "lemma: .\n",
      "shape: .\n",
      "==========================\n",
      "value: Ph.D.\n",
      "lemma: ph.d.\n",
      "shape: Xx.X.\n",
      "==========================\n",
      "value: Viet\n",
      "lemma: viet\n",
      "shape: Xxxx\n",
      "==========================\n",
      "value: is\n",
      "lemma: be\n",
      "shape: xx\n",
      "==========================\n",
      "value: the\n",
      "lemma: the\n",
      "shape: xxx\n",
      "==========================\n",
      "value: man\n",
      "lemma: man\n",
      "shape: xxx\n",
      "==========================\n",
      "value: overthere\n",
      "lemma: overthere\n",
      "shape: xxxx\n",
      "==========================\n",
      "value: .\n",
      "lemma: .\n",
      "shape: .\n"
     ]
    }
   ],
   "source": [
    "parser = spacy.en.English()\n",
    "parse(\"I'm Mr. Cong. Dr. Duc is coming. Ph.D. Viet is the man overthere.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "日本語モデルはあるのかな\n",
    "\n",
    "```\n",
    "$ python -m spacy download ja\n",
    "\n",
    "    Compatibility error\n",
    "\n",
    "    No compatible model found for 'ja' (spaCy v1.8.2).\n",
    "```\n",
    "\n",
    "まだないですね。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Sentence boundary detection requires the dependency parse, which requires data to be installed. For more info, see the documentation: \nhttps://spacy.io/docs/usage\n",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-6-47928386adc1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspacy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mja\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mJapanese\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"こんいちは。私はコンといいます。ベト博士はあちらにいます。\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-4-04a8f5f4066a>\u001b[0m in \u001b[0;36mparse\u001b[0;34m(text)\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     14\u001b[0m     \u001b[0mtokens\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m     \u001b[0mprint_sents\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokens\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msents\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     16\u001b[0m     \u001b[0mtokens_orth\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mtoken\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0morth_\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtoken\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtokens\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     17\u001b[0m     \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokens_orth\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-4-04a8f5f4066a>\u001b[0m in \u001b[0;36mprint_sents\u001b[0;34m(sents)\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mprint_sents\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msents\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m     \u001b[0;32mfor\u001b[0m \u001b[0msent\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msents\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      9\u001b[0m         \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Sentence:\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m         \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/home/ubuntu/workspace/nlp-python/.env/lib/python3.4/site-packages/spacy/tokens/doc.pyx\u001b[0m in \u001b[0;36m__get__ (spacy/tokens/doc.cpp:10140)\u001b[0;34m()\u001b[0m\n\u001b[1;32m    435\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    436\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_parsed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 437\u001b[0;31m                 raise ValueError(\n\u001b[0m\u001b[1;32m    438\u001b[0m                     \u001b[0;34m\"Sentence boundary detection requires the dependency parse, which \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    439\u001b[0m                     \u001b[0;34m\"requires data to be installed. For more info, see the \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Sentence boundary detection requires the dependency parse, which requires data to be installed. For more info, see the documentation: \nhttps://spacy.io/docs/usage\n"
     ]
    }
   ],
   "source": [
    "parser = spacy.ja.Japanese()\n",
    "parse(\"こんいちは。私はコンといいます。ベト博士はあちらにいます。\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "つまり、日本語モデル（依存構造解析に必要なデータ）がまだ提供されていないため、spaCyでは文境界を検出できず、ドキュメントを文ごとに分割できない。\n",
    "言語モデルを追加する方法：\n",
    "https://spacy.io/docs/usage/adding-languages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
diff --git a/99-start-jupyter-cloud9.sh b/99-start-jupyter-cloud9.sh
 jupyter notebook --port $PORT --ip $IP --no-browser
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import spacy"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'a',\n",
	" 'about',\n",
	" 'above',\n",
	" 'across',\n",
	" 'after',\n",
	" 'afterwards',\n",
	" 'again',\n",
	" 'against',\n",
	" 'all',\n",
	" 'almost',\n",
	" 'alone',\n",
	" 'along',\n",
	" 'already',\n",
	" 'also',\n",
	" 'although',\n",
	" 'always',\n",
	" 'am',\n",
	" 'among',\n",
	" 'amongst',\n",
	" 'amount',\n",
	" 'an',\n",
	" 'and',\n",
	" 'another',\n",
	" 'any',\n",
	" 'anyhow',\n",
	" 'anyone',\n",
	" 'anything',\n",
	" 'anyway',\n",
	" 'anywhere',\n",
	" 'are',\n",
	" 'around',\n",
	" 'as',\n",
	" 'at',\n",
	" 'back',\n",
	" 'be',\n",
	" 'became',\n",
	" 'because',\n",
	" 'become',\n",
	" 'becomes',\n",
	" 'becoming',\n",
	" 'been',\n",
	" 'before',\n",
	" 'beforehand',\n",
	" 'behind',\n",
	" 'being',\n",
	" 'below',\n",
	" 'beside',\n",
	" 'besides',\n",
	" 'between',\n",
	" 'beyond',\n",
	" 'both',\n",
	" 'bottom',\n",
	" 'but',\n",
	" 'by',\n",
	" 'ca',\n",
	" 'call',\n",
	" 'can',\n",
	" 'cannot',\n",
	" 'could',\n",
	" 'did',\n",
	" 'do',\n",
	" 'does',\n",
	" 'doing',\n",
	" 'done',\n",
	" 'down',\n",
	" 'due',\n",
	" 'during',\n",
	" 'each',\n",
	" 'eight',\n",
	" 'either',\n",
	" 'eleven',\n",
	" 'else',\n",
	" 'elsewhere',\n",
	" 'empty',\n",
	" 'enough',\n",
	" 'etc',\n",
	" 'even',\n",
	" 'ever',\n",
	" 'every',\n",
	" 'everyone',\n",
	" 'everything',\n",
	" 'everywhere',\n",
	" 'except',\n",
	" 'few',\n",
	" 'fifteen',\n",
	" 'fifty',\n",
	" 'first',\n",
	" 'five',\n",
	" 'for',\n",
	" 'former',\n",
	" 'formerly',\n",
	" 'forty',\n",
	" 'four',\n",
	" 'from',\n",
	" 'front',\n",
	" 'full',\n",
	" 'further',\n",
	" 'get',\n",
	" 'give',\n",
	" 'go',\n",
	" 'had',\n",
	" 'has',\n",
	" 'have',\n",
	" 'he',\n",
	" 'hence',\n",
	" 'her',\n",
	" 'here',\n",
	" 'hereafter',\n",
	" 'hereby',\n",
	" 'herein',\n",
	" 'hereupon',\n",
	" 'hers',\n",
	" 'herself',\n",
	" 'him',\n",
	" 'himself',\n",
	" 'his',\n",
	" 'how',\n",
	" 'however',\n",
	" 'hundred',\n",
	" 'i',\n",
	" 'if',\n",
	" 'in',\n",
	" 'inc',\n",
	" 'indeed',\n",
	" 'into',\n",
	" 'is',\n",
	" 'it',\n",
	" 'its',\n",
	" 'itself',\n",
	" 'just',\n",
	" 'keep',\n",
	" 'last',\n",
	" 'latter',\n",
	" 'latterly',\n",
	" 'least',\n",
	" 'less',\n",
	" 'made',\n",
	" 'make',\n",
	" 'many',\n",
	" 'may',\n",
	" 'me',\n",
	" 'meanwhile',\n",
	" 'might',\n",
	" 'mine',\n",
	" 'more',\n",
	" 'moreover',\n",
	" 'most',\n",
	" 'mostly',\n",
	" 'move',\n",
	" 'much',\n",
	" 'must',\n",
	" 'my',\n",
	" 'myself',\n",
	" 'name',\n",
	" 'namely',\n",
	" 'neither',\n",
	" 'never',\n",
	" 'nevertheless',\n",
	" 'next',\n",
	" 'nine',\n",
	" 'no',\n",
	" 'nobody',\n",
	" 'none',\n",
	" 'noone',\n",
	" 'nor',\n",
	" 'not',\n",
	" 'nothing',\n",
	" 'now',\n",
	" 'nowhere',\n",
	" 'of',\n",
	" 'off',\n",
	" 'often',\n",
	" 'on',\n",
	" 'once',\n",
	" 'one',\n",
	" 'only',\n",
	" 'onto',\n",
	" 'or',\n",
	" 'other',\n",
	" 'others',\n",
	" 'otherwise',\n",
	" 'our',\n",
	" 'ours',\n",
	" 'ourselves',\n",
	" 'out',\n",
	" 'over',\n",
	" 'own',\n",
	" 'part',\n",
	" 'per',\n",
	" 'perhaps',\n",
	" 'please',\n",
	" 'put',\n",
	" 'quite',\n",
	" 'rather',\n",
	" 're',\n",
	" 'really',\n",
	" 'regarding',\n",
	" 'same',\n",
	" 'say',\n",
	" 'see',\n",
	" 'seem',\n",
	" 'seemed',\n",
	" 'seeming',\n",
	" 'seems',\n",
	" 'serious',\n",
	" 'several',\n",
	" 'she',\n",
	" 'should',\n",
	" 'show',\n",
	" 'side',\n",
	" 'since',\n",
	" 'six',\n",
	" 'sixty',\n",
	" 'so',\n",
	" 'some',\n",
	" 'somehow',\n",
	" 'someone',\n",
	" 'something',\n",
	" 'sometime',\n",
	" 'sometimes',\n",
	" 'somewhere',\n",
	" 'still',\n",
	" 'such',\n",
	" 'take',\n",
	" 'ten',\n",
	" 'than',\n",
	" 'that',\n",
	" 'the',\n",
	" 'their',\n",
	" 'them',\n",
	" 'themselves',\n",
	" 'then',\n",
	" 'thence',\n",
	" 'there',\n",
	" 'thereafter',\n",
	" 'thereby',\n",
	" 'therefore',\n",
	" 'therein',\n",
	" 'thereupon',\n",
	" 'these',\n",
	" 'they',\n",
	" 'third',\n",
	" 'this',\n",
	" 'those',\n",
	" 'though',\n",
	" 'three',\n",
	" 'through',\n",
	" 'throughout',\n",
	" 'thru',\n",
	" 'thus',\n",
	" 'to',\n",
	" 'together',\n",
	" 'too',\n",
	" 'top',\n",
	" 'toward',\n",
	" 'towards',\n",
	" 'twelve',\n",
	" 'twenty',\n",
	" 'two',\n",
	" 'under',\n",
	" 'unless',\n",
	" 'until',\n",
	" 'up',\n",
	" 'upon',\n",
	" 'us',\n",
	" 'used',\n",
	" 'using',\n",
	" 'various',\n",
	" 'very',\n",
	" 'via',\n",
	" 'was',\n",
	" 'we',\n",
	" 'well',\n",
	" 'were',\n",
	" 'what',\n",
	" 'whatever',\n",
	" 'when',\n",
	" 'whence',\n",
	" 'whenever',\n",
	" 'where',\n",
	" 'whereafter',\n",
	" 'whereas',\n",
	" 'whereby',\n",
	" 'wherein',\n",
	" 'whereupon',\n",
	" 'wherever',\n",
	" 'whether',\n",
	" 'which',\n",
	" 'while',\n",
	" 'whither',\n",
	" 'who',\n",
	" 'whoever',\n",
	" 'whole',\n",
	" 'whom',\n",
	" 'whose',\n",
	" 'why',\n",
	" 'will',\n",
	" 'with',\n",
	" 'within',\n",
	" 'without',\n",
	" 'would',\n",
	" 'yet',\n",
	" 'you',\n",
	" 'your',\n",
	" 'yours',\n",
	" 'yourself',\n",
	" 'yourselves'}"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"spacy.en.STOP_WORDS"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'、', '。'}"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"spacy.ja.STOP_WORDS"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"def print_token(token):\n",
	"    print(\"==========================\")\n",
	"    print(\"value:\",token.orth_)\n",
	"    print(\"lemma:\",token.lemma_) # lemma is the root of a word\n",
	"    print(\"shape:\",token.shape_) # shape is capitalization and punctuation\n",
	"\n",
	"def print_sents(sents):\n",
	"    for sent in sents:\n",
	"        print(\"Sentence:\")\n",
	"        print(sent)\n",
	"        print()\n",
	"\n",
	"def parse(text):\n",
	"    tokens = parser(text)\n",
	"    print_sents(tokens.sents)\n",
	"    tokens_orth = [token.orth_ for token in tokens]\n",
	"    print(tokens_orth)\n",
	"    for token in tokens:\n",
	"        print_token(token)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"英語モデルをダウンロード。\n",
	"\n",
	"```\n",
	"$ python -m spacy download en\n",
	"\n",
	" Downloading en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz\n",
	"\n",
	"Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz\n",
	" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz (52.2MB)\n",
	"    100% |████████████████████████████████| 52.2MB 411kB/s \n",
	"```"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Sentence:\n",
	"I'm Mr. Cong.\n",
	"\n",
	"Sentence:\n",
	"Dr. Duc is coming.\n",
	"\n",
	"Sentence:\n",
	"Ph.D. Viet is the man overthere.\n",
	"\n",
	"['I', \"'m\", 'Mr.', 'Cong', '.', 'Dr.', 'Duc', 'is', 'coming', '.', 'Ph.D.', 'Viet', 'is', 'the', 'man', 'overthere', '.']\n",
	"==========================\n",
	"value: I\n",
	"lemma: -PRON-\n",
	"shape: X\n",
	"==========================\n",
	"value: 'm\n",
	"lemma: be\n",
	"shape: 'x\n",
	"==========================\n",
	"value: Mr.\n",
	"lemma: mr.\n",
	"shape: Xx.\n",
	"==========================\n",
	"value: Cong\n",
	"lemma: cong\n",
	"shape: Xxxx\n",
	"==========================\n",
	"value: .\n",
	"lemma: .\n",
	"shape: .\n",
	"==========================\n",
	"value: Dr.\n",
	"lemma: dr.\n",
	"shape: Xx.\n",
	"==========================\n",
	"value: Duc\n",
	"lemma: duc\n",
	"shape: Xxx\n",
	"==========================\n",
	"value: is\n",
	"lemma: be\n",
	"shape: xx\n",
	"==========================\n",
	"value: coming\n",
	"lemma: come\n",
	"shape: xxxx\n",
	"==========================\n",
	"value: .\n",
	"lemma: .\n",
	"shape: .\n",
	"==========================\n",
	"value: Ph.D.\n",
	"lemma: ph.d.\n",
	"shape: Xx.X.\n",
	"==========================\n",
	"value: Viet\n",
	"lemma: viet\n",
	"shape: Xxxx\n",
	"==========================\n",
	"value: is\n",
	"lemma: be\n",
	"shape: xx\n",
	"==========================\n",
	"value: the\n",
	"lemma: the\n",
	"shape: xxx\n",
	"==========================\n",
	"value: man\n",
	"lemma: man\n",
	"shape: xxx\n",
	"==========================\n",
	"value: overthere\n",
	"lemma: overthere\n",
	"shape: xxxx\n",
	"==========================\n",
	"value: .\n",
	"lemma: .\n",
	"shape: .\n"
	]
	}
	],
	"source": [
	"parser = spacy.en.English()\n",
	"parse(\"I'm Mr. Cong. Dr. Duc is coming. Ph.D. Viet is the man overthere.\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"日本語モデルはあるのかな\n",
	"\n",
	"```\n",
	"$ python -m spacy download ja\n",
	"\n",
	" Compatibility error\n",
	"\n",
	" No compatible model found for 'ja' (spaCy v1.8.2).\n",
	"```\n",
	"\n",
	"まだないですね。"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"ename": "ValueError",
	"evalue": "Sentence boundary detection requires the dependency parse, which requires data to be installed. For more info, see the documentation: \nhttps://spacy.io/docs/usage\n",
	"output_type": "error",
	"traceback": [
	"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
	"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
	"\u001b[0;32m<ipython-input-6-47928386adc1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspacy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mja\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mJapanese\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"こんいちは。私はコンといいます。ベト博士はあちらにいます。\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
	"\u001b[0;32m<ipython-input-4-04a8f5f4066a>\u001b[0m in \u001b[0;36mparse\u001b[0;34m(text)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mtokens\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mprint_sents\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokens\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msents\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mtokens_orth\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mtoken\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0morth_\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtoken\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtokens\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokens_orth\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;32m<ipython-input-4-04a8f5f4066a>\u001b[0m in \u001b[0;36mprint_sents\u001b[0;34m(sents)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mprint_sents\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msents\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0msent\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msents\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Sentence:\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;32m/home/ubuntu/workspace/nlp-python/.env/lib/python3.4/site-packages/spacy/tokens/doc.pyx\u001b[0m in \u001b[0;36m__get__ (spacy/tokens/doc.cpp:10140)\u001b[0;34m()\u001b[0m\n\u001b[1;32m 435\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_parsed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 437\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 438\u001b[0m \u001b[0;34m\"Sentence boundary detection requires the dependency parse, which \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0;34m\"requires data to be installed. For more info, see the \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;31mValueError\u001b[0m: Sentence boundary detection requires the dependency parse, which requires data to be installed. For more info, see the documentation: \nhttps://spacy.io/docs/usage\n"
	]
	}
	],
	"source": [
	"parser = spacy.ja.Japanese()\n",
	"parse(\"こんいちは。私はコンといいます。ベト博士はあちらにいます。\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"つまり、日本語モデルがないため、spaCyではドキュメントを文ごとに切ることができないのね。\n",
	"言語モデルを追加する方法：\n",
	"https://spacy.io/docs/usage/adding-languages"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.4.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}