d2207197 · August 29, 2015 14:06
diff --git a/nlp lab1.ipynb b/nlp lab1.ipynb
 {
 "metadata": {
  "name": "",
  "signature": "sha256:d7a1109bb79e4282b03b19220e9ff24f508de499b4b479c97bf4bdad1c99a173"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from itertools import chain, imap, islice, izip\n",
      "from collections import Counter\n",
      "import re\n",
      "from functools import partial"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 98
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# big_text = open('big.txt').read()  # alternative: keep the text unsplit\n",
      "# Split the corpus into paragraphs (separated by a blank line).\n",
      "# The with-block closes the file as soon as it has been read.\n",
      "with open('big.txt') as big_file:\n",
      "    big_text_paras = big_file.read().split('\\n\\n')\n",
      "# big_text_lines = open('big.txt').readlines()  # alternative: split by line"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 240
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Count 1- to 5-gram occurrences (5 is the default upper bound)\n",
      "def count_ngrams(big_text_paras, max_n=5):\n",
      "    \"\"\"Count word n-grams of every order from 1 to max_n.\n",
      "\n",
      "    Each paragraph is lower-cased and tokenised into runs of the letters\n",
      "    a-z; n-grams are built within one paragraph at a time, so they never\n",
      "    span a paragraph boundary.\n",
      "\n",
      "    Args:\n",
      "        big_text_paras: iterable of paragraph strings.\n",
      "        max_n: largest n-gram order to count (defaults to 5).\n",
      "\n",
      "    Returns:\n",
      "        dict mapping each n in 1..max_n to a Counter keyed by n-gram tuples.\n",
      "    \"\"\"\n",
      "    def to_unigrams(text):\n",
      "        return re.findall('[a-z]+', text.lower())\n",
      "\n",
      "    def to_ngrams(unigrams, length):\n",
      "        # zip() stops at the shortest shifted slice, so a paragraph with\n",
      "        # fewer than `length` words contributes no n-grams.\n",
      "        return zip(*[unigrams[i:] for i in range(length)])\n",
      "\n",
      "    # Materialise as a list: it is re-read once per n, and a lazy map()\n",
      "    # or generator would be exhausted after the first pass.\n",
      "    unigrams_paras = [to_unigrams(para) for para in big_text_paras]\n",
      "    ngram_counts = {}\n",
      "    for n in range(1, max_n + 1):\n",
      "        ngram_counts[n] = Counter(chain.from_iterable(\n",
      "            to_ngrams(unigrams, n) for unigrams in unigrams_paras))\n",
      "    return ngram_counts"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 281
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%time ngram_counts_paras = count_ngrams(big_text_paras)  # run time: 1.45 sec"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 1.42 s, sys: 84.3 ms, total: 1.51 s\n",
        "Wall time: 1.45 s\n"
       ]
      }
     ],
     "prompt_number": 282
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Print the five most frequent n-grams of each order\n",
      "\n",
      "print 'splited by paragraph  '\n",
      "for n in sorted(ngram_counts_paras):  # sorted: do not depend on dict iteration order\n",
      "    top_ngrams = ngram_counts_paras[n].most_common(5)\n",
      "    print '  {}gram: '.format(n),  # trailing comma keeps the list on the same line\n",
      "    print ', '.join('\"{}\": {}'.format(' '.join(ngram), count)\n",
      "                    for ngram, count in top_ngrams)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "splited by paragraph  \n",
        "  1gram:  \"the\": 80030, \"of\": 40025, \"and\": 38313, \"to\": 28766, \"in\": 22050\n",
        "  2gram:  "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\"of the\": 12561, \"in the\": 6543, \"to the\": 4466, \"and the\": 3226, \"on the\": 2519\n",
        "  3gram:  "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\"the united states\": 460, \"one of the\": 380, \"out of the\": 253, \"of the united\": 241, \"he did not\": 239\n",
        "  4gram:  "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\"of the united states\": 239, \"at the same time\": 128, \"as a result of\": 104, \"the commander in chief\": 103, \"for a long time\": 91\n",
        "  5gram:  "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\"history of the united states\": 60, \"in the region of the\": 40, \"in the middle of the\": 38, \"project gutenberg literary archive foundation\": 36, \"the project gutenberg literary archive\": 32\n"
       ]
      }
     ],
     "prompt_number": 127
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Push this notebook to the existing gist (shell command, not Python)\n",
      "!gist \"nlp lab1.ipynb\" --update 55ccc3532b5c5b609dde"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "https://gist.github.com/55ccc3532b5c5b609dde\r\n"
       ]
      }
     ],
     "prompt_number": 119
    }
   ],
   "metadata": {}
  }
 ]
 }
	{
	"metadata": {
	"name": "",
	"signature": "sha256:d7a1109bb79e4282b03b19220e9ff24f508de499b4b479c97bf4bdad1c99a173"
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"from itertools import chain, imap, islice, izip\n",
	"from collections import Counter\n",
	"import re\n",
	"from functools import partial"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 98
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# big_text = open('big.txt').read()  # alternative: keep the text unsplit\n",
	"# Split the corpus into paragraphs (separated by a blank line).\n",
	"# The with-block closes the file as soon as it has been read.\n",
	"with open('big.txt') as big_file:\n",
	"    big_text_paras = big_file.read().split('\\n\\n')\n",
	"# big_text_lines = open('big.txt').readlines()  # alternative: split by line"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 240
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# Count 1- to 5-gram occurrences (5 is the default upper bound)\n",
	"def count_ngrams(big_text_paras, max_n=5):\n",
	"    \"\"\"Count word n-grams of every order from 1 to max_n.\n",
	"\n",
	"    Each paragraph is lower-cased and tokenised into runs of the letters\n",
	"    a-z; n-grams are built within one paragraph at a time, so they never\n",
	"    span a paragraph boundary.\n",
	"\n",
	"    Args:\n",
	"        big_text_paras: iterable of paragraph strings.\n",
	"        max_n: largest n-gram order to count (defaults to 5).\n",
	"\n",
	"    Returns:\n",
	"        dict mapping each n in 1..max_n to a Counter keyed by n-gram tuples.\n",
	"    \"\"\"\n",
	"    def to_unigrams(text):\n",
	"        return re.findall('[a-z]+', text.lower())\n",
	"\n",
	"    def to_ngrams(unigrams, length):\n",
	"        # zip() stops at the shortest shifted slice, so a paragraph with\n",
	"        # fewer than `length` words contributes no n-grams.\n",
	"        return zip(*[unigrams[i:] for i in range(length)])\n",
	"\n",
	"    # Materialise as a list: it is re-read once per n, and a lazy map()\n",
	"    # or generator would be exhausted after the first pass.\n",
	"    unigrams_paras = [to_unigrams(para) for para in big_text_paras]\n",
	"    ngram_counts = {}\n",
	"    for n in range(1, max_n + 1):\n",
	"        ngram_counts[n] = Counter(chain.from_iterable(\n",
	"            to_ngrams(unigrams, n) for unigrams in unigrams_paras))\n",
	"    return ngram_counts"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 281
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"%time ngram_counts_paras = count_ngrams(big_text_paras)  # run time: 1.45 sec"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"CPU times: user 1.42 s, sys: 84.3 ms, total: 1.51 s\n",
	"Wall time: 1.45 s\n"
	]
	}
	],
	"prompt_number": 282
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# Print the five most frequent n-grams of each order\n",
	"\n",
	"print 'splited by paragraph '\n",
	"for n in sorted(ngram_counts_paras):  # sorted: do not depend on dict iteration order\n",
	"    top_ngrams = ngram_counts_paras[n].most_common(5)\n",
	"    print ' {}gram: '.format(n),  # trailing comma keeps the list on the same line\n",
	"    print ', '.join('\"{}\": {}'.format(' '.join(ngram), count)\n",
	"                    for ngram, count in top_ngrams)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"splited by paragraph \n",
	" 1gram: \"the\": 80030, \"of\": 40025, \"and\": 38313, \"to\": 28766, \"in\": 22050\n",
	" 2gram: "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\"of the\": 12561, \"in the\": 6543, \"to the\": 4466, \"and the\": 3226, \"on the\": 2519\n",
	" 3gram: "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\"the united states\": 460, \"one of the\": 380, \"out of the\": 253, \"of the united\": 241, \"he did not\": 239\n",
	" 4gram: "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\"of the united states\": 239, \"at the same time\": 128, \"as a result of\": 104, \"the commander in chief\": 103, \"for a long time\": 91\n",
	" 5gram: "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\"history of the united states\": 60, \"in the region of the\": 40, \"in the middle of the\": 38, \"project gutenberg literary archive foundation\": 36, \"the project gutenberg literary archive\": 32\n"
	]
	}
	],
	"prompt_number": 127
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# Push this notebook to the existing gist (shell command, not Python)\n",
	"!gist \"nlp lab1.ipynb\" --update 55ccc3532b5c5b609dde"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"https://gist.github.com/55ccc3532b5c5b609dde\r\n"
	]
	}
	],
	"prompt_number": 119
	}
	],
	"metadata": {}
	}
	]
	}