Skip to content

Instantly share code, notes, and snippets.

@d2207197
Last active August 29, 2015 14:06
Show Gist options
  • Save d2207197/55ccc3532b5c5b609dde to your computer and use it in GitHub Desktop.
Save d2207197/55ccc3532b5c5b609dde to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:d7a1109bb79e4282b03b19220e9ff24f508de499b4b479c97bf4bdad1c99a173"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from itertools import chain, imap, islice, izip\n",
"from collections import Counter\n",
"import re\n",
"from functools import partial"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 98
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# big_text = open('big.txt').read()  # alternative: no splitting\n",
"# Read the corpus and split it into paragraphs on blank lines ('\\n\\n').\n",
"# The with-block closes the file handle as soon as the text has been read.\n",
"with open('big.txt') as big_file:\n",
"    big_text_paras = big_file.read().split('\\n\\n')\n",
"# big_text_lines = open('big.txt').readlines()  # alternative: split by line"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 240
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Count occurrences of every 1-gram through 5-gram (the default range).\n",
"def count_ngrams(big_text_paras, max_n=5):\n",
"    \"\"\"Count word n-grams of length 1..max_n over a sequence of text chunks.\n",
"\n",
"    N-grams are built inside each chunk separately, so none of them spans\n",
"    the boundary between two chunks.\n",
"\n",
"    Args:\n",
"        big_text_paras: iterable of strings, e.g. the paragraphs of a corpus.\n",
"        max_n: longest n-gram length to count; defaults to 5.\n",
"\n",
"    Returns:\n",
"        dict mapping each n in 1..max_n to a Counter whose keys are n-tuples\n",
"        of lowercased words and whose values are occurrence counts.\n",
"    \"\"\"\n",
"    def to_unigrams(text):\n",
"        # A word is a maximal run of the letters a-z after lowercasing.\n",
"        return re.findall('[a-z]+', text.lower())\n",
"\n",
"    def to_ngrams(unigrams, length):\n",
"        # Zip `length` progressively shifted slices to get each window; zip\n",
"        # stops at the shortest slice, so short chunks yield no n-grams.\n",
"        return zip(*[unigrams[i:] for i in range(length)])\n",
"\n",
"    # Materialize a list: the token lists are re-read once per n-gram length,\n",
"    # which a one-shot iterator (a lazy map) could not support.\n",
"    unigrams_paras = [to_unigrams(para) for para in big_text_paras]\n",
"    ngram_counts = {}\n",
"    for n in range(1, max_n + 1):\n",
"        ngram_counts[n] = Counter(chain.from_iterable(\n",
"            to_ngrams(unigrams, n) for unigrams in unigrams_paras))\n",
"    return ngram_counts"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 281
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%time ngram_counts_paras = count_ngrams(big_text_paras) # run time: 1.45 sec"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"CPU times: user 1.42 s, sys: 84.3 ms, total: 1.51 s\n",
"Wall time: 1.45 s\n"
]
}
],
"prompt_number": 282
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Print the five most common n-grams for each n.\n",
"\n",
"print 'splited by paragraph '\n",
"# Iterate in sorted order: plain dict iteration order is not guaranteed,\n",
"# so the rows could otherwise come out in any order.\n",
"for n in sorted(ngram_counts_paras):\n",
"    # The trailing comma keeps the label and its n-grams on one line.\n",
"    print ' {}gram: '.format(n),\n",
"    print ', '.join(['\"{}\": {}'.format(\n",
"        ' '.join(ngram), count)\n",
"        for ngram, count in ngram_counts_paras[n].most_common(5) ])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"splited by paragraph \n",
" 1gram: \"the\": 80030, \"of\": 40025, \"and\": 38313, \"to\": 28766, \"in\": 22050\n",
" 2gram: "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\"of the\": 12561, \"in the\": 6543, \"to the\": 4466, \"and the\": 3226, \"on the\": 2519\n",
" 3gram: "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\"the united states\": 460, \"one of the\": 380, \"out of the\": 253, \"of the united\": 241, \"he did not\": 239\n",
" 4gram: "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\"of the united states\": 239, \"at the same time\": 128, \"as a result of\": 104, \"the commander in chief\": 103, \"for a long time\": 91\n",
" 5gram: "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\"history of the united states\": 60, \"in the region of the\": 40, \"in the middle of the\": 38, \"project gutenberg literary archive foundation\": 36, \"the project gutenberg literary archive\": 32\n"
]
}
],
"prompt_number": 127
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Shell out to the `gist` command to update gist 55ccc3532b5c5b609dde with the\n",
"# file 'nlp lab1.ipynb' (a publishing step, not part of the n-gram analysis).\n",
"!gist nlp\\ lab1.ipynb --update 55ccc3532b5c5b609dde"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"https://gist.github.com/55ccc3532b5c5b609dde\r\n"
]
}
],
"prompt_number": 119
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment