Created
August 7, 2015 07:10
-
-
Save zzl0/0e65f312de93add38f21 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:7aa6fc033293c44fb5fb0452f25dc363bab07b6d7e21558f99cae7310b5b21d0" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## \u521b\u5efa DparkContext\n", | |
"\n", | |
"\u8fd9\u91cc\u7684\u8fd9\u4e9b\u53c2\u6570\u5927\u5bb6\u4e0d\u7528\u5173\u5fc3\uff0c\u8fd9\u4e9b\u662f IPython Notebook \u542f\u52a8\u7684\u53c2\u6570\u3002" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from dpark import DparkContext, optParser\n", | |
"\n", | |
"# Flags the IPython Notebook kernel is launched with. They are registered on\n", | |
"# dpark's optParser, presumably so its argument parsing accepts them (TODO confirm).\n", | |
"for kernel_flag in ('-f',\n", | |
"                    '--IPKernelApp.parent_appname',\n", | |
"                    '--profile-dir',\n", | |
"                    '--parent',\n", | |
"                    '--pylab'):\n", | |
"    optParser.add_option(kernel_flag)\n", | |
"\n", | |
"dc = DparkContext(\"mesos\")\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Demo -- Word Count" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import os\n", | |
"\n", | |
"path = '/home2/zhuzhaolong/testdata/shakespeare.txt'\n", | |
"# 2 << 20 == 2097152 (2 * 1024 * 1024); presumably a split size in bytes --\n", | |
"# TODO confirm against the dpark textFile documentation.\n", | |
"rdd = dc.textFile(path, splitSize=2 << 20)\n", | |
"\n", | |
"# Only a cell's last expression is echoed, so the file size is printed\n", | |
"# explicitly; a bare os.path.getsize(path) here would be silently discarded.\n", | |
"print(os.path.getsize(path))\n", | |
"rdd.take(10)\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 12, | |
"text": [ | |
"[\"A MIDSUMMER-NIGHT'S DREAM\",\n", | |
" '',\n", | |
" 'Now , fair Hippolyta , our nuptial hour ',\n", | |
" 'Draws on apace : four happy days bring in ',\n", | |
" 'Another moon ; but O ! methinks how slow ',\n", | |
" 'This old moon wanes ; she lingers my desires ,',\n", | |
" 'Like to a step dame , or a dowager ',\n", | |
" \"Long withering out a young man's revenue .\",\n", | |
" '',\n", | |
" 'Four days will quickly steep themselves in night ;']" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Break each line into whitespace-separated tokens and preview the first ten.\n", | |
"rdd.flatMap(lambda line: line.split()).take(10)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 13, | |
"text": [ | |
"['A',\n", | |
" \"MIDSUMMER-NIGHT'S\",\n", | |
" 'DREAM',\n", | |
" 'Now',\n", | |
" ',',\n", | |
" 'fair',\n", | |
" 'Hippolyta',\n", | |
" ',',\n", | |
" 'our',\n", | |
" 'nuptial']" | |
] | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Pair every whitespace-separated token with 1, sum the ones per token, and\n", | |
"# keep the ten tokens with the highest totals.\n", | |
"wc = (rdd.flatMap(lambda line: line.split())\n", | |
"         .map(lambda token: (token, 1))\n", | |
"         .reduceByKey(lambda left, right: left + right)\n", | |
"         .top(10, key=lambda pair: pair[1]))\n", | |
"# The parenthesised form prints the same text on Python 2 and also parses on Python 3.\n", | |
"print(wc)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stderr", | |
"text": [ | |
"2015-08-07 14:17:22,021 [INFO] [dpark.schedule] Got job 1 with 3 tasks: <MappedRDD <FlatMappedRDD <TextFileRDD /home2/zhuzhaolong/testdata/shakespeare.txt>>>\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stderr", | |
"text": [ | |
"2015-08-07 14:17:24,872 [INFO] [dpark.job] Job 1 finished in 2.9s: min=0.3s, avg=1.3s, max=2.1s, maxtry=1\n", | |
"2015-08-07 14:17:24,872 [INFO] [dpark.job] read 2.4MB (0% localized)\n", | |
"2015-08-07 14:17:24,962 [INFO] [dpark.schedule] Got job 2 with 3 tasks: <ShuffledRDD <MappedRDD <FlatMappedRDD <TextFileRDD /home2/zhuzhaolong/testdata/shakespeare.txt>>>>\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[(',', 81827), ('.', 36514), ('the', 23272), ('I', 20041), (';', 17274), ('and', 16817), ('to', 15506), ('of', 15037), ('you', 12361), ('a', 12155)]\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stderr", | |
"text": [ | |
"[khand1] 2015-08-07 14:17:26,060 [WARNING] [dpark.executor@khand1] cwd (/home/zhuzhaolong) not exists\n", | |
"2015-08-07 14:17:26,442 [INFO] [dpark.job] Job 2 finished in 1.5s: min=0.0s, avg=0.1s, max=0.1s, maxtry=1\n", | |
"2015-08-07 14:17:26,442 [INFO] [dpark.job] read 2.1MB (0% localized)\n" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## API" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Enumerate every attribute name reachable on the RDD, dunder and private ones included.\n", | |
"dir(rdd)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 11, | |
"text": [ | |
"['DEFAULT_SPLIT_SIZE',\n", | |
" '__class__',\n", | |
" '__delattr__',\n", | |
" '__dict__',\n", | |
" '__doc__',\n", | |
" '__format__',\n", | |
" '__getattribute__',\n", | |
" '__getslice__',\n", | |
" '__getstate__',\n", | |
" '__hash__',\n", | |
" '__init__',\n", | |
" '__iter__',\n", | |
" '__len__',\n", | |
" '__module__',\n", | |
" '__new__',\n", | |
" '__reduce__',\n", | |
" '__reduce_ex__',\n", | |
" '__repr__',\n", | |
" '__setattr__',\n", | |
" '__sizeof__',\n", | |
" '__str__',\n", | |
" '__subclasshook__',\n", | |
" '__weakref__',\n", | |
" '_join',\n", | |
" '_partitioner',\n", | |
" '_preferredLocations',\n", | |
" '_splits',\n", | |
" 'adcount',\n", | |
" 'adcountByKey',\n", | |
" 'aggregate',\n", | |
" 'aggregator',\n", | |
" 'asTable',\n", | |
" 'batch',\n", | |
" 'cache',\n", | |
" 'cartesian',\n", | |
" 'cogroup',\n", | |
" 'collect',\n", | |
" 'collectAsMap',\n", | |
" 'combineByKey',\n", | |
" 'compute',\n", | |
" 'count',\n", | |
" 'ctx',\n", | |
" 'dependencies',\n", | |
" 'enumerate',\n", | |
" 'enumeratePartition',\n", | |
" 'err',\n", | |
" 'fileinfo',\n", | |
" 'filter',\n", | |
" 'first',\n", | |
" 'flatMap',\n", | |
" 'flatMapValue',\n", | |
" 'fold',\n", | |
" 'foreach',\n", | |
" 'foreachPartition',\n", | |
" 'fromCsv',\n", | |
" 'glom',\n", | |
" 'groupBy',\n", | |
" 'groupByKey',\n", | |
" 'groupWith',\n", | |
" 'hot',\n", | |
" 'id',\n", | |
" 'innerJoin',\n", | |
" 'iterator',\n", | |
" 'join',\n", | |
" 'leftOuterJoin',\n", | |
" 'len',\n", | |
" 'lookup',\n", | |
" 'map',\n", | |
" 'mapPartition',\n", | |
" 'mapPartitions',\n", | |
" 'mapValue',\n", | |
" 'mem',\n", | |
" 'mergeSplit',\n", | |
" 'newId',\n", | |
" 'nextId',\n", | |
" 'open_file',\n", | |
" 'outerJoin',\n", | |
" 'partitionByKey',\n", | |
" 'partitioner',\n", | |
" 'path',\n", | |
" 'pipe',\n", | |
" 'preferredLocations',\n", | |
" 'read',\n", | |
" 'reduce',\n", | |
" 'reduceByKey',\n", | |
" 'reduceByKeyToDriver',\n", | |
" 'rightOuterJoin',\n", | |
" 'sample',\n", | |
" 'saveAsBeansdb',\n", | |
" 'saveAsBinaryFile',\n", | |
" 'saveAsCSVFile',\n", | |
" 'saveAsTableFile',\n", | |
" 'saveAsTabular',\n", | |
" 'saveAsTextFile',\n", | |
" 'saveAsTextFileByKey',\n", | |
" 'shouldCache',\n", | |
" 'size',\n", | |
" 'snapshot',\n", | |
" 'snapshot_path',\n", | |
" 'sort',\n", | |
" 'splitSize',\n", | |
" 'splits',\n", | |
" 'take',\n", | |
" 'toList',\n", | |
" 'top',\n", | |
" 'union',\n", | |
" 'uniq',\n", | |
" 'update',\n", | |
" 'zipWith']" | |
] | |
} | |
], | |
"prompt_number": 11 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment