Skip to content

Instantly share code, notes, and snippets.

@zzl0
Created August 7, 2015 07:10
Show Gist options
  • Save zzl0/0e65f312de93add38f21 to your computer and use it in GitHub Desktop.
Save zzl0/0e65f312de93add38f21 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:7aa6fc033293c44fb5fb0452f25dc363bab07b6d7e21558f99cae7310b5b21d0"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## \u521b\u5efa DparkContext\n",
"\n",
"\u8fd9\u91cc\u7684\u8fd9\u4e9b\u53c2\u6570\u5927\u5bb6\u4e0d\u7528\u5173\u5fc3\uff0c\u8fd9\u4e9b\u662f IPython Notebook \u542f\u52a8\u7684\u53c2\u6570\u3002"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from dpark import optParser\n",
"\n",
"optParser.add_option('-f')\n",
"optParser.add_option('--IPKernelApp.parent_appname')\n",
"optParser.add_option('--profile-dir')\n",
"optParser.add_option('--parent')\n",
"optParser.add_option('--pylab')\n",
"\n",
"from dpark import DparkContext\n",
"dc = DparkContext(\"mesos\")\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Demo -- Word Count"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"path = '/home2/zhuzhaolong/testdata/shakespeare.txt'\n",
"rdd = dc.textFile(path, splitSize=2<<20)\n",
"import os\n",
"os.path.getsize(path)\n",
"rdd.take(10)\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 12,
"text": [
"[\"A MIDSUMMER-NIGHT'S DREAM\",\n",
" '',\n",
" 'Now , fair Hippolyta , our nuptial hour ',\n",
" 'Draws on apace : four happy days bring in ',\n",
" 'Another moon ; but O ! methinks how slow ',\n",
" 'This old moon wanes ; she lingers my desires ,',\n",
" 'Like to a step dame , or a dowager ',\n",
" \"Long withering out a young man's revenue .\",\n",
" '',\n",
" 'Four days will quickly steep themselves in night ;']"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"rdd.flatMap(lambda x: x.split()).take(10)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 13,
"text": [
"['A',\n",
" \"MIDSUMMER-NIGHT'S\",\n",
" 'DREAM',\n",
" 'Now',\n",
" ',',\n",
" 'fair',\n",
" 'Hippolyta',\n",
" ',',\n",
" 'our',\n",
" 'nuptial']"
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"wc = (rdd.flatMap(lambda x: x.split())\n",
" .map(lambda x: (x, 1))\n",
" .reduceByKey(lambda x, y: x + y)\n",
" .top(10, key=lambda x: x[1]))\n",
"print wc"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-08-07 14:17:22,021 [INFO] [dpark.schedule] Got job 1 with 3 tasks: <MappedRDD <FlatMappedRDD <TextFileRDD /home2/zhuzhaolong/testdata/shakespeare.txt>>>\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-08-07 14:17:24,872 [INFO] [dpark.job] Job 1 finished in 2.9s: min=0.3s, avg=1.3s, max=2.1s, maxtry=1\n",
"2015-08-07 14:17:24,872 [INFO] [dpark.job] read 2.4MB (0% localized)\n",
"2015-08-07 14:17:24,962 [INFO] [dpark.schedule] Got job 2 with 3 tasks: <ShuffledRDD <MappedRDD <FlatMappedRDD <TextFileRDD /home2/zhuzhaolong/testdata/shakespeare.txt>>>>\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[(',', 81827), ('.', 36514), ('the', 23272), ('I', 20041), (';', 17274), ('and', 16817), ('to', 15506), ('of', 15037), ('you', 12361), ('a', 12155)]\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"[khand1] 2015-08-07 14:17:26,060 [WARNING] [dpark.executor@khand1] cwd (/home/zhuzhaolong) not exists\n",
"2015-08-07 14:17:26,442 [INFO] [dpark.job] Job 2 finished in 1.5s: min=0.0s, avg=0.1s, max=0.1s, maxtry=1\n",
"2015-08-07 14:17:26,442 [INFO] [dpark.job] read 2.1MB (0% localized)\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## API"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dir(rdd)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 11,
"text": [
"['DEFAULT_SPLIT_SIZE',\n",
" '__class__',\n",
" '__delattr__',\n",
" '__dict__',\n",
" '__doc__',\n",
" '__format__',\n",
" '__getattribute__',\n",
" '__getslice__',\n",
" '__getstate__',\n",
" '__hash__',\n",
" '__init__',\n",
" '__iter__',\n",
" '__len__',\n",
" '__module__',\n",
" '__new__',\n",
" '__reduce__',\n",
" '__reduce_ex__',\n",
" '__repr__',\n",
" '__setattr__',\n",
" '__sizeof__',\n",
" '__str__',\n",
" '__subclasshook__',\n",
" '__weakref__',\n",
" '_join',\n",
" '_partitioner',\n",
" '_preferredLocations',\n",
" '_splits',\n",
" 'adcount',\n",
" 'adcountByKey',\n",
" 'aggregate',\n",
" 'aggregator',\n",
" 'asTable',\n",
" 'batch',\n",
" 'cache',\n",
" 'cartesian',\n",
" 'cogroup',\n",
" 'collect',\n",
" 'collectAsMap',\n",
" 'combineByKey',\n",
" 'compute',\n",
" 'count',\n",
" 'ctx',\n",
" 'dependencies',\n",
" 'enumerate',\n",
" 'enumeratePartition',\n",
" 'err',\n",
" 'fileinfo',\n",
" 'filter',\n",
" 'first',\n",
" 'flatMap',\n",
" 'flatMapValue',\n",
" 'fold',\n",
" 'foreach',\n",
" 'foreachPartition',\n",
" 'fromCsv',\n",
" 'glom',\n",
" 'groupBy',\n",
" 'groupByKey',\n",
" 'groupWith',\n",
" 'hot',\n",
" 'id',\n",
" 'innerJoin',\n",
" 'iterator',\n",
" 'join',\n",
" 'leftOuterJoin',\n",
" 'len',\n",
" 'lookup',\n",
" 'map',\n",
" 'mapPartition',\n",
" 'mapPartitions',\n",
" 'mapValue',\n",
" 'mem',\n",
" 'mergeSplit',\n",
" 'newId',\n",
" 'nextId',\n",
" 'open_file',\n",
" 'outerJoin',\n",
" 'partitionByKey',\n",
" 'partitioner',\n",
" 'path',\n",
" 'pipe',\n",
" 'preferredLocations',\n",
" 'read',\n",
" 'reduce',\n",
" 'reduceByKey',\n",
" 'reduceByKeyToDriver',\n",
" 'rightOuterJoin',\n",
" 'sample',\n",
" 'saveAsBeansdb',\n",
" 'saveAsBinaryFile',\n",
" 'saveAsCSVFile',\n",
" 'saveAsTableFile',\n",
" 'saveAsTabular',\n",
" 'saveAsTextFile',\n",
" 'saveAsTextFileByKey',\n",
" 'shouldCache',\n",
" 'size',\n",
" 'snapshot',\n",
" 'snapshot_path',\n",
" 'sort',\n",
" 'splitSize',\n",
" 'splits',\n",
" 'take',\n",
" 'toList',\n",
" 'top',\n",
" 'union',\n",
" 'uniq',\n",
" 'update',\n",
" 'zipWith']"
]
}
],
"prompt_number": 11
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment