Created
June 23, 2016 21:18
-
-
Save bhtucker/927fc24e3f48c0224b90b2a72fb30b72 to your computer and use it in GitHub Desktop.
PySpark Notebook for Spark Implementation of https://gist.github.com/bhtucker/5dccc1fa96d8030a752cb9f76cbaf558
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<pyspark.context.SparkContext at 0x10dc61690>" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sc" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"ok\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"ok\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"keyed_rdd = sc.parallelize([\n", | |
" ('a', 2),\n", | |
" ('a', 2),\n", | |
" ('a', 2),\n", | |
" ('b', 2), \n", | |
" ('b', 2)\n", | |
" ])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('a', [2, 2, 2]), ('b', [2, 2])]" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"keyed_rdd.reduceByKey(lambda a, b: a + b).collect()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"keyed_rdd = sc.parallelize([\n", | |
" ('a', [2]),\n", | |
" ('a', [2]),\n", | |
" ('a', [2]),\n", | |
" ('b', [2]), \n", | |
" ('b', [2])\n", | |
" ])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"10" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"reduce(lambda a, b: a + b, range(5))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"10" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sum(range(5))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"('a', [2], 'a', [2], 'a', [2], 'b', [2], 'b', [2])" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"keyed_rdd.reduce(lambda a, b: a + b)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"phrase_rdd = sc.parallelize([\n", | |
" 'one whole sentence',\n", | |
" 'another sentence entirely'\n", | |
" ])\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"2" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"phrase_rdd.count()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[['one', 'whole', 'sentence'], ['another', 'sentence', 'entirely']]" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"phrase_rdd.map(lambda v: v.split()).count()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[3, 3]" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"phrase_rdd.map(lambda v: v.split()).map(len).collect()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['one', 'whole', 'sentence', 'another', 'sentence', 'entirely']" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"phrase_rdd.flatMap(lambda v: v.split()).collect()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"6" | |
] | |
}, | |
"execution_count": 25, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"phrase_rdd.flatMap(lambda v: v.split()).count()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('whole', 1), ('entirely', 1), ('another', 1), ('one', 1), ('sentence', 2)]" | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"word_rdd = phrase_rdd.flatMap(lambda v: v.split())\n", | |
"\n", | |
"word_keyed_rdd = word_rdd.map(lambda v: (v, 1))\n", | |
"\n", | |
"word_keyed_rdd.reduceByKey(lambda a, b: a + b).collect()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"defaultdict(int,\n", | |
" {'another': 1, 'entirely': 1, 'one': 1, 'sentence': 2, 'whole': 1})" | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"word_rdd.countByValue()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"puzzle_rdd = sc.parallelize(\n", | |
" [(0, 0)]\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import operator\n", | |
"\n", | |
"def map_to_children(row, delta_values=[3, 5], delta_ops=[operator.add, operator.sub], current_level=None):\n", | |
" value, level = row\n", | |
" if current_level is not None and level != current_level:\n", | |
" return [row]\n", | |
" return [\n", | |
" (op(value, delta), level + 1)\n", | |
" for op in delta_ops\n", | |
" for delta in delta_values\n", | |
" if 0 <= op(value, delta) <= 8\n", | |
" ]\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(1, 3), (1, 5)]" | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"map_to_children((0, 0))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 63, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[(0, 0), (3, 1), (5, 1)]\n", | |
"[(0, 0), (2, 2), (3, 1), (5, 1), (6, 2), (8, 2)]\n", | |
"[(0, 0), (1, 3), (2, 2), (3, 1), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
"[(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
"[(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
"[(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n" | |
] | |
} | |
], | |
"source": [ | |
"from functools import partial\n", | |
"\n", | |
"puzzle_rdd = sc.parallelize(\n", | |
" [(0, 0)]\n", | |
")\n", | |
"\n", | |
"prior_count = -10\n", | |
"next_count = -5\n", | |
"level = 0\n", | |
"\n", | |
"while prior_count != next_count:\n", | |
" # propagate level and collect dupes\n", | |
" prior_level = puzzle_rdd\n", | |
" prior_count = prior_level.count()\n", | |
" next_level = puzzle_rdd.flatMap(lambda v: map_to_children(v, current_level=level)).cache()\n", | |
" next_count = next_level.count()\n", | |
" puzzle_rdd = prior_level.union(next_level)\n", | |
" puzzle_rdd = puzzle_rdd.reduceByKey(lambda a, b: min(a, b))\n", | |
" level += 1\n", | |
" print(puzzle_rdd.collect())\n", | |
"\n", | |
"# .flatMap(map_to_children).flatMap(map_to_children).flatMap(map_to_children).collect()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(1, 1)" | |
] | |
}, | |
"execution_count": 61, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"prior_count, next_count" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[[(1, 3), (1, 5)]]" | |
] | |
}, | |
"execution_count": 33, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Row reduction via only getting children for 'frontier' level:\n", | |
"\n", | |
"# 3\n", | |
"# [(0, 0), (3, 1), (5, 1)]\n", | |
"# 10\n", | |
"# [(0, 0), (2, 2), (3, 1), (5, 1), (6, 2), (8, 2)]\n", | |
"# 15\n", | |
"# [(0, 0), (1, 3), (2, 2), (3, 1), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
"# 18\n", | |
"# [(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
"# 19\n", | |
"# [(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
"\n", | |
"\n", | |
"# 3\n", | |
"# [(0, 0), (3, 1), (5, 1)]\n", | |
"# 11\n", | |
"# [(0, 0), (2, 2), (3, 1), (5, 1), (6, 2), (8, 2)]\n", | |
"# 20\n", | |
"# [(0, 0), (1, 3), (2, 2), (3, 1), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
"# 26\n", | |
"# [(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
"# 29\n", | |
"# [(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from functools import partial\n", | |
"def addx(a, x=4):\n", | |
" return a + x\n", | |
"\n", | |
"pseudopartial = lambda v: addx(v, x=8)\n", | |
"aa = pseudopartial(2)\n", | |
"realpartial = partial(addx, x=8)\n", | |
"bb = realpartial(2)\n", | |
"assert aa == bb" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment