Created
February 18, 2016 01:56
-
-
Save saptak/fac53e0aa2171e0d81a5 to your computer and use it in GitHub Desktop.
Week 4 solution
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u''"
            ]
          },
          "execution_count": 1,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "flightRdd=sc.textFile(\"/tmp/flights.csv\") \\\n",
        ".map(lambda line: line.split(\",\"))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u''"
            ]
          },
          "execution_count": 2,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "carrierRdd = flightRdd.map(lambda line: (line[5],1))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u\"[(u'WN', 1)]\""
            ]
          },
          "execution_count": 3,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "carrierRdd.take(1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u''"
            ]
          },
          "execution_count": 4,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "cReducedRdd = carrierRdd.reduceByKey(lambda a,b: a+b)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u''"
            ]
          },
          "execution_count": 5,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "carriersSorted = cReducedRdd.map(lambda (a,b): (b,a)) \\\n",
        ".sortByKey(ascending=False)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u\"[(11807, u'WN'), (5819, u'AA'), (5550, u'OO')]\""
            ]
          },
          "execution_count": 6,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "carriersSorted.take(3)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u''"
            ]
          },
          "execution_count": 7,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "airportsRdd = sc.textFile(\"/tmp/airports.csv\") \\\n",
        ".map(lambda line: line.split(\",\"))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 18,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u''"
            ]
          },
          "execution_count": 18,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "cityRdd = airportsRdd.map(lambda line: (line[0].strip('\"'), line[2].strip('\"')))\n",
        "flightOrigDestRdd = flightRdd \\\n",
        ".map(lambda line: (line[12], line[13]))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 19,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u\"[(u'iata', u'city'), (u'ZZV', u'Zanesville'), (u'ZUN', u'Zuni')]\""
            ]
          },
          "execution_count": 19,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "cityRdd.top(3)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 20,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u''"
            ]
          },
          "execution_count": 20,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "origJoinRdd = flightOrigDestRdd.join(cityRdd)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 21,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u''"
            ]
          },
          "execution_count": 21,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "destAndOrigJoinRdd = origJoinRdd \\\n",
        ".map(lambda (a,b): (b[0],b[1])).join(cityRdd)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 22,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u''"
            ]
          },
          "execution_count": 22,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "citiesCleanRdd = destAndOrigJoinRdd.values()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 23,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u''"
            ]
          },
          "execution_count": 23,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "citiesReducedRdd = citiesCleanRdd \\\n",
        ".map(lambda line: (line,1)).reduceByKey(lambda a,b: a+b)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 24,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u\"[(164, (u'New York', u'Boston')), (162, (u'Boston', u'New York')), (150, (u'New York', u'Arlington')), (140, (u'Los Angeles', u'San Diego')), (137, (u'Los Angeles', u'San Francisco'))]\""
            ]
          },
          "execution_count": 24,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "citiesReducedRdd.map(lambda (a,b): \\\n",
        "(b,a)).sortByKey(ascending=False).take(5)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 14,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u\"[(u'AA', 953), (u'OO', 499), (u'DL', 455), (u'CO', 759), (u'UA', 960), (u'9E', 335), (u'AS', 418), (u'US', 757), (u'AQ', 300), (u'B6', 422)]\""
            ]
          },
          "execution_count": 14,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "flightRdd.filter(lambda line: int(line[11]) > 15) \\\n",
        ".map(lambda line: (line[5], line[11])) \\\n",
        ".reduceByKey(lambda a,b: max(int(a),int(b))).take(10)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 25,
      "metadata": {
        "collapsed": false
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "u\"[(950, u'A320-232'), (747, u'737-7H4')]\""
            ]
          },
          "execution_count": 25,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "airplanesRdd = sc.textFile(\"/tmp/plane-data.csv\") \\\n",
        ".map(lambda line: line.split(\",\")) \\\n",
        ".filter(lambda line:len(line) == 9)\n",
        "flight15Rdd = flightRdd \\\n",
        ".filter(lambda line: int(line[14]) > 1500) \\\n",
        ".map(lambda line: (line[7],1))\n",
        "tailModelRdd = airplanesRdd \\\n",
        ".map(lambda line: (line[0],line[4]))\n",
        "flight15Rdd.join(tailModelRdd) \\\n",
        ".map(lambda (a,b): (b[1],b[0])) \\\n",
        ".reduceByKey(lambda a,b: a+b) \\\n",
        ".map(lambda (a,b): (b,a)).sortByKey(ascending=False).take(2)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "PySpark",
      "language": "",
      "name": "pysparkkernel"
    },
    "language_info": {
      "mimetype": "text/x-python",
      "name": "pyspark"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment