Created
August 28, 2015 04:23
-
-
Save colltoaction/e62bdefb4d3a2ff8d85d to your computer and use it in GitHub Desktop.
Generating location means by district for kaggle.com/c/sf-crime using pyspark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Generating the means by district" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We start by setting up the environment so it uses Python 2" | |
] | |
}, | |
{
 "cell_type": "code",
 "execution_count": 1,
 "metadata": {
  "collapsed": true
 },
 "outputs": [],
 "source": [
  "import os\n",
  "\n",
  "# Make every PySpark worker process use the Python 2 interpreter.\n",
  "os.environ.update({'PYSPARK_PYTHON': 'python2'})"
 ]
},
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We create the spark context and sql context" | |
] | |
}, | |
{
 "cell_type": "code",
 "execution_count": 2,
 "metadata": {
  "collapsed": false
 },
 "outputs": [],
 "source": [
  "import pyspark\n",
  "# SparkContext is exported by the top-level pyspark package, but\n",
  "# HiveContext is defined in pyspark.sql -- importing it from pyspark\n",
  "# raises ImportError.\n",
  "from pyspark import SparkContext\n",
  "from pyspark.sql import HiveContext\n",
  "\n",
  "# 'local[*]' runs Spark locally with one worker thread per core.\n",
  "sc = SparkContext('local[*]')\n",
  "sqlCtx = HiveContext(sc)"
 ]
},
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We create the dataframe that uses the pyspark_csv module to parse a CSV file" | |
] | |
}, | |
{
 "cell_type": "code",
 "execution_count": 4,
 "metadata": {
  "collapsed": false
 },
 "outputs": [],
 "source": [
  "import pyspark_csv as pycsv\n",
  "\n",
  "# Read the raw training file as lines and let pyspark_csv parse them\n",
  "# into a DataFrame (header row supplies the column names).\n",
  "raw_lines = sc.textFile('../data/train.csv')\n",
  "data = pycsv.csvToDataFrame(sqlCtx, raw_lines)"
 ]
},
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We finally run the MapReduce processes to obtain the means by district" | |
] | |
}, | |
{
 "cell_type": "code",
 "execution_count": 5,
 "metadata": {
  "collapsed": false
 },
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "PdDistrict,X,Y\n",
    "CENTRAL,-122.409507348,37.7987397043\n",
    "SOUTHERN,-122.405185063,37.7825727891\n",
    "PARK,-122.445368775,37.7724177678\n",
    "RICHMOND,-122.469782266,37.7882938998\n",
    "TARAVAL,-122.477213993,37.740736043\n",
    "BAYVIEW,-122.393359168,37.7425158149\n",
    "INGLESIDE,-122.428733579,37.7291949667\n",
    "MISSION,-122.419392917,37.7603962752\n",
    "TENDERLOIN,-122.412152411,37.7933765424\n",
    "NORTHERN,-122.426427669,37.7923298653\n"
   ]
  }
 ],
 "source": [
  "# Per district, accumulate (sum of X, sum of Y, row count) in a single\n",
  "# reduceByKey pass, then divide the sums by the count to get the mean\n",
  "# coordinates.  One shared counter replaces the original's duplicated\n",
  "# per-coordinate counters, which were always equal.\n",
  "lines = data.map(lambda line: (line.PdDistrict, (float(line.X), float(line.Y), 1))) \\\n",
  "            .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2])) \\\n",
  "            .map(lambda x: x[0] + \",\" + str(x[1][0] / x[1][2]) + \",\" + str(x[1][1] / x[1][2])) \\\n",
  "            .collect()\n",
  "print \"\\n\".join(['PdDistrict,X,Y'] + lines)"
 ]
}
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.