Created
August 28, 2015 04:23
-
-
Save colltoaction/e62bdefb4d3a2ff8d85d to your computer and use it in GitHub Desktop.
Generating location means by district for kaggle.com/c/sf-crime using pyspark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Generating the means by district" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We start by setting up the environment so it uses Python 2" | |
] | |
}, | |
{
 "cell_type": "code",
 "execution_count": 1,
 "metadata": {
  "collapsed": true
 },
 "outputs": [],
 "source": [
  "import os\n",
  "\n",
  "# Make every PySpark worker process use the Python 2 interpreter.\n",
  "os.environ.update({'PYSPARK_PYTHON': 'python2'})"
 ]
},
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We create the spark context and sql context" | |
] | |
}, | |
{
 "cell_type": "code",
 "execution_count": 2,
 "metadata": {
  "collapsed": false
 },
 "outputs": [],
 "source": [
  "import pyspark\n",
  "# SparkContext is exported by the top-level pyspark package, but\n",
  "# HiveContext is defined in pyspark.sql -- importing it from pyspark\n",
  "# raises ImportError.\n",
  "from pyspark import SparkContext\n",
  "from pyspark.sql import HiveContext\n",
  "\n",
  "# 'local[*]' runs Spark locally with one worker thread per core.\n",
  "sc = SparkContext('local[*]')\n",
  "sqlCtx = HiveContext(sc)"
 ]
},
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We create the dataframe that uses the pyspark_csv module to parse a CSV file" | |
] | |
}, | |
{
 "cell_type": "code",
 "execution_count": 4,
 "metadata": {
  "collapsed": false
 },
 "outputs": [],
 "source": [
  "import pyspark_csv as pycsv\n",
  "\n",
  "# Read the raw training file as lines and let pyspark_csv parse them\n",
  "# into a DataFrame (header row supplies the column names).\n",
  "raw_lines = sc.textFile('../data/train.csv')\n",
  "data = pycsv.csvToDataFrame(sqlCtx, raw_lines)"
 ]
},
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We finally run the MapReduce processes to obtain the means by district" | |
] | |
}, | |
{
 "cell_type": "code",
 "execution_count": 5,
 "metadata": {
  "collapsed": false
 },
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "PdDistrict,X,Y\n",
    "CENTRAL,-122.409507348,37.7987397043\n",
    "SOUTHERN,-122.405185063,37.7825727891\n",
    "PARK,-122.445368775,37.7724177678\n",
    "RICHMOND,-122.469782266,37.7882938998\n",
    "TARAVAL,-122.477213993,37.740736043\n",
    "BAYVIEW,-122.393359168,37.7425158149\n",
    "INGLESIDE,-122.428733579,37.7291949667\n",
    "MISSION,-122.419392917,37.7603962752\n",
    "TENDERLOIN,-122.412152411,37.7933765424\n",
    "NORTHERN,-122.426427669,37.7923298653\n"
   ]
  }
 ],
 "source": [
  "# Per district, accumulate (sum of X, sum of Y, row count) in a single\n",
  "# reduceByKey pass, then divide the sums by the count to get the mean\n",
  "# coordinates.  One shared counter replaces the original's duplicated\n",
  "# per-coordinate counters, which were always equal.\n",
  "lines = data.map(lambda line: (line.PdDistrict, (float(line.X), float(line.Y), 1))) \\\n",
  "            .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2])) \\\n",
  "            .map(lambda x: x[0] + \",\" + str(x[1][0] / x[1][2]) + \",\" + str(x[1][1] / x[1][2])) \\\n",
  "            .collect()\n",
  "print \"\\n\".join(['PdDistrict,X,Y'] + lines)"
 ]
}
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.