Created
May 31, 2017 16:17
-
-
Save nicor88/7f935d1a7636121f94e83bc68a9c744b to your computer and use it in GitHub Desktop.
Jupyter Pyspark Examples
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+---+-------+---+------+-----+\n", | |
| "| id| name|age|points|level|\n", | |
| "+---+-------+---+------+-----+\n", | |
| "| 1|Carleen| 24| 245| 5|\n", | |
| "| 2| Steve| 31| 567| 7|\n", | |
| "| 3| Ann| 41| 354| 5|\n", | |
| "| 4| Lars| 30| 156| 3|\n", | |
| "+---+-------+---+------+-----+\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "df = sqlCtx.createDataFrame([(1, 'Carleen', 24, 245, 5),\n", | |
| " (2, 'Steve', 31, 567, 7),\n", | |
| " (3, 'Ann', 41, 354, 5),\n", | |
| " (4, 'Lars', 30, 156, 3)], ('id', 'name', 'age', 'points', 'level'))\n", | |
| "df.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+---+-----+---+------+-----+\n", | |
| "| id| name|age|points|level|\n", | |
| "+---+-----+---+------+-----+\n", | |
| "| 2|Steve| 31| 567| 7|\n", | |
| "| 3| Ann| 41| 354| 5|\n", | |
| "+---+-----+---+------+-----+\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "df.where(df['age'] > 30).show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+-------+------------------+-----+-----------------+-----------------+-----------------+\n", | |
| "|summary| id| name| age| points| level|\n", | |
| "+-------+------------------+-----+-----------------+-----------------+-----------------+\n", | |
| "| count| 4| 4| 4| 4| 4|\n", | |
| "| mean| 2.5| null| 31.5| 330.5| 5.0|\n", | |
| "| stddev|1.2909944487358056| null|7.047458170621991|177.2427713617681|1.632993161855452|\n", | |
| "| min| 1| Ann| 24| 156| 3|\n", | |
| "| max| 4|Steve| 41| 567| 7|\n", | |
| "+-------+------------------+-----+-----------------+-----------------+-----------------+\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "df.describe().show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Read Parquet flight data from S3" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df = sqlContext.read.parquet('s3://us-east-1.elasticmapreduce.samples/flightdata/input')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df = df.select('flightdate', 'origin', 'dest', 'airtime', 'distance', 'cancelled', 'securitydelay')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+----------+------+----+-------+--------+---------+-------------+\n", | |
| "|flightdate|origin|dest|airtime|distance|cancelled|securitydelay|\n", | |
| "+----------+------+----+-------+--------+---------+-------------+\n", | |
| "|2007-01-01| MSP| DFW| 123| 852| 0| null|\n", | |
| "|2007-01-01| DFW| MSP| 110| 852| 0| null|\n", | |
| "|2007-01-01| ROC| MSP| 117| 783| 0| null|\n", | |
| "|2007-01-01| MSP| OKC| 102| 695| 0| 0|\n", | |
| "|2007-01-01| MSP| OKC| 101| 695| 0| null|\n", | |
| "|2007-01-01| DTW| LNK| 105| 701| 0| null|\n", | |
| "|2007-01-01| MEM| MSP| 105| 700| 0| 0|\n", | |
| "|2007-01-01| MSP| MDT| 110| 898| 0| null|\n", | |
| "|2007-01-01| MSP| AVL| 110| 861| 0| null|\n", | |
| "|2007-01-01| AVL| MSP| 120| 861| 0| null|\n", | |
| "|2007-01-01| DTW| XNA| 112| 716| 0| 0|\n", | |
| "|2007-01-01| TUL| DTW| 107| 790| 0| 0|\n", | |
| "|2007-01-01| DTW| TUL| 120| 790| 0| null|\n", | |
| "|2007-01-01| AUS| DTW| 161| 1149| 0| null|\n", | |
| "|2007-01-01| BGR| DTW| 132| 750| 0| 0|\n", | |
| "|2007-01-01| BGR| DTW| 119| 750| 0| 0|\n", | |
| "|2007-01-01| IND| FLL| 156| 1005| 0| 0|\n", | |
| "|2007-01-01| FLL| IND| 138| 1005| 0| 0|\n", | |
| "|2007-01-01| DSM| DCA| 112| 897| 0| null|\n", | |
| "|2007-01-01| DCA| DSM| 141| 897| 0| 0|\n", | |
| "+----------+------+----+-------+--------+---------+-------------+\n", | |
| "only showing top 20 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "df.where(df['airtime'] > 100).show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment