Created
April 4, 2017 01:01
-
-
Save ahmaurya/22379ce5914d85736d4079f9fee03111 to your computer and use it in GitHub Desktop.
IPython PySpark Setup Code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# IPython startup script: make the PySpark libraries bundled with a local Spark
# installation importable, start a SparkContext (`sc`) if the session does not
# already have one, and run a small smoke test against a sample text file.
import glob
import os
import sys
import os.path

# Spark's install location must be provided through the environment.
spark_home = os.environ.get('SPARK_HOME')
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Put Spark's Python sources at the front of the import path.
sys.path.insert(0, os.path.join(spark_home, 'python'))

# The bundled py4j archive name carries a version number that differs between
# Spark releases, so locate it by pattern instead of hard-coding one version.
# If nothing matches, nothing is added and importing py4j relies on whatever
# is already on sys.path.
for py4j_zip in sorted(glob.glob(
        os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip'))):
    sys.path.insert(0, py4j_zip)

# Only bootstrap the PySpark shell when no `sc` is defined yet.
# `execfile` no longer exists on Python 3; exec(compile(...)) runs the shell
# script in this module's namespace on both Python 2 and Python 3.
if 'sc' not in globals():
    shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
    with open(shell_script) as shell_file:
        exec(compile(shell_file.read(), shell_script, 'exec'))

# Smoke test: load the sample data set as an RDD and print its line count.
fileName = 'file:///Users/hadoop/data/millionsong.txt'
numPartitions = 2
rawData = sc.textFile(fileName, numPartitions)
print(rawData.count())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment