# AWS S3 EXAMPLE
# launch pyspark with the S3A connector on the classpath (shell command):
pyspark --packages org.apache.hadoop:hadoop-aws:2.7.5

# then, inside the pyspark shell, set the S3A filesystem and credentials and read:
spark.conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
spark.conf.set("fs.s3a.access.key", "XXXXXXXXXXXX")
spark.conf.set("fs.s3a.secret.key", "XXXXXXXXXXXXXX")
df = spark.read.csv("s3a://yourbucket/tmp_o3001uc.csv")
# AZURE BLOB EXAMPLE
# launch pyspark with the hadoop-azure connector and the Azure storage SDK (shell command):
pyspark --packages org.apache.hadoop:hadoop-azure:2.7.5,com.microsoft.azure:azure-storage:7.0.0

# inside the pyspark shell, set the WASB filesystem and the storage account key, then read:
spark.conf.set("fs.wasb.impl", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
spark.conf.set("fs.azure.account.key.yourstorageaccount.blob.core.windows.net", "XXXXXXXXXXXXXXXXXXXX")
df = spark.read.csv("wasb://yourcontainer@yourstorageaccount.blob.core.windows.net/x.txt")
# AZURE DATA LAKE EXAMPLE
# https://stackoverflow.com/questions/50399751/load-data-from-azure-data-lake-to-jupyter-notebook-on-dsvm/50537989
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .config("spark.jars.packages", "com.microsoft.azure:azure-data-lake-store-sdk:2.1.5,org.apache.hadoop:hadoop-azure-datalake:3.0.0-alpha3") \
    .getOrCreate()

spark.conf.set("fs.adl.impl", "org.apache.hadoop.fs.adl.AdlFileSystem")
spark.conf.set("fs.AbstractFileSystem.adl.impl", "org.apache.hadoop.fs.adl.Adl")

# authenticate with an Azure AD service principal (client-credentials flow),
# either with the values inline (truncated here):
spark.conf.set("dfs.adls.oauth2.access.token.provider.type", "ClientCredential")
spark.conf.set("dfs.adls.oauth2.client.id", "7fc5c341c7..........")
spark.conf.set("dfs.adls.oauth2.credential", "PwhC6Ifk39............")
spark.conf.set("dfs.adls.oauth2.refresh.url", "https://login.microsoftonline.com/c854542........./oauth2/token")

# or, equivalently, with the same settings loaded from a credentials dict:
spark.conf.set("dfs.adls.oauth2.access.token.provider.type", "ClientCredential")
spark.conf.set("dfs.adls.oauth2.client.id", credentials['azure']['client_id'])
spark.conf.set("dfs.adls.oauth2.credential", credentials['azure']['secret'])
spark.conf.set("dfs.adls.oauth2.refresh.url", credentials['azure']['token_url'])
# S3 WITH KMS: needs a recent Hadoop on the classpath, hence the "without hadoop" Spark build below
# 1- download spark from: https://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-without-hadoop.tgz
# 2- download hadoop from: https://www.apache.org/dyn/closer.cgi/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz
# 3- extract both archives
# 4- run the code below as the FIRST thing in your python code
import os
# get the classpath by running "hadoop classpath" in your hadoop download directory
# (note the /* wildcards on the lib and jar directories, which "hadoop classpath" emits)
os.environ['SPARK_DIST_CLASSPATH'] = '/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/etc/hadoop:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/common/lib/*:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/common/*:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/hdfs:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/hdfs/lib/*:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/hdfs/*:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/mapreduce/lib/*:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/mapreduce/*:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/yarn:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/yarn/lib/*:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/yarn/*'
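# Alternative sketch (not in the original gist): capture the output of
# "hadoop classpath" programmatically instead of pasting it by hand.
import subprocess
os.environ['SPARK_DIST_CLASSPATH'] = subprocess.check_output(
    ['/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/bin/hadoop', 'classpath']
).decode().strip()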
import findspark
findspark.init('/Users/guilherme_braccialli/Downloads/spark-2.3.1-bin-without-hadoop')
aws_access_key_id = "xxxx"
aws_secret_access_key = "xxxx"
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .config('spark.driver.memory', '8g') \
    .config('spark.driver.maxResultSize', '4g') \
    .master('local[*]') \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.1.1") \
    .config("fs.s3a.access.key", aws_access_key_id) \
    .config("fs.s3a.secret.key", aws_secret_access_key) \
    .config("spark.sql.execution.arrow.enabled", "true") \
    .appName('xxx') \
    .getOrCreate()
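# KMS usage sketch (not in the original gist): with Hadoop 3.x on the classpath, S3A
# supports SSE-KMS through the two settings below (the key ARN is a placeholder);
# reading KMS-encrypted objects only requires KMS decrypt permission on the credentials.
spark.conf.set("fs.s3a.server-side-encryption-algorithm", "SSE-KMS")
spark.conf.set("fs.s3a.server-side-encryption.key", "arn:aws:kms:us-east-1:111122223333:key/your-kms-key-id")
df = spark.read.csv("s3a://yourbucket/kms_encrypted_file.csv")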