# AWS S3 EXAMPLE
# Launch pyspark with the hadoop-aws connector on the classpath:
pyspark --packages org.apache.hadoop:hadoop-aws:2.7.5

# Point the s3a:// scheme at the S3A filesystem and supply credentials:
spark.conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
spark.conf.set("fs.s3a.access.key", "XXXXXXXXXXXX")
spark.conf.set("fs.s3a.secret.key", "XXXXXXXXXXXXXX")
df = spark.read.csv("s3a://yourbucket/tmp_o3001uc.csv")
# AZURE BLOB (wasb://) EXAMPLE
# Launch pyspark with the hadoop-azure connector and the Azure storage SDK:
pyspark --packages org.apache.hadoop:hadoop-azure:2.7.5,com.microsoft.azure:azure-storage:7.0.0

# Point the wasb:// scheme at the native Azure filesystem and set the account key:
spark.conf.set("fs.wasb.impl", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
spark.conf.set("fs.azure.account.key.yourstorageaccount.blob.core.windows.net", "XXXXXXXXXXXXXXXXXXXX")
df = spark.read.csv("wasb://[email protected]/x.txt")
# AZURE DATA LAKE (adl://) EXAMPLE
# https://stackoverflow.com/questions/50399751/load-data-from-azure-data-lake-to-jupyter-notebook-on-dsvm/50537989
from pyspark.sql import SparkSession
spark = SparkSession.builder\
    .config("spark.jars.packages", "com.microsoft.azure:azure-data-lake-store-sdk:2.1.5,org.apache.hadoop:hadoop-azure-datalake:3.0.0-alpha3")\
    .getOrCreate()

spark.conf.set("fs.adl.impl", "org.apache.hadoop.fs.adl.AdlFileSystem")
spark.conf.set("fs.AbstractFileSystem.adl.impl", "org.apache.hadoop.fs.adl.Adl")

# Option 1: hardcoded service-principal credentials (values truncated):
spark.conf.set("dfs.adls.oauth2.access.token.provider.type", "ClientCredential")
spark.conf.set("dfs.adls.oauth2.client.id", "7fc5c341c7..........")
spark.conf.set("dfs.adls.oauth2.credential", "PwhC6Ifk39............")
spark.conf.set("dfs.adls.oauth2.refresh.url", "https://login.microsoftonline.com/c854542........./oauth2/token")

# Option 2: the same settings, read from a credentials dict instead:
spark.conf.set("dfs.adls.oauth2.access.token.provider.type", "ClientCredential")
spark.conf.set("dfs.adls.oauth2.client.id", credentials['azure']['client_id'])
spark.conf.set("dfs.adls.oauth2.credential", credentials['azure']['secret'])
spark.conf.set("dfs.adls.oauth2.refresh.url", credentials['azure']['token_url'])
# S3 WITH KMS (needs a recent Hadoop)
# 1- Download Spark without bundled Hadoop: https://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-without-hadoop.tgz
# 2- Download Hadoop: https://www.apache.org/dyn/closer.cgi/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz
# 3- Extract both archives
# 4- Run the code below as the FIRST thing in your Python script
import os

# Get the classpath by running "hadoop classpath" in your Hadoop download directory:
os.environ['SPARK_DIST_CLASSPATH'] = '/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/etc/hadoop:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/common/lib/*:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/common/*:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/hdfs:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/hdfs/lib/*:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/hdfs/*:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/mapreduce/lib/*:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/mapreduce/*:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/yarn:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/yarn/lib/*:/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/share/hadoop/yarn/*'

import findspark
findspark.init('/Users/guilherme_braccialli/Downloads/spark-2.3.1-bin-without-hadoop')
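# A hedged alternative to hardcoding the classpath string: ask "hadoop classpath"
# directly; the binary location below is an assumption matching the paths above:
import subprocess
hadoop_bin = '/Users/guilherme_braccialli/Downloads/hadoop-3.1.1/bin/hadoop'
os.environ['SPARK_DIST_CLASSPATH'] = \
    subprocess.check_output([hadoop_bin, 'classpath']).decode().strip()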
aws_access_key_id = "xxxx"
aws_secret_access_key = "xxxx"

from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .config('spark.driver.memory', '8g') \
    .config('spark.driver.maxResultSize', '4g') \
    .master('local[*]') \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.1.1") \
    .config("fs.s3a.access.key", aws_access_key_id) \
    .config("fs.s3a.secret.key", aws_secret_access_key) \
    .config("spark.sql.execution.arrow.enabled", "true") \
    .appName('xxx') \
    .getOrCreate()
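# Note: the heading mentions KMS but the builder above sets no encryption
# option. A hedged sketch of the Hadoop 3.x S3A settings for SSE-KMS; the key
# ARN and bucket are placeholders, and these settings may need to be applied
# before the first s3a:// access to take effect:
spark.conf.set("fs.s3a.server-side-encryption-algorithm", "SSE-KMS")
spark.conf.set("fs.s3a.server-side-encryption.key",
               "arn:aws:kms:us-east-1:111122223333:key/your-key-id")
df = spark.read.csv("s3a://yourbucket/kms_encrypted.csv")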