Skip to content

Instantly share code, notes, and snippets.

View gbraccialli's full-sized avatar

Gui Braccialli gbraccialli

View GitHub Profile
# Build/install the jupyter-scala kernel from source, then launch it.
git clone https://github.com/jupyter-scala/jupyter-scala.git
./jupyter-scala
#inside jupyter
# Swap log4j binding for the no-op logger to silence Spark's log noise.
import $exclude.`org.slf4j:slf4j-log4j12`, $ivy.`org.slf4j:slf4j-nop:1.7.21` // for cleaner logs
import $profile.`hadoop-2.6`
import $ivy.`org.apache.spark::spark-sql:2.1.0` // adjust spark version - spark >= 2.0
import $ivy.`org.apache.hadoop:hadoop-aws:2.6.4`
import $ivy.`org.jupyter-scala::spark:0.4.2` // for JupyterSparkSession (SparkSession aware of the jupyter-scala kernel)
import org.apache.spark.sql.functions._

// Load a semicolon-delimited CSV; Spark infers the column types from the data
// and takes column names from the header row.
val df = spark.read
  .options(Map(
    "inferSchema" -> "true",
    "header"      -> "true",
    "delimiter"   -> ";"
  ))
  .csv("/Users/guilherme.braccialli/Desktop/simulado_1000_20k.csv")

// 70/30 train/test split; the fixed seed makes the split reproducible.
val Array(dfTrain, dfTest) = df.randomSplit(Array(0.7, 0.3), seed = 3)
import scala.collection.parallel._

// Small demo DataFrame (shadows the CSV `df` defined earlier in the notebook).
val df = (1 to 10).toDF

// BUG FIX: `Seq(1 to 300).par` produced a ONE-element parallel collection whose
// single element was the Range itself, so only one Spark job ran and `l.toString`
// rendered the whole Range. Use the range directly so each of the 300 values is
// its own element and jobs are actually submitted in parallel.
val list = (1 to 300).par

// Cap parallelism: at most 2 Spark jobs in flight at a time.
list.tasksupport = new ForkJoinTaskSupport(new scala.concurrent.forkjoin.ForkJoinPool(2))

// One tiny groupBy/count job per value; results collected to the driver.
val listR = list.map(l => df.withColumn("l", lit(l.toString)).groupBy("l").count.collect)
//%python
//import base64
//html64 = base64.b64encode("""
<div id=a>adfadfas</div>
<script>
function test(msg){
document.getElementById('a').innerHTML = msg;
}
import sys, requests, json, time, datetime
from subprocess import call
def getLastEndJob(url, startTime):
resp = requests.get(url + "/api/v1/applications")
resp.encoding = 'utf-8'
sparkuis = resp.json()
kill = True
msg = ""
# Extract worker hostnames (lines beginning "ip...") from `yarn node -list`.
yarn node -list|sed -n "s/^\(ip[^:]*\):.*/\1/p" > /home/hadoop/nodes.txt
# Fan the new mapred/yarn configs out to every worker, 10 hosts in parallel
# (-P10); stage in /tmp, then install in place with sudo.
< /home/hadoop/nodes.txt xargs -t -I{} -P10 scp -o StrictHostKeyChecking=no /etc/hadoop/conf/mapred-site.xml_worker {}:/tmp/mapred-site.xml
< /home/hadoop/nodes.txt xargs -t -I{} -P10 ssh -o StrictHostKeyChecking=no {} "sudo cp -f /tmp/mapred-site.xml /etc/hadoop/conf/mapred-site.xml"
< /home/hadoop/nodes.txt xargs -t -I{} -P10 scp -o StrictHostKeyChecking=no /etc/hadoop/conf/yarn-site.xml_worker {}:/tmp/yarn-site.xml
< /home/hadoop/nodes.txt xargs -t -I{} -P10 ssh -o StrictHostKeyChecking=no {} "sudo cp -f /tmp/yarn-site.xml /etc/hadoop/conf/yarn-site.xml"
# Stop each node manager so the new configuration takes effect on restart.
< /home/hadoop/nodes.txt xargs -t -I{} -P10 ssh -o StrictHostKeyChecking=no {} "sudo stop hadoop-yarn-nodemanager"
# spark-defaults: run on YARN with dynamic executor allocation.
spark.master yarn
spark.dynamicAllocation.enabled true
spark.executor.memory 20G
spark.executor.cores 4
# v2 output committer: commits task output directly, skipping the slow
# per-file rename pass (notably faster on S3/HDFS).
spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version 2
spark.yarn.am.memory 2G
# Release executors holding cached data after 60s idle.
spark.dynamicAllocation.cachedExecutorIdleTimeout 60s
# Off-heap headroom per executor, on top of spark.executor.memory.
spark.yarn.executor.memoryOverhead 3G
spark.dynamicAllocation.executorIdleTimeout 60s
spark.driver.memory 10G
@gbraccialli
gbraccialli / jupyter_spark.sh
Last active February 18, 2018 01:47
jupyter_spark.sh
#option 1 - start jupyter using pyspark
# Make pyspark launch its driver through IPython in notebook mode, so the
# `pyspark` command below starts a Jupyter notebook with `spark` predefined.
export PYSPARK_DRIVER_PYTHON_OPTS="notebook"
export PYSPARK_DRIVER_PYTHON=/mnt/lib/python/anaconda2/bin/ipython
# Submit to the YARN queue "queue3".
pyspark --queue queue3
#option 2 - vanilla jupyter with jars
import os

# Point findspark/pyspark at a local Spark distribution and put the Postgres
# JDBC driver on the Spark classpath before any SparkSession is created.
os.environ.update({
    "SPARK_HOME": "/Downloads/spark-2.2.1-bin-hadoop2.7/",
    "SPARK_CLASSPATH": "/tmp/shared/postgresql-42.2.1.jar",
})
# s3a tuning: optimize for random-access reads (columnar formats).
spark.hadoop.fs.s3a.experimental.input.fadvise random
# 64 MiB readahead window.
spark.hadoop.fs.s3a.readahead.range 67108864
spark.hadoop.fs.s3a.connection.maximum 200
# NOTE(review): 2000000 ms (~33 min) timeouts are unusually large — confirm intent.
spark.hadoop.fs.s3a.connection.establish.timeout 2000000
spark.hadoop.fs.s3a.connection.timeout 2000000