yarn-site.xml
yarn.resourcemanager.scheduler.monitor.enable=true
yarn.resourcemanager.monitor.capacity.preemption.max_ignored_over_capacity=0.01
yarn.resourcemanager.monitor.capacity.preemption.max_wait_before_kill=1000
yarn.resourcemanager.monitor.capacity.preemption.monitoring_interval=1000
yarn.resourcemanager.monitor.capacity.preemption.natural_termination_factor=1
yarn.resourcemanager.monitor.capacity.preemption.total_preemption_per_round=1
yarn.resourcemanager.scheduler.monitor.policies=org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy
capacity-scheduler.xml
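The capacity-scheduler.xml entries themselves were not captured here; a minimal sketch of the queue layout the preemption policy above would rebalance, assuming two hypothetical queues, default and ds (the queue used in the terasort command further down), splitting the cluster evenly:
yarn.scheduler.capacity.root.queues=default,ds
yarn.scheduler.capacity.root.default.capacity=50
yarn.scheduler.capacity.root.default.maximum-capacity=100
yarn.scheduler.capacity.root.ds.capacity=50
yarn.scheduler.capacity.root.ds.maximum-capacity=100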
git clone https://github.com/databricks/spark-sql-perf.git
cd spark-sql-perf
sbt assembly
git clone https://github.com/davies/tpcds-kit
sudo yum groupinstall "Development Tools"
cd tpcds-kit/tools
cp Makefile.suite Makefile
make
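With spark-sql-perf assembled and dsdgen built, TPC-DS data can be generated from a Spark shell. A sketch, assuming the assembly jar is on the shell's classpath and dsdgen sits in the tpcds-kit/tools directory built above; the Tables constructor and genData parameters vary across spark-sql-perf versions, so treat the signature below as illustrative:
import com.databricks.spark.sql.perf.tpcds.Tables
// dsdgen directory and scale factor (GB) are placeholders
val tables = new Tables(sqlContext, "/path/to/tpcds-kit/tools", 10)
// arguments: location, format, overwrite, partitionTables, useDoubleForDecimal,
// clusterByPartitionColumns, filterOutNullPartitionValues
tables.genData("/tmp/tpcds", "parquet", true, false, false, false, false)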
yum install mysql-connector-java mysql-server
mysql -u root
CREATE DATABASE metastore;
CREATE USER 'metastore'@'%' IDENTIFIED BY 'metastore';
CREATE USER 'metastore'@'localhost' IDENTIFIED BY 'metastore';
GRANT ALL PRIVILEGES ON metastore.* TO 'metastore'@'%';
GRANT ALL PRIVILEGES ON metastore.* TO 'metastore'@'localhost';
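The database above is typically wired into the Hive metastore through hive-site.xml; the host below is a placeholder and the credentials match the user created above:
javax.jdo.option.ConnectionURL=jdbc:mysql://localhost/metastore
javax.jdo.option.ConnectionDriverName=com.mysql.jdbc.Driver
javax.jdo.option.ConnectionUserName=metastore
javax.jdo.option.ConnectionPassword=metastore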
case class Person (name: String, age: Int)
val df = sc.parallelize(List(Person("Guilherme", 35), Person("Isabela", 6), Person("Daniel", 3))).toDF
// pattern-match helper: word lengths for a String, the value itself for an Integer
def wordsLengthScala(a: Any): Array[Int] = {
  a match {
    case s: String  => s.split(" ").map(_.length)
    case i: Integer => Array(i)
    case _          => Array.empty[Int]  // fallback added so other types don't throw a MatchError
  }
}
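A sketch of exposing the function above to DataFrames as a Spark SQL UDF; the String-typed wrapper is needed because UDFs require a concrete input type, and the name column comes from the Person example:
import org.apache.spark.sql.functions.udf
val wordsLengthUdf = udf((s: String) => wordsLengthScala(s))
df.withColumn("name_word_lengths", wordsLengthUdf($"name")).show(false)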
git add *
git commit -m "changes..."
git push
git checkout v1.1
git pull
git remote set-url origin https://[email protected]
git config credential.helper store
import org.apache.spark.sql.SQLContext
import org.apache.spark.util.LongAccumulator
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.avro.Schema
import com.databricks.spark.avro._
import scala.collection.JavaConversions._
import scala.util.matching.Regex
import java.io.File
val jdbcDF = spark.read.format("jdbc").options(
  Map(
    "driver"  -> "org.postgresql.Driver",
    "url"     -> "jdbc:postgresql://localhost/?user=postgres&password=postgres",
    "dbtable" -> "nifi_test"
  )
).load()
jdbcDF.show
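The same options work in the other direction; a sketch of writing the DataFrame back out over JDBC (the target table nifi_test_copy is hypothetical):
jdbcDF.write.format("jdbc")
  .option("driver", "org.postgresql.Driver")
  .option("url", "jdbc:postgresql://localhost/?user=postgres&password=postgres")
  .option("dbtable", "nifi_test_copy")
  .mode("append")
  .save()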
case class Test (typet: String, value: Int)
val test = List(Test("B", 99), Test("B", 2), Test("B", 35), Test("A", 6), Test("A", 3))
val rdd = sc.parallelize(test)
val df = rdd.toDF
// mutable state shared by the helper below (only meaningful when rows are processed in order)
var i = 0
var previous = ""
// body completed from the original fragment; assumed intent: count consecutive repeats of a value
def udf_buffer(in: String): Option[Int] = {
  if (in.equals(previous)) i += 1
  else { i = 0; previous = in }
  Some(i)
}
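A sketch of applying the helper above to the Test DataFrame as a UDF; driver-side mutable state like i/previous is only predictable when all rows go through a single partition in order, so this is illustrative rather than a production pattern:
import org.apache.spark.sql.functions.{col, udf}
val bufferUdf = udf(udf_buffer _)
// collapse to one partition so the consecutive-value counter sees rows in sequence
df.repartition(1).withColumn("repeat_count", bufferUdf(col("typet"))).show()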
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-examples.jar teragen 10000000000 /tmp/teragenout
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-examples.jar terasort -D mapred.reduce.tasks=170 -D mapred.job.queue.name=ds /tmp/teragenout /tmp/terasortout
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient-tests.jar TestDFSIO -D dfs.replication=1 -write -nrFiles 100 -fileSize 5000
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient-tests.jar TestDFSIO -D dfs.replication=3 -write -nrFiles 100 -fileSize 5000
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient-tests.jar TestDFSIO -D dfs.replication=1 -write -nrFiles 200 -fileSize 5000
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient-tests.jar TestDFSIO -D dfs.replication=1 -write -nrFiles 600 -fileSize 2000
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient.jar
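To validate the sorted output and clean up the TestDFSIO scratch files afterwards (paths and jars follow the commands above):
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-examples.jar teravalidate /tmp/terasortout /tmp/teravalidateout
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient-tests.jar TestDFSIO -clean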
def time[T](block: => T): T = {
  val start = System.currentTimeMillis
  val res = block
  val totalTime = System.currentTimeMillis - start
  println("Elapsed time: %d seconds".format(totalTime / 1000))
  res
}
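Usage is just wrapping any expression, for example timing a Spark action on the DataFrame defined earlier:
val rowCount = time { df.count() }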
//spark-shell --conf spark.memory.storageFraction=0 --conf spark.memory.fraction=0.1
//spark-shell --conf spark.serializer=org.apache.spark.serializer.KryoSerializer