yarn-site.xml
yarn.resourcemanager.scheduler.monitor.enable=true
yarn.resourcemanager.monitor.capacity.preemption.max_ignored_over_capacity=0.01
yarn.resourcemanager.monitor.capacity.preemption.max_wait_before_kill=1000
yarn.resourcemanager.monitor.capacity.preemption.monitoring_interval=1000
yarn.resourcemanager.monitor.capacity.preemption.natural_termination_factor=1
yarn.resourcemanager.monitor.capacity.preemption.total_preemption_per_round=1
yarn.resourcemanager.scheduler.monitor.policies=org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy
capacity-scheduler.xml
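The capacity-scheduler.xml entries themselves were not captured here; a minimal sketch of the queue layout the preemption policy above would rebalance, assuming two hypothetical queues, default and ds (the queue used in the terasort command further down), splitting the cluster evenly:
yarn.scheduler.capacity.root.queues=default,ds
yarn.scheduler.capacity.root.default.capacity=50
yarn.scheduler.capacity.root.default.maximum-capacity=100
yarn.scheduler.capacity.root.ds.capacity=50
yarn.scheduler.capacity.root.ds.maximum-capacity=100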
git clone https://github.com/databricks/spark-sql-perf.git
cd spark-sql-perf
sbt assembly
git clone https://github.com/davies/tpcds-kit
sudo yum groupinstall "Development Tools"
cd tpcds-kit/tools
cp Makefile.suite Makefile
make
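With spark-sql-perf assembled and dsdgen built, TPC-DS data can be generated from a Spark shell. A sketch, assuming the assembly jar is on the shell's classpath and dsdgen sits in the tpcds-kit/tools directory built above; the Tables constructor and genData parameters vary across spark-sql-perf versions, so treat the signature below as illustrative:
import com.databricks.spark.sql.perf.tpcds.Tables
// dsdgen directory and scale factor (GB) are placeholders
val tables = new Tables(sqlContext, "/path/to/tpcds-kit/tools", 10)
// arguments: location, format, overwrite, partitionTables, useDoubleForDecimal,
// clusterByPartitionColumns, filterOutNullPartitionValues
tables.genData("/tmp/tpcds", "parquet", true, false, false, false, false)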
yum install mysql-connector-java mysql-server
mysql -u root
CREATE DATABASE metastore;
CREATE USER 'metastore'@'%' IDENTIFIED BY 'metastore';
CREATE USER 'metastore'@'localhost' IDENTIFIED BY 'metastore';
GRANT ALL PRIVILEGES ON metastore.* TO 'metastore'@'%';
GRANT ALL PRIVILEGES ON metastore.* TO 'metastore'@'localhost';
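The database above is typically wired into the Hive metastore through hive-site.xml; the host below is a placeholder and the credentials match the user created above:
javax.jdo.option.ConnectionURL=jdbc:mysql://localhost/metastore
javax.jdo.option.ConnectionDriverName=com.mysql.jdbc.Driver
javax.jdo.option.ConnectionUserName=metastore
javax.jdo.option.ConnectionPassword=metastore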
case class Person (name: String, age: Int)
val df = sc.parallelize(List(Person("Guilherme", 35), Person("Isabela", 6), Person("Daniel", 3))).toDF
// pattern-match helper: word lengths for a String, the value itself for an Integer
def wordsLengthScala(a: Any): Array[Int] = {
  a match {
    case s: String  => s.split(" ").map(_.length)
    case i: Integer => Array(i)
    case _          => Array.empty[Int]  // fallback added so other types don't throw a MatchError
  }
}
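A sketch of exposing the function above to DataFrames as a Spark SQL UDF; the String-typed wrapper is needed because UDFs require a concrete input type, and the name column comes from the Person example:
import org.apache.spark.sql.functions.udf
val wordsLengthUdf = udf((s: String) => wordsLengthScala(s))
df.withColumn("name_word_lengths", wordsLengthUdf($"name")).show(false)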
git add *
git commit -m "changes..."
git push
git checkout v1.1
git pull
git remote set-url origin https://[email protected]
git config credential.helper store
import org.apache.spark.sql.SQLContext
import org.apache.spark.util.LongAccumulator
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.avro.Schema
import com.databricks.spark.avro._
import scala.collection.JavaConversions._
import scala.util.matching.Regex
import java.io.File
val jdbcDF = spark.read.format("jdbc").options(
  Map(
    "driver"  -> "org.postgresql.Driver",
    "url"     -> "jdbc:postgresql://localhost/?user=postgres&password=postgres",
    "dbtable" -> "nifi_test"
  )
).load()
jdbcDF.show
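The same options work in the other direction; a sketch of writing the DataFrame back out over JDBC (the target table nifi_test_copy is hypothetical):
jdbcDF.write.format("jdbc")
  .option("driver", "org.postgresql.Driver")
  .option("url", "jdbc:postgresql://localhost/?user=postgres&password=postgres")
  .option("dbtable", "nifi_test_copy")
  .mode("append")
  .save()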
case class Test (typet: String, value: Int)
val test = List(Test("B", 99), Test("B", 2), Test("B", 35), Test("A", 6), Test("A", 3))
val rdd = sc.parallelize(test)
val df = rdd.toDF
// mutable state shared by the helper below (only meaningful when rows are processed in order)
var i = 0
var previous = ""
// body completed from the original fragment; assumed intent: count consecutive repeats of a value
def udf_buffer(in: String): Option[Int] = {
  if (in.equals(previous)) i += 1
  else { i = 0; previous = in }
  Some(i)
}
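A sketch of applying the helper above to the Test DataFrame as a UDF; driver-side mutable state like i/previous is only predictable when all rows go through a single partition in order, so this is illustrative rather than a production pattern:
import org.apache.spark.sql.functions.{col, udf}
val bufferUdf = udf(udf_buffer _)
// collapse to one partition so the consecutive-value counter sees rows in sequence
df.repartition(1).withColumn("repeat_count", bufferUdf(col("typet"))).show()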
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-examples.jar teragen 10000000000 /tmp/teragenout
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-examples.jar terasort -D mapred.reduce.tasks=170 -D mapred.job.queue.name=ds /tmp/teragenout /tmp/terasortout
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient-tests.jar TestDFSIO -D dfs.replication=1 -write -nrFiles 100 -fileSize 5000
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient-tests.jar TestDFSIO -D dfs.replication=3 -write -nrFiles 100 -fileSize 5000
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient-tests.jar TestDFSIO -D dfs.replication=1 -write -nrFiles 200 -fileSize 5000
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient-tests.jar TestDFSIO -D dfs.replication=1 -write -nrFiles 600 -fileSize 2000
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient.jar
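To validate the sorted output and clean up the TestDFSIO scratch files afterwards (paths and jars follow the commands above):
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-examples.jar teravalidate /tmp/terasortout /tmp/teravalidateout
hadoop jar /usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient-tests.jar TestDFSIO -clean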
def time[T](block: => T): T = {
  val start = System.currentTimeMillis
  val res = block
  val totalTime = System.currentTimeMillis - start
  println("Elapsed time: %d seconds".format(totalTime / 1000))
  res
}
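Usage is just wrapping any expression, for example timing a Spark action on the DataFrame defined earlier:
val rowCount = time { df.count() }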
//spark-shell --conf spark.memory.storageFraction=0 --conf spark.memory.fraction=0.1
//spark-shell --conf spark.serializer=org.apache.spark.serializer.KryoSerializer