spark notes
sc. [Tab] - show available methods (tab completion on the SparkContext in spark-shell)
- dependency issues? e.g. parquet-avro failing with "SpecificRecord not present" usually points to a missing or conflicting avro / parquet-avro version on the classpath
trait T {
  val v = 3
  // if v is overridden and this is a val, `version` yields 0 - not what you'd expect:
  // the trait initializer reads v before the subclass override has been initialized.
  // A lazy val or def would work fine here.
  val version = v
}
class C extends T {
  override val v = 2
}
// new C().version == 0 (not 2)
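A minimal runnable check of the gotcha above (class and member names are just for illustration): with a plain val the trait initializer reads the overridden v while it is still 0, whereas a lazy val defers the read until after construction.

```scala
trait Versioned {
  val v = 3
  val version = v            // evaluated during trait init: sees the overridden v as 0
  lazy val versionLazy = v   // evaluated on first access: sees the final v
}

class Overriding extends Versioned {
  override val v = 2
}

object InitOrderDemo {
  def main(args: Array[String]): Unit = {
    val c = new Overriding
    println(c.version)     // 0 - the surprise
    println(c.versionLazy) // 2 - what you'd expect
  }
}
```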
import org.apache.spark.sql.{Column, DataFrame}

implicit class DataFrameExtended(df: DataFrame) {
  import df.sqlContext.implicits._
  def anyNull(cols: Seq[Column]): Column = cols.map(_.isNull).reduce(_ || _)

  /**
   * LEFT JOIN should not join anything when the join key contains a NULL (but usually
   * this would result in shuffling all NULL-keyed rows into a single or few reducers).
   * This can be easily fixed by adding an additional temporary join condition that:
   * - is a random value when any of the keys is NULL, thus spreading the NULL skew
   * - matches as before when all keys are non-NULL, leaving those rows unaffected
   */
}
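A Spark-free sketch of why the salting trick works (all names here are hypothetical, not from the snippet): hash partitioning sends every NULL key to the same bucket, while substituting a random salt for NULL spreads those rows out; the salt values can never match a real key on the other side, so LEFT JOIN semantics are preserved.

```scala
import scala.util.Random

object NullSaltSketch {
  val numPartitions = 8

  // Mimics hash partitioning of join keys (every NULL hashes to one bucket).
  def partitionOf(key: String): Int =
    math.floorMod(if (key == null) 0 else key.hashCode, numPartitions)

  def main(args: Array[String]): Unit = {
    val keys: Seq[String] = Seq.fill(10000)(null) ++ Seq("a", "b", "c")

    // Skewed: all 10000 NULL keys land in the same partition.
    val skewed = keys.groupBy(partitionOf).map { case (p, ks) => p -> ks.size }

    // Salted: each NULL key is replaced by a value that matches no real key.
    val rnd = new Random(42)
    val salted = keys.map(k => if (k == null) s"NULL_SALT_${rnd.nextLong()}" else k)
    val spread = salted.groupBy(partitionOf).map { case (p, ks) => p -> ks.size }

    println(s"max partition size, skewed: ${skewed.values.max}") // ~10000
    println(s"max partition size, salted: ${spread.values.max}") // ~10000/8
  }
}
```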
import org.apache.spark.sql.DataFrame

implicit class DataFrameWithPivot(df: DataFrame) {
  /**
   * Transposes metrics in multiple columns into multiple rows, with one metric per row
   * (a.k.a. unPivot)
   *
   * Given:
   * |------|-----------|--------|
   * | dim1 | new_users | buyers |
   * |------|-----------|--------|
   */
}
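Since the snippet above is truncated, here is a Spark-free sketch of the per-row transformation unPivot performs (function and column names are illustrative): each wide row with N metric columns becomes N narrow (dims, metric, value) rows.

```scala
object UnPivotSketch {
  // Each wide row (dim columns + metric columns) becomes one narrow row per metric.
  def unPivot(row: Map[String, Any],
              dims: Seq[String],
              metrics: Seq[String]): Seq[Map[String, Any]] =
    metrics.map { m =>
      dims.map(d => d -> row(d)).toMap + ("metric" -> m) + ("value" -> row(m))
    }

  def main(args: Array[String]): Unit = {
    val wide = Map[String, Any]("dim1" -> "us", "new_users" -> 10L, "buyers" -> 3L)
    UnPivotSketch.unPivot(wide, Seq("dim1"), Seq("new_users", "buyers")).foreach(println)
  }
}
```

In Spark SQL itself the usual tool for this is the stack() expression, e.g. `df.selectExpr("dim1", "stack(2, 'new_users', new_users, 'buyers', buyers) as (metric, value)")`.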
and remember, this is much better than `git pull master` on a feature branch:

# use git pull only on the master branch, so it never creates nasty merge commits
git checkout master
git pull origin master
# development
git checkout -b your_feature_branch
# rebase your changes over the changed master:
git rebase master

Creating a HiveContext in the shell:

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.hive.HiveContext

val sqlCtx: SQLContext = new HiveContext(sc)
sqlCtx.sql("select 1")
The set lines:
set -euxo pipefail is short for:
set -e           # exit immediately when a command fails
set -u           # treat unset variables as an error
set -x           # print each command before executing it
set -o pipefail  # a pipeline fails if any command in it fails, not just the last
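A quick demonstration of what pipefail changes (safe to paste into bash):

```shell
#!/usr/bin/env bash
# Without pipefail, a pipeline's status is the LAST command's - failures hide.
set +o pipefail
false | true
echo "without pipefail: $?"   # 0 - the failure of `false` is swallowed

# With pipefail, any failing command fails the whole pipeline.
set -o pipefail
false | true
echo "with pipefail: $?"      # 1
```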