spark notes
sc.[tab] - in the spark-shell, tab completion lists the methods available on the SparkContext
- dependency issues? parquet-avro, ... SpecificRecord not present...
trait T {
  val v = 3
  // If v is overridden and both are plain vals, this yields not what you'd
  // expect: version is assigned during T's initializer, before the subclass
  // override has run, so it captures the JVM default 0. A lazy val or def
  // would work ok.
  val version = v
}
class C extends T {
  override val v = 2
}
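A quick check of the gotcha (Scala 2 initialization order):

val c = new C
c.v        // 2
c.version  // 0 - not 2 and not 3; a lazy val or def for version would give 2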
import org.apache.spark.sql.{Column, DataFrame}

implicit class DataFrameExtended(df: DataFrame) {
  import df.sqlContext.implicits._
  def anyNull(cols: Seq[Column]): Column = cols.map(_.isNull).reduce(_ || _)
  /**
   * A LEFT JOIN should not match anything when the join key contains a NULL,
   * but Spark still shuffles all the NULL-keyed rows into a single (or a few)
   * reducers. This can be easily fixed with an additional temporary join
   * condition that is a random seed when any of the keys is NULL, thus
   * addressing the NULL skew (see the sketch below).
   */
}
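A minimal sketch of that trick, assuming a single string join key; the
nullSkewSafeLeftJoin name and the NULL_SALT_ prefix are made up here, and the
salt only stays harmless if real keys never start with that prefix:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, concat, lit, rand, when}

// Salt NULL keys with a random value: they still match nothing on the right
// side, but now hash to many partitions instead of one hot reducer.
def nullSkewSafeLeftJoin(left: DataFrame, right: DataFrame, key: String): DataFrame = {
  val salted = when(col(key).isNull,
      concat(lit("NULL_SALT_"), (rand() * 1000).cast("int").cast("string")))
    .otherwise(col(key).cast("string"))
  left.withColumn("_salted_key", salted)
    .join(right, col("_salted_key") === right(key).cast("string"), "left_outer")
    .drop("_salted_key")
}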
import org.apache.spark.sql.DataFrame

implicit class DataFrameWithPivot(df: DataFrame) {
  /**
   * Transposes metrics in multiple columns into multiple rows, with one
   * metric per row (a.k.a. unPivot). A possible implementation is sketched
   * below.
   *
   * Given:
   * |------|-----------|--------|
   * | dim1 | new_users | buyers |
   * it yields one (dim1, metric, value) row per metric column.
   */
}
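A sketch under those assumptions; the unPivot signature and the metric/value
output column names are mine, and values are cast to double so different
metric columns fit into one value column:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{array, col, explode, lit, struct}

// Wrap each metric column in a (metric, value) struct, collect the structs
// into an array, then explode the array into one row per metric.
def unPivot(df: DataFrame, dims: Seq[String], metrics: Seq[String]): DataFrame = {
  val kv = explode(array(metrics.map(m =>
    struct(lit(m).as("metric"), col(m).cast("double").as("value"))): _*))
  df.select((dims.map(col) :+ kv.as("kv")): _*)
    .select((dims.map(col) :+ col("kv.metric") :+ col("kv.value")): _*)
}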
and remember: this is much better than git pull master
on a feature branch:
# only use git pull on the master branch, so it never creates nasty merge commits
git checkout master
git pull origin master
# development
git checkout -b your_feature_branch
# when master moves on, refresh it as above, then rebase your changes over
# the changed master:
git rebase master
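A shorter equivalent, run directly on the feature branch (it fetches
origin/master and rebases the branch onto it in one step):

git pull --rebase origin master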
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.hive.HiveContext

// sc is the SparkContext the spark-shell provides; HiveContext (Spark 1.x)
// adds HiveQL support on top of SQLContext.
val sqlCtx: SQLContext = new HiveContext(sc)
sqlCtx.sql("select 1")
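On Spark 2.x+ the equivalent goes through SparkSession (assuming the build
has Hive support):

import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
spark.sql("select 1")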