Andrew Otto (ottomata)
'use strict';

// Logging, utility, and event schema validation dependencies.
const bunyan = require('bunyan');
const _ = require('lodash');
const {
    resolveUri
} = require('./lib/event-util');
const EventValidator = require('./lib/EventValidator');
import scala.collection.JavaConversions._
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient
import org.apache.spark.sql.SparkSession

// Look up a table's partition column names directly from the Hive metastore.
// (The body after the client setup is a completion sketch; it assumes a fully
// qualified "db.table" name.)
def getHivePartitionColumnNames(tableName: String): Seq[String] = {
    val spark: SparkSession = SparkSession.builder().enableHiveSupport().getOrCreate()
    val hiveConf = new HiveConf(spark.sparkContext.hadoopConfiguration, classOf[HiveConf])
    val metastoreClient = new HiveMetaStoreClient(hiveConf)
    val Array(database, table) = tableName.split("\\.")
    metastoreClient.getTable(database, table).getPartitionKeys.map(_.getName)
}
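A hypothetical call (using the wmf.webrequest table that appears in a later snippet):

// Hypothetical usage: print the partition columns of wmf.webrequest.
getHivePartitionColumnNames("wmf.webrequest").foreach(println)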
# on stat1007:
export PYSPARK_DRIVER_PYTHON=jupyter
export PYSPARK_DRIVER_PYTHON_OPTS="notebook --port 8123 --ip='localhost' --no-browser"
pyspark2 --master yarn --executor-memory 8G --executor-cores 2 --driver-memory 8G --conf spark.driver.maxResultSize=8G
Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
[I 20:31:13.066 NotebookApp] Serving notebooks from local directory: /srv/home/bmansurov
[I 20:31:13.066 NotebookApp] 0 active kernels
[I 20:31:13.066 NotebookApp] The Jupyter Notebook is running at:
[I 20:31:13.066 NotebookApp] http://localhost:8123/?token=306e5151ddea927b0d2acbee8bac4158f685cfd961a8e23d
In {T204177} we received a bunch of new hardware. We'd like to use 5 of these machines as worker nodes in a new, separate Hadoop cluster that will run Presto. This cluster will host public data that will be queryable from Cloud VPS networks.
This Hadoop cluster will be called 'cloud-analytics'. Its Hadoop 'cluster name' in Hadoop configs will be 'cloud-analytics-eqiad', to match the naming convention we have been using for other clusters, e.g. Kafka, Zookeeper, Druid, etc.
The cloud-analytics nodes will also run other software (Hive, Presto, etc.). The workers will run on bare metal, but the masters can run on Ganeti VPS instances. I'll file a separate task for the Ganeti instances.
This task is to rack and set up 5 worker nodes. They should be named ca-worker100[1-5] (ca = cloud-analytics).
**These nodes should be in the Analytics VLAN**.
import org.apache.spark.sql.functions.{array_contains, col, explode}

val webrequest = spark.table("wmf.webrequest").where(...) // (filter elided in the original)

// tags is an array column, so explode it to get a distinct list of individual tags.
val tags = webrequest.select(explode(col("tags")).as("tag")).distinct.collect.map(_.getString(0))

// Pair each tag with the subset of webrequest rows whose tags array contains it.
val tagGroupDfs = tags.map(tag => (tag, webrequest.where(array_contains(col("tags"), tag))))
val partDfs = tagGroupDfs.map(t => {
    val tag = t._1
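The snippet cuts off here; a hypothetical continuation (not part of the original gist) would write each tag's matching rows out separately, e.g.:

// Hypothetical completion: write each tag's rows to its own Parquet directory.
tagGroupDfs.foreach { case (tag, df) =>
    df.write.mode("overwrite").parquet(s"/tmp/webrequest_tags/tag=${tag}")
}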
ottomata / spark-amm
Last active September 21, 2018 18:01
spark-shell with ammonite and default spark config loading
#!/usr/bin/env bash
export SPARK_HOME="${SPARK_HOME:-/usr/lib/spark2}"
export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}"/conf}"
source "${SPARK_HOME}/bin/load-spark-env.sh"
export HIVE_CONF_DIR="${SPARK_CONF_DIR}"

# Path to the Ammonite binary and to a predef file that bootstraps Spark.
AMMONITE=./amm
SPARK_PREDEF=~/spark.predef.scala
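The predef file itself isn't shown in this excerpt; a minimal sketch of what ~/spark.predef.scala might contain (an assumption, not the gist's actual contents):

// Hypothetical ~/spark.predef.scala: bootstrap a SparkSession inside the
// Ammonite REPL, picking up the config loaded by load-spark-env.sh.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
    .appName("spark-amm")
    .enableHiveSupport()
    .getOrCreate()

import spark.implicits._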
ottomata / count_eventlogging-valid-mixed_schemas.scala
Last active November 3, 2021 13:13
Spark Structured Streaming example - word count in JSON field in Kafka
// spark2-shell --jars /home/otto/kafka-clients-1.1.1.jar,/home/otto/spark-sql-kafka-0-10_2.11-2.3.1.jar
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.StreamingQueryListener._
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.from_json
// Subscribe to eventlogging-valid-mixed using Spark structured streaming
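The preview ends at the comment above; a minimal sketch of the subscribe-and-count step (the broker address and JSON schema here are assumptions, not the gist's actual code):

// Hypothetical continuation: count events per schema name found in the
// JSON 'schema' field of each Kafka message value.
val schema = new StructType().add("schema", StringType)

val counts = spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092") // assumed broker address
    .option("subscribe", "eventlogging-valid-mixed")
    .load()
    .select(from_json($"value".cast("string"), schema).as("event"))
    .groupBy($"event.schema")
    .count()

counts.writeStream.outputMode("complete").format("console").start()

The eventlogging_* names listed below look like per-schema output from such a count (schema names prefixed with eventlogging_).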
eventlogging_AdvancedSearchRequest
eventlogging_CentralAuth
eventlogging_CentralNoticeImpression
eventlogging_ChangesListClickTracking
eventlogging_ChangesListFilterGrouping
eventlogging_ContentTranslation
eventlogging_ContentTranslationCTA
eventlogging_EchoInteraction
eventlogging_EchoMail
eventlogging_EditConflict
{
    "metadata": {},
    "display_name": "Spark YARN - PySpark",
    "interrupt_mode": "signal",
    "language": "python",
    "env": {
        "PYTHONPATH": "/usr/lib/spark2/python/lib/py4j-0.10.6-src.zip:/usr/lib/spark2/python",
        "__TOREE_SPARK_OPTS__": "--master=yarn --conf spark.executorEnv.PYTHONPATH=/usr/lib/spark2/python/lib/py4j-0.10.6-src.zip:/usr/lib/spark2/python",
        "DEFAULT_INTERPRETER": "PySpark",
        "SPARK_HOME": "/usr/lib/spark2"
    }
}