This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'use strict'; | |
const bunyan = require('bunyan'); | |
const _ = require('lodash'); | |
const { | |
resolveUri | |
} = require('./lib/event-util'); | |
const EventValidator = require('./lib/EventValidator'); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.collection.JavaConversions._ | |
import org.apache.hadoop.hive.metastore.HiveConf | |
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient | |
import org.apache.spark.sql.SparkSession | |
def getHivePartitionColumnNames(tableName: String): Seq[String] = { | |
val spark: SparkSession = SparkSession.builder().enableHiveSupport().getOrCreate() | |
val hiveConf = new HiveConf(spark.sparkContext.hadoopConfiguration, classOf[HiveConf]) | |
val metastoreClient = new HiveMetaStoreClient(hiveConf) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# on stat1007: | |
export PYSPARK_DRIVER_PYTHON=jupyter | |
export PYSPARK_DRIVER_PYTHON_OPTS="notebook --port 8123 --ip='localhost' --no-browser" | |
pyspark2 --master yarn --executor-memory 8G --executor-cores 2 --driver-memory 8G --conf spark.driver.maxResultSize=8G | |
Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8 | |
[I 20:31:13.066 NotebookApp] Serving notebooks from local directory: /srv/home/bmansurov | |
[I 20:31:13.066 NotebookApp] 0 active kernels | |
[I 20:31:13.066 NotebookApp] The Jupyter Notebook is running at: | |
[I 20:31:13.066 NotebookApp] http://localhost:8123/?token=306e5151ddea927b0d2acbee8bac4158f685cfd961a8e23d |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
In {T204177} we received a bunch of new hardware. We'd like to use 5 of these as worker nodes as part of a new and separate Hadoop cluster that will run Presto. This cluster will be used to host public data that will be queryable from Cloud VPS networks.
This Hadoop cluster will be called 'cloud-analytics'. Its Hadoop 'cluster name' in Hadoop configs will be 'cloud-analytics-eqiad', to match the naming convention we have been using for other clusters, e.g. Kafka, Zookeeper, Druid, etc.
The cloud-analytics nodes will also run other software (Hive, Presto, etc.). The workers will run on bare metal, but the masters can run on Ganeti VPS instances. I'll file a separate task for the Ganeti instances.
This task is to rack and set up 5 worker nodes. They should be named ca-worker100[1-5] (cloud-analytics). | |
**These nodes should be in the Analytics VLAN**. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
val webrequest = spark.table("wmf.webrequest").where(...) | |
// tags is an array right? need to concat these somehow, but you get the point. | |
// Need just distinct list of tags | |
val tags = webrequest.select("tags").distinct | |
val tagGroupsDfs = tags.map(tag => (tag, webrequest.where(s"${tag} in tags")) ) | |
val partDfs = tagGroupDfs.map(t => { | |
tag = t._1 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Environment setup for launching an Ammonite shell against Spark 2.
# SPARK_HOME / SPARK_CONF_DIR can be pre-set by the caller; otherwise
# they default to the system Spark 2 install.
export SPARK_HOME="${SPARK_HOME:-/usr/lib/spark2}"
export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}"/conf}"
# Quote the sourced path: an unquoted ${SPARK_HOME} would word-split /
# glob-expand if the path ever contained spaces or wildcard characters.
source "${SPARK_HOME}/bin/load-spark-env.sh"
# Hive should read its configuration from the Spark conf directory.
export HIVE_CONF_DIR="${SPARK_CONF_DIR}"
# Ammonite launcher binary and the Scala predef loaded at shell startup.
AMMONITE=./amm
SPARK_PREDEF=~/spark.predef.scala
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// spark2-shell --jars /home/otto/kafka-clients-1.1.1.jar,/home/otto/spark-sql-kafka-0-10_2.11-2.3.1.jar | |
import org.apache.spark.sql.streaming._ | |
import org.apache.spark.sql.streaming.StreamingQueryListener._ | |
import org.apache.spark.sql._ | |
import org.apache.spark.sql.types._ | |
import org.apache.spark.sql.functions.from_json | |
// Subscribe to eventlogging-valid-mixed using Spark structured streaming |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// spark2-shell --jars /home/otto/kafka-clients-1.1.1.jar,/home/otto/spark-sql-kafka-0-10_2.11-2.3.1.jar | |
import org.apache.spark.sql.streaming._ | |
import org.apache.spark.sql.streaming.StreamingQueryListener._ | |
import org.apache.spark.sql._ | |
import org.apache.spark.sql.types._ | |
import org.apache.spark.sql.functions.from_json | |
// Subscribe to eventlogging-valid-mixed using Spark structured streaming |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
eventlogging_AdvancedSearchRequest | |
eventlogging_CentralAuth | |
eventlogging_CentralNoticeImpression | |
eventlogging_ChangesListClickTracking | |
eventlogging_ChangesListFilterGrouping | |
eventlogging_ContentTranslation | |
eventlogging_ContentTranslationCTA | |
eventlogging_EchoInteraction | |
eventlogging_EchoMail | |
eventlogging_EditConflict |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": {}, | |
"display_name": "Spark YARN - PySpark", | |
"interrupt_mode": "signal", | |
"language": "python", | |
"env": { | |
"PYTHONPATH": "/usr/lib/spark2/python/lib/py4j-0.10.6-src.zip:/usr/lib/spark2/python", | |
"__TOREE_SPARK_OPTS__": "--master=yarn --conf spark.executorEnv.PYTHONPATH=/usr/lib/spark2/python/lib/py4j-0.10.6-src.zip:/usr/lib/spark2/python", | |
"DEFAULT_INTERPRETER": "PySpark", | |
"SPARK_HOME": "/usr/lib/spark2", |