Andrew Otto (ottomata)
'use strict';

// Logging, utility, and event schema validation dependencies.
const bunyan = require('bunyan');
const _ = require('lodash');
const {
    resolveUri
} = require('./lib/event-util');
const EventValidator = require('./lib/EventValidator');
import scala.collection.JavaConversions._
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient
import org.apache.spark.sql.SparkSession

// Look up a table's partition column names directly from the Hive metastore.
// (The body after the client setup is a completion sketch; it assumes a fully
// qualified "db.table" name.)
def getHivePartitionColumnNames(tableName: String): Seq[String] = {
    val spark: SparkSession = SparkSession.builder().enableHiveSupport().getOrCreate()
    val hiveConf = new HiveConf(spark.sparkContext.hadoopConfiguration, classOf[HiveConf])
    val metastoreClient = new HiveMetaStoreClient(hiveConf)
    val Array(database, table) = tableName.split("\\.")
    metastoreClient.getTable(database, table).getPartitionKeys.map(_.getName)
}
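A hypothetical call (using the wmf.webrequest table that appears in a later snippet):

// Hypothetical usage: print the partition columns of wmf.webrequest.
getHivePartitionColumnNames("wmf.webrequest").foreach(println)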
# on stat1007:
export PYSPARK_DRIVER_PYTHON=jupyter
export PYSPARK_DRIVER_PYTHON_OPTS="notebook --port 8123 --ip='localhost' --no-browser"
pyspark2 --master yarn --executor-memory 8G --executor-cores 2 --driver-memory 8G --conf spark.driver.maxResultSize=8G
Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
[I 20:31:13.066 NotebookApp] Serving notebooks from local directory: /srv/home/bmansurov
[I 20:31:13.066 NotebookApp] 0 active kernels
[I 20:31:13.066 NotebookApp] The Jupyter Notebook is running at:
[I 20:31:13.066 NotebookApp] http://localhost:8123/?token=306e5151ddea927b0d2acbee8bac4158f685cfd961a8e23d
In {T204177} we received a bunch of new hardware. We'd like to use 5 of these machines as worker nodes in a new, separate Hadoop cluster that will run Presto. This cluster will host public data that will be queryable from Cloud VPS networks.
This Hadoop cluster will be called 'cloud-analytics'. Its Hadoop 'cluster name' in Hadoop configs will be 'cloud-analytics-eqiad', to match the naming convention we have been using for other clusters, e.g. Kafka, Zookeeper, Druid, etc.
The cloud-analytics nodes will also run other software (Hive, Presto, etc.). The workers will run on bare metal, but the masters can run on Ganeti VPS instances. I'll file a separate task for the Ganeti instances.
This task is to rack and set up 5 worker nodes. They should be named ca-worker100[1-5] (ca = cloud-analytics).
**These nodes should be in the Analytics VLAN**.
import org.apache.spark.sql.functions.{array_contains, col, explode}

val webrequest = spark.table("wmf.webrequest").where(...) // (filter elided in the original)

// tags is an array column, so explode it to get a distinct list of individual tags.
val tags = webrequest.select(explode(col("tags")).as("tag")).distinct.collect.map(_.getString(0))

// Pair each tag with the subset of webrequest rows whose tags array contains it.
val tagGroupDfs = tags.map(tag => (tag, webrequest.where(array_contains(col("tags"), tag))))
val partDfs = tagGroupDfs.map(t => {
    val tag = t._1
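The snippet cuts off here; a hypothetical continuation (not part of the original gist) would write each tag's matching rows out separately, e.g.:

// Hypothetical completion: write each tag's rows to its own Parquet directory.
tagGroupDfs.foreach { case (tag, df) =>
    df.write.mode("overwrite").parquet(s"/tmp/webrequest_tags/tag=${tag}")
}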
ottomata / spark-amm
Last active September 21, 2018 18:01
spark-shell with ammonite and default spark config loading
#!/usr/bin/env bash
export SPARK_HOME="${SPARK_HOME:-/usr/lib/spark2}"
export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}"/conf}"
source "${SPARK_HOME}/bin/load-spark-env.sh"
export HIVE_CONF_DIR="${SPARK_CONF_DIR}"

# Path to the Ammonite binary and to a predef file that bootstraps Spark.
AMMONITE=./amm
SPARK_PREDEF=~/spark.predef.scala
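The predef file itself isn't shown in this excerpt; a minimal sketch of what ~/spark.predef.scala might contain (an assumption, not the gist's actual contents):

// Hypothetical ~/spark.predef.scala: bootstrap a SparkSession inside the
// Ammonite REPL, picking up the config loaded by load-spark-env.sh.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
    .appName("spark-amm")
    .enableHiveSupport()
    .getOrCreate()

import spark.implicits._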
ottomata / count_eventlogging-valid-mixed_schemas.scala
Last active November 3, 2021 13:13
Spark Structured Streaming example - word count in JSON field in Kafka
// spark2-shell --jars /home/otto/kafka-clients-1.1.1.jar,/home/otto/spark-sql-kafka-0-10_2.11-2.3.1.jar
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.StreamingQueryListener._
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.from_json
// Subscribe to eventlogging-valid-mixed using Spark structured streaming
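The preview ends at the comment above; a minimal sketch of the subscribe-and-count step (the broker address and JSON schema here are assumptions, not the gist's actual code):

// Hypothetical continuation: count events per schema name found in the
// JSON 'schema' field of each Kafka message value.
val schema = new StructType().add("schema", StringType)

val counts = spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092") // assumed broker address
    .option("subscribe", "eventlogging-valid-mixed")
    .load()
    .select(from_json($"value".cast("string"), schema).as("event"))
    .groupBy($"event.schema")
    .count()

counts.writeStream.outputMode("complete").format("console").start()

The eventlogging_* names listed below look like per-schema output from such a count (schema names prefixed with eventlogging_).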
eventlogging_AdvancedSearchRequest
eventlogging_CentralAuth
eventlogging_CentralNoticeImpression
eventlogging_ChangesListClickTracking
eventlogging_ChangesListFilterGrouping
eventlogging_ContentTranslation
eventlogging_ContentTranslationCTA
eventlogging_EchoInteraction
eventlogging_EchoMail
eventlogging_EditConflict
{
    "metadata": {},
    "display_name": "Spark YARN - PySpark",
    "interrupt_mode": "signal",
    "language": "python",
    "env": {
        "PYTHONPATH": "/usr/lib/spark2/python/lib/py4j-0.10.6-src.zip:/usr/lib/spark2/python",
        "__TOREE_SPARK_OPTS__": "--master=yarn --conf spark.executorEnv.PYTHONPATH=/usr/lib/spark2/python/lib/py4j-0.10.6-src.zip:/usr/lib/spark2/python",
        "DEFAULT_INTERPRETER": "PySpark",
        "SPARK_HOME": "/usr/lib/spark2"
    }
}