Skip to content

Instantly share code, notes, and snippets.

View ottomata's full-sized avatar

Andrew Otto ottomata

View GitHub Profile
scala> t.printSchema
root
|-- dt: string (nullable = true)
|-- event: struct (nullable = true)
| |-- apiMode: long (nullable = true)
| |-- appInstallID: string (nullable = true)
| |-- fromBack: long (nullable = true)
| |-- fromDisambig: long (nullable = true)
| |-- fromExternal: long (nullable = true)
| |-- fromHistory: long (nullable = true)
/**
 * Returns the field names that occur in both schemas, as listed by fieldSelect.
 * The result keeps the order in which the names appear for newSchema;
 * newSchema is expected to be a superset of oldSchema.
 *
 * @param newSchema the newer schema, expected to contain every field of oldSchema
 * @param oldSchema the older schema whose field names are looked up in newSchema
 * @return field names common to both schemas
 */
def orderedSelect(newSchema: StructType, oldSchema: StructType): Seq[String] = {
  val newFieldNames = fieldSelect(newSchema)
  val oldFieldNames = fieldSelect(oldSchema)
  newFieldNames.intersect(oldFieldNames)
}
/** Build a list of field names, with nested struct fields */
def fieldSelect(schema: StructType): Seq[String] = {
schema.flatMap { field =>
field.dataType match {
case struct: StructType => fieldSelect(struct).map(field.name + "." + _)
$ spark-submit --class org.wikimedia.analytics.refinery.job.JsonRefine /srv/deployment/analytics/refinery/artifacts/refinery-job.jar --help
JSON Datasets -> Partitioned Hive Parquet tables.
Given an input base path, this will search all subdirectories for input
partitions to convert to Parquet backed Hive tables. This was originally
written to work with JSON data imported via Camus into hourly buckets, but
should be configurable to work with any regular import directory hierarchy.
/**
* Given a fully qualified String package.ObjectName and String method name, this
* function will return a scala.reflect.runtime.universe.MethodMirror that can be
* used for calling the method on the object. Note that MethodMirror is not a direct
* reference to the actual method, and as such does not have compile time type
* and signature checking. You must ensure that you call the method with exactly the
* same arguments and types that the method expects, or you will get a runtime exception.
*
* @param objectName Fully qualified name for an object, e.g. org.wikimedia.analytics.refinery.core.DeduplicateEventLogging
* @param methodName Name of method in the object. Default "apply".
mediawiki_job_assembleuploadchunks
mediawiki_job_bouncehandlerjob
mediawiki_job_bouncehandlernotificationjob
mediawiki_job_categorymembershipchange
mediawiki_job_cdnpurge
mediawiki_job_changenotification
mediawiki_job_cirrussearchdeletepages
mediawiki_job_cirrussearchelasticawrite
mediawiki_job_cirrussearchincominglinkcount
mediawiki_job_cirrussearchlinksupdate
import org.apache.spark.sql.types._
val ua_schema = StructType(
StructField("os_minor", StringType, false) ::
StructField("is_bot", BooleanType, false) ::
StructField("os_major", StringType, false) ::
StructField("device_family", StringType, false) ::
StructField("os_family", StringType, false) ::
StructField("browser_family", StringType, false) ::
StructField("browser_minor", StringType, false) ::
{
"database": "enwiki",
"mediawiki_signature": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJtZXRhIjp7InVyaSI6Imh0dHBzOlwvXC9lbi53aWtpcGVkaWEub3JnXC93aWtpXC9Vc2VyX3RhbGs6UEFLSElHSFdBWSIsInRvcGljIjoibWVkaWF3aWtpLmpvYi5FY2hvTm90aWZpY2F0aW9uRGVsZXRlSm9iIiwicmVxdWVzdF9pZCI6IldmdFNrZ3BBSURvQUFESDNCMDhBQUFCQSIsImlkIjoiNWJjYmMwOTktYmZmMS0xMWU3LWJkNDctOTBiMTFjMjc5ZDI0IiwiZHQiOiIyMDE3LTExLTAyVDE3OjE1OjAwKzAwOjAwIiwiZG9tYWluIjoiZW4ud2lraXBlZGlhLm9yZyJ9LCJkYXRhYmFzZSI6ImVud2lraSIsInR5cGUiOiJFY2hvTm90aWZpY2F0aW9uRGVsZXRlSm9iIiwicGFnZV9uYW1lc3BhY2UiOjMsInBhZ2VfdGl0bGUiOiJVc2VyX3RhbGs6UEFLSElHSFdBWSIsInBhcmFtcyI6eyJ1c2VySWRzIjp7IjMyMjk2NTM2IjozMjI5NjUzNn0sInJlcXVlc3RJZCI6IldmdFNrZ3BBSURvQUFESDNCMDhBQUFCQSJ9fQ.YYv68_PGUMdokN8pDdJRqi51m9ecx7PfgWSOWZ-p1fA",
"meta": {
"domain": "en.wikipedia.org",
"dt": "2017-11-02T17:15:00+00:00",
"id": "5bcbc099-bff1-11e7-bd47-90b11c279d24",
"request_id": "WftSkgpAIDoAADH3B08AAABA",
"schema_uri": "mediawiki/job/1",
"topic": "mediawiki.job.EchoNotificationDeleteJob",
kafka_consumer_consumer_coordinator_metrics_assigned_partitions{client_id="kafka-mirror-k1_to_k2-0",} 2.0
kafka_consumer_consumer_coordinator_metrics_assigned_partitions{client_id="kafka-mirror-k1_to_k2-1",} 1.0
kafka_consumer_consumer_coordinator_metrics_commit_latency_avg{client_id="kafka-mirror-k1_to_k2-0",} 5.2727272727272725
kafka_consumer_consumer_coordinator_metrics_commit_latency_avg{client_id="kafka-mirror-k1_to_k2-1",} 3.75
kafka_consumer_consumer_coordinator_metrics_commit_latency_max{client_id="kafka-mirror-k1_to_k2-0",} 10.0
kafka_consumer_consumer_coordinator_metrics_commit_latency_max{client_id="kafka-mirror-k1_to_k2-1",} 12.0
kafka_consumer_consumer_coordinator_metrics_commit_rate{client_id="kafka-mirror-k1_to_k2-0",} 0.1967940460855875
kafka_consumer_consumer_coordinator_metrics_commit_rate{client_id="kafka-mirror-k1_to_k2-1",} 0.19450049840752717
kafka_consumer_consumer_coordinator_metrics_heartbeat_rate{client_id="kafka-mirror-k1_to_k2-0",} 0.3336113427856547
kafka_consumer_consumer_coordin
# TYPE kafka_consumer_consumer_node_metrics_outgoing_byte_rate untyped
kafka_consumer_consumer_node_metrics_outgoing_byte_rate{client_id="kafka-mirror-k1_to_k2-0",node_id="node-2147483636",} 62.03594874294068
kafka_consumer_consumer_node_metrics_outgoing_byte_rate{client_id="kafka-mirror-k1_to_k2-1",node_id="node-2147483636",} 70.35885843458604
kafka_consumer_consumer_node_metrics_outgoing_byte_rate{client_id="kafka-mirror-k1_to_k2-0",node_id="node--1",} 0.0
kafka_consumer_consumer_node_metrics_outgoing_byte_rate{client_id="kafka-mirror-k1_to_k2-0",node_id="node-11",} 190.15470082172757
kafka_consumer_consumer_node_metrics_outgoing_byte_rate{client_id="kafka-mirror-k1_to_k2-1",node_id="node--1",} 0.0
kafka_consumer_consumer_node_metrics_outgoing_byte_rate{client_id="kafka-mirror-k1_to_k2-1",node_id="node-11",} 203.18866253321522
# HELP kafka_consumer_consumer_coordinator_metrics_commit_latency_max The max time taken for a commit request (kafka.consumer<type=consumer-coordinator-metrics, client-id=kafka-mirror
event.gif?%7B%22event%22%3A%7B%22pageTitleSource%22%3A%22Jeff%20Flake%22%2C%22namespaceIdSource%22%3A0%2C%22pageIdSource%22%3A407926%2C%22isAnon%22%3Atrue%2C%22popupEnabled%22%3Afalse%2C%22pageToken%22%3A%229647b77d3d7080ca%22%2C%22sessionToken%22%3A%22d5fc894a80aae522%22%2C%22previewCountBucket%22%3A%220%20previews%22%2C%22hovercardsSuppressedByGadget%22%3Afalse%2C%22totalInteractionTime%22%3A15%2C%22action%22%3A%22dwelledButAbandoned%22%2C%22linkInteractionToken%22%3A%224db858f0d8e9815a%22%2C%22pageTitleHover%22%3A%22Public_Policy_Polling%22%2C%22namespaceIdHover%22%3A0%7D%2C%22revision%22%3A16364296%2C%22schema%22%3A%22Popups%22%2C%22webHost%22%3A%22en.wikipedia.org%22%2C%22wiki%22%3A%22enwiki