# Note: This file is managed by Puppet.
# Hue configuration file
# ===================================
#
# For complete documentation about the contents of this file, run
# $ <hue_root>/build/env/bin/hue config_help
#
# All .ini files under the current directory are treated equally. Their
# contents are merged to form the Hue configuration, which can
{
  "rev_timestamp": "2017-06-06T20:18:06Z",
  "rev_sha1": "fy9n1sdjob23lhe51hspnnvqmicrgm8",
  "rev_parent_id": 137977799,
  "rev_minor_edit": false,
  "rev_len": 14267,
  "rev_id": 137981314,
  "rev_content_model": "wikitext",
  "rev_content_format": "wikitext",
  "comment": "",
set session sql_log_bin=0;
-- Dropping EventLogging tables with no events more recent than 90 days ago (since )...
DROP TABLE `BannerImpression_5329872`;
DROP TABLE `CentralNoticeBannerHistory_13447710`;
DROP TABLE `ChangesListHighlights_16449602`;
DROP TABLE `ChangesListHighlights_16484288`;
DROP TABLE `CommandInvocation_15237653`;
DROP TABLE `CompletionSuggestions_13424343`;
DROP TABLE `CompletionSuggestions_13630018`;
DROP TABLE `ContentTranslationCTA_11616099`;
{
  "dataSources" : [
    {
      "spec" : {
        "dataSchema" : {
          "dataSource" : "banner_activity_minutely",
          "metricsSpec" : [
            {
              "name" : "request_count",
              "type" : "longSum",
// // TODO: instead of the following, what if we took the original outputDf, and
// // added null, cast as the proper type, for each table field not present in
// // outputDf's schema. Then we shouldn't need to re-read the data with the
// val table = hiveContext.table(tableName)
//
// // This is kinda working? Maybe we need:
//// val emptyTableDf = table.where("1=0")
//// val finalDf = emptyTableDf.unionAll(outputDf)
//
// // make sure the output df has the table's fields
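A minimal sketch of the null-cast idea from the TODO above, using the Spark DataFrame API (alignToTableSchema is a hypothetical helper name; tableDf would be the result of hiveContext.table(tableName)):

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, lit}

// Add any table columns missing from outputDf as typed nulls, then
// select in the table's column order so unionAll lines up positionally.
def alignToTableSchema(outputDf: DataFrame, tableDf: DataFrame): DataFrame = {
  val withMissing = tableDf.schema.fields.foldLeft(outputDf) { (df, field) =>
    if (df.columns.contains(field.name)) df
    else df.withColumn(field.name, lit(null).cast(field.dataType))
  }
  withMissing.select(tableDf.schema.fieldNames.map(col): _*)
}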
object CoolStuff {
  implicit class StringImplicits(s: String) {
    def withCool(s2: String): String = s + " COOL " + s2
  }
  implicit class SeqStringImplicits(strings: Seq[String]) {
    def joinEm(sep: String = " | "): String = {
      // error: not found: value withCool
      // strings.map(withCool).mkString(sep)
      // But this will work: withCool is a method added to String by the
      // implicit class, so call it on each element ("yeah" is arbitrary):
      strings.map(_.withCool("yeah")).mkString(sep)
    }
  }
}
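A quick usage sketch of the implicits above (the string values are arbitrary):

import CoolStuff._

println("a".withCool("b"))        // prints: a COOL b
println(Seq("a", "b").joinEm())   // prints: a COOL yeah | b COOL yeah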
whitelist='(DiacriticsPoll|DiacriticsVisibility|Echo|EchoInteraction|Echo|EchoMail|Echo|Edit|EditorActivation|FlowReplies|GatherClicks|GatherFlags|GettingStartedNavbarNoArticle|GettingStartedOnRedirect|GettingStartedRedirectImpression|GuidedTourButtonClick|GuidedTour|GuidedTourExited|GuidedTourExternalLinkActivation|GuidedTourGuiderHidden|GuidedTourGuiderImpression|GuidedTour|GuidedTourInternalLinkActivation|GuidedTour|MediaWikiInstallPingback|MobileOptionsTracking|MobileWebBrowse|MobileWebClickTracking|MobileWebCta|MobileWebDiffClickTracking|MobileWebEditing|MobileWebInfobox|MobileWebLanguageSwitcher|MobileWebMainMenuClickTracking|MobileWebSearch|MobileWebSectionUsage|MobileWebUIClickTracking|MobileWebUploads|MobileWebWatching|MobileWebWatchlistClickTracking|MobileWebWikiGrok|MobileWebWikiGrokError|MobileWebWikiGrok|MobileWebWikiGrokResponse|MobileWebWikiGrok|MobileWikiAppAppearanceSettings|MobileWikiAppArticleSuggestions|MobileWikiAppCreateAccount|MobileWikiAppDailyStats|MobileWikiAppEdit|MobileWikiAppInsta
from sseclient import SSEClient
import json

url = 'https://stream.wikimedia.org/v2/stream/recentchange'

for event in SSEClient(url):
    if event.event == 'message' and event.data:
        change = json.loads(event.data)
        print('%(user)s edited %(title)s' % change)
    elif event.event == 'error':
        print('--- Encountered error', event.data)
# Where to output varnish log lines:
# kafka - (default) send to kafka broker
# stdout - just print to stdout (behave like varnishncsa)
# null - (test) collect all tags specified by format but don't output anything
output = stdout
# Log formatter
format.type = json
format = %{fake_tag0@hostname?fake-hostname-change-me-if-you-want}x %{@sequence!num?0}n %{end:%FT%T@dt}t %{Varnish:time_firstbyte@time_firstbyte!num?0.0}x %{X-Client-IP@ip}o %{X-Cache-Status@cache_status}o %{@http_status}s %{@response_size!num?0}b %{@http_method}m %{Host@uri_host}i %{@uri_path}U %{@uri_query}q %{Content-Type@content_type}o %{Referer@referer}i %{X-Forwarded-For@x_forwarded_for}i %{User-Agent@user_agent}i %{Accept-Language@accept_language}i %{X-Analytics@x_analytics}o %{Range@range}i %{X-Cache@x_cache}o
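Each %{field@name}x-style token above maps one varnish log field to a JSON key, so with format.type = json an output line would look roughly like the following. All values here are invented for illustration; the defaults (fake-hostname-change-me-if-you-want, 0, 0.0) come from the ?default suffixes in the format string:

{"hostname": "fake-hostname-change-me-if-you-want", "sequence": 0, "dt": "2017-06-06T20:18:06", "time_firstbyte": 0.0, "ip": "127.0.0.1", "cache_status": "hit-front", "http_status": "200", "response_size": 1532, "http_method": "GET", "uri_host": "en.wikipedia.org", "uri_path": "/wiki/Main_Page", "uri_query": "", "content_type": "text/html", "referer": "-", "x_forwarded_for": "-", "user_agent": "ExampleBot/1.0", "accept_language": "en", "x_analytics": "-", "range": "-", "x_cache": "-"}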
package org.wikimedia.analytics.refinery.job
import com.github.nscala_time.time.Imports.{LocalDate, Period}
import com.twitter.algebird.{QTree, QTreeSemigroup}
import org.apache.hadoop.fs.Path
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import scopt.OptionParser
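These imports suggest a Spark job that computes approximate quantiles by merging Algebird QTrees across an RDD. A minimal, self-contained sketch of that pattern, assuming nothing about the original job beyond its imports (the data, the k = 6 resolution, and the object name are illustrative):

import com.twitter.algebird.{QTree, QTreeSemigroup}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object QuantileSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("QuantileSketch"))
    // Toy data; the real job would load something like session lengths.
    val values: RDD[Double] = sc.parallelize(Seq(0.3, 1.2, 0.7, 4.5, 2.2))

    // One QTree per value, merged pairwise by the semigroup.
    // k sets the tree's resolution: higher k, tighter quantile bounds.
    val semigroup = new QTreeSemigroup[Double](6)
    val qtree = values.map(QTree(_)).reduce(semigroup.plus)

    // quantileBounds returns (lower, upper) bounds on the true quantile.
    val (lower, upper) = qtree.quantileBounds(0.5)
    println(s"median is between $lower and $upper")
    sc.stop()
  }
}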