Skip to content

Instantly share code, notes, and snippets.

-- make the session dataset smaller to be able to try things fast
--create table session_tryouts as select * from classifier_data_sorted a where a.sessionId in (select distinct s.sessionId from classifier_data_sorted s limit 100);
drop table if exists classifier_data_label;
create table
classifier_data_label
as
select
sessionId,
(unix_timestamp(max(ts)) - unix_timestamp( min(ts))) as length,
spark-submit --class org.wikimedia.analytics.refinery.job.AppSessionMetrics --master yarn --num-executors=6 --executor-cores=2 --executor-memory=2g /mnt/hdfs/tmp/nuria/jars/refinery-job-0
.0.10-SNAPSHOT.jar hdfs://analytics-hadoop/tmp/mobile-apps-sessions 2015 03 10
@nuria
nuria / randomIndentifierTest.html
Last active August 23, 2018 21:38
Performance tests. RandomGenerator with 80 bits of entropy.
<html>
<head>
</head>
<body>
<script>
function generateRandomWith5_16Bits() {
var rnds, i,
@nuria
nuria / AppSessionMetrics.scala
Last active January 22, 2018 00:38
AppSessions That Runs On Spark Shell. See issue with quantiles: https://github.com/twitter/algebird/issues/517
import com.github.nscala_time.time.Imports.{LocalDate, Period}
import com.twitter.algebird.{QTree, QTreeSemigroup}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import scopt.OptionParser
import scala.collection.immutable.HashMap
/**
@nuria
nuria / gist:ec7fdf8ce38ac622b7eda8a8ed9758b6
Created December 5, 2017 21:20
Hive count query with group by
use wmf;
with hits as (
SELECT
geocoded_data['country_code'] as country,
geocoded_data['country'] country_name,
SUM(CASE WHEN hostname NOT LIKE 'cp3%' AND hostname NOT LIKE 'amssq%' THEN 1 ELSE 0 END) AS hits_from_this_country_not_through_amsterdam,
SUM(CASE WHEN hostname LIKE 'cp3%' OR hostname LIKE 'amssq%' THEN 1 ELSE 0 END) AS hits_from_this_country_from_amsterdam
FROM wmf.webrequest
WHERE TRUE
@nuria
nuria / Avro schema for PageContentSaveComplete
Last active August 11, 2017 20:02
Create avro table on top of avro files
{
"type" : "record",
"name" : "AutoGeneratedSchema",
"doc" : "Sqoop import of QueryResult",
"fields" : [ {
"name" : "id",
"type" : [ "null", "int" ],
"default" : null,
"columnName" : "id",
"sqlType" : "4"
@nuria
nuria / TestRuntimeInitializationOfChain.java
Last active May 11, 2017 09:50
Snippet to get classes belonging to a package at runtime that have a certain annotation
package org.wikimedia.analytics.refinery.tag;
import com.google.common.collect.ImmutableSet;
import com.google.common.reflect.ClassPath;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.reflections.*;
@nuria
nuria / gist:3ca0f0f5ec70eb606b4800284ff151fd
Created May 9, 2017 09:05
Daily Unique Devices Variation per number of uniques
#!/usr/local/bin/python
# unique devices variation study using daily data
# we account for weekly variations
# and try to see when the number of uniques
# varies too much to be a quality measurement
# see: https://wikitech.wikimedia.org/w/index.php?title=Analytics/Data_Lake/Traffic/Unique_Devices/Last_access_solution
from operator import itemgetter
from datetime import datetime
from datetime import date
import numpy
@nuria
nuria / gist:9d4daaac2e910d02917e0fe740159ef1
Created November 2, 2016 19:44
hive group and count example
SELECT
month,
day,
SUM(CASE WHEN (user_agent LIKE '%iPhone%') THEN 1 ELSE 0 END) AS iphone,
SUM(CASE WHEN (user_agent LIKE '%iOS%') THEN 1 ELSE 0 END) AS iOS
FROM wmf.webrequest
WHERE webrequest_source = 'text'
AND year = 2016
AND month IN (9, 10)
AND (user_agent like '%iOS%' OR user_agent like '%iPhone%')
@nuria
nuria / metric_logster.js
Created October 28, 2016 20:41
JSON object parser. Each leaf node of the object will be keyed by a concatenated key made up of all parent keys
/**
Parses a JSON object.
The object will be traversed, and each leaf node of the object will
be keyed by a concatenated key made up of all parent keys.
**/
function MetricLogster(reporter) {
}