Andrew Otto (ottomata)
@ottomata
ottomata / Dockerfile
Last active December 13, 2021 17:21
Build a packed conda env .tgz from conda environment.yml and/or pip requirements.txt
# See also: https://pythonspeed.com/articles/conda-docker-image-size/
# The conda env build stage image:
# 1. Create conda env
# 2. Optionally install conda dependencies
# 3. Optionally install pip dependencies
FROM continuumio/miniconda3 AS conda_build_env
# Create a bare conda env.
# We need at minimum python, pip, conda-pack, and gcc/g++ (to build any binary pip packages later).
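The rest of the Dockerfile is cut off in this preview. As a rough sketch, the remaining steps it describes would look something like the following shell commands (env name, file locations, and versions here are illustrative, not taken from the gist):

# Illustrative sketch only: create the env, optionally layer in conda and pip deps, then pack it.
conda create -y -n packed_env -c conda-forge python=3.7 pip conda-pack gcc_linux-64 gxx_linux-64
conda env update -n packed_env -f environment.yml           # optional conda dependencies
conda run -n packed_env pip install -r requirements.txt     # optional pip dependencies
conda pack -n packed_env -o packed_env.tgz                  # produce the relocatable .tgz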
@ottomata
ottomata / flink_wmf_event_utils.scala
Created April 12, 2021 14:08
Flink Wikimedia Event Utilities Integration
# If /etc/hadoop/conf exists, use it as HADOOP_CONF_DIR
if [ -z "${HADOOP_CONF_DIR}" -a -e "/etc/hadoop/conf" ]; then
    export HADOOP_CONF_DIR=/etc/hadoop/conf
fi
# If the hadoop command is executable, use it to add Hadoop jars to Spark's runtime classpath.
# See: https://spark.apache.org/docs/2.4.4/hadoop-provided.html
if [ -x "$(command -v hadoop)" ]; then
    SPARK_DIST_CLASSPATH=$(hadoop classpath)
fi
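Not part of the gist, but a quick interactive sanity check of the same logic:

# Sketch: confirm the hadoop CLI resolves, and preview what would end up on SPARK_DIST_CLASSPATH.
if command -v hadoop >/dev/null; then
    hadoop classpath | tr ':' '\n' | head
fi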
Done:
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown -R analytics:analytics-privatedata-users /wmf/data/learning && \
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chmod -R o-rwx /wmf/data/learning
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown -R analytics:analytics-privatedata-users /wmf/data/archive/{browser,clickstream,domain_abbrev_map,eventlogging,geoeditors,geoip,geowiki_legacy,mediacounts,mediawiki,mobile_apps,pagecounts-all-sites,pagecounts-raw,pageview,projectcounts-all-sites,projectcounts-raw,projectview,unique_devices,user,webrequest} && \
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chmod -R o-rwx /wmf/data/archive/{browser,clickstream,domain_abbrev_map,eventlogging,geoeditors,geoip,geowiki_legacy,mediacounts,mediawiki,mobile_apps,pagecounts-all-sites,pagecounts-raw,pageview,projectcounts-all-sites,projectcounts-raw,projectview,unique_devices,user,webrequest}
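A spot check of the result (not part of the original notes), using one of the paths above:

# Sketch: confirm owner/group and that 'other' permissions were dropped.
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -ls -d /wmf/data/learning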
---
TODO:
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown -R analytics:analytics-privatedata-users /wmf/da
/**
* Retries calling the provided async fn up to retryLimit times.
*
* @param {Function} fn
* @param {integer} retryLimit
* Default: 1
* @param {string} customRetryWarnMessage
* If set, this message will be used in the warning log message on errors
* caught before retry limit is reached.
* @param {Object} logger
#!/bin/bash
options=$(getopt -o s:c:p:b --long service:,cluster:,since:,pod:,bunyan -- "$@")
[ $? -eq 0 ] || {
    echo "Incorrect options provided"
    exit 1
}
eval set -- "$options"
while true; do
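The preview cuts off inside the option-parsing loop. For illustration, an invocation would look roughly like this; the script name and all option values are hypothetical, only the flags come from the getopt spec above:

# Hypothetical usage of the script sketched above.
./pod-logs.sh --service eventgate-analytics --cluster codfw --since 1h --bunyan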
// sudo -u analytics kerberos-run-command spark2-shell --files /etc/hive/conf/hive-site.xml,/etc/refinery/refine/refine_eventlogging_analytics.properties,/srv/deployment/analytics/refinery/artifacts/hive-jdbc-1.1.0-cdh5.10.0.jar,/srv/deployment/analytics/refinery/artifacts/hive-service-1.1.0-cdh5.10.0.jar --master yarn --deploy-mode client --jars /srv/deployment/analytics/refinery/artifacts/refinery-job.jar --driver-java-options='-Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080'
import org.wikimedia.analytics.refinery.job.refine._
import org.wikimedia.eventutilities.core.event.{EventSchemaLoader, EventLoggingSchemaLoader}
import org.wikimedia.analytics.refinery.spark.sql.PartitionedDataFrame
import com.github.nscala_time.time.Imports._
import scala.util.matching.Regex
import org.apache.hadoop.fs.{FileSystem, Path}
import org.wikimedia.analytics.refinery.spark.sql._
import org.apache.spark.sql.types._
# Declare a class called test_class that takes a $content parameter.
class test_class(
  $content
) {
  file { "/tmp/f1.txt":
    content => $content
  }
}

# 'include' the test_class with content defined as "YOOHOOOO"
# (parameters are passed with a resource-like class declaration rather than plain include)
class { 'test_class':
  content => "YOOHOOOO",
}
# From the build directory
$ ls anaconda-wmf/lib/libpython3.* | cat
anaconda-wmf/lib/libpython3.7m.a
anaconda-wmf/lib/libpython3.7m.nolto.a
anaconda-wmf/lib/libpython3.7m.so
anaconda-wmf/lib/libpython3.7m.so.1.0
anaconda-wmf/lib/libpython3.so
# From the built .deb
$ dpkg-deb -c /var/cache/pbuilder/result/buster-amd64/anaconda-wmf_2020.02~wmf1_amd64.deb | grep lib/libpython3
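To compare the two listings directly (a sketch using standard tools, not part of the gist):

# Sketch: diff libpython3* names in the build tree against those shipped in the .deb.
diff <(ls anaconda-wmf/lib/ | grep '^libpython3' | sort) \
     <(dpkg-deb -c /var/cache/pbuilder/result/buster-amd64/anaconda-wmf_2020.02~wmf1_amd64.deb \
         | grep -o 'libpython3[^ ]*' | sort -u)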
{
"meta": {
"id": "dc5507a1-ac79-464d-ae60-cd327e5570c3",
"dt": "2020-07-06T18:11:19.252Z",
"uri": "unknown",
"domain": "www.wikidata.org",
"request_id": "4338bff5-cad5-4fe6-ab67-69121ebc78b1",
"stream": "eventgate-analytics-external.error.validation"
},
"emitter_id": "eventgate-analytics-external-production",