Andrew Otto (ottomata)

$ python cergen/main.py --generate --force --base-path ./certificates/certs --base-private-path ./certificates/private ./examples/example.certs.yaml
2017-10-19 20:18:50,389 INFO cergen Generating certificates ['self_signed_cert', 'root_ca', 'intermediate_ca', 'hostname1.example.org', 'hostname2.example.org'] with force=True
2017-10-19 20:18:50,389 INFO Certificate(self_signed_cert) Generating all files, force=True...
making private path /vagrant/srv/cergen/certificates/private/self_signed_cert
2017-10-19 20:18:50,480 INFO Certificate(self_signed_cert) Generating certificate file
2017-10-19 20:18:50,501 INFO Certificate(self_signed_cert) Generating PKCS12 keystore file
2017-10-19 20:18:50,793 INFO Certificate(self_signed_cert) Generating Java keystore file
2017-10-19 20:18:51,267 INFO Certificate(root_ca) Generating all files, force=True...
making private path /vagrant/srv/cergen/certificates/private/root_ca
2017-10-19 20:18:51,318 INFO Certificate(root_ca) Gene
// Scala / spark-shell snippet: get handles on both HDFS and the local filesystem.
import org.apache.hadoop.fs.{FileSystem, Path, FileChecksum, ChecksumFileSystem, LocalFileSystem}
import java.net.URI
// The default filesystem from the Hadoop configuration (HDFS on the cluster).
val hdfs = FileSystem.get(sc.hadoopConfiguration)
// The local filesystem, addressed explicitly via a file:// URI.
val localfs = FileSystem.get(new URI("file:///"), sc.hadoopConfiguration)
// Re-wrap the local filesystem as a (checksumming) LocalFileSystem.
val lfs = new LocalFileSystem(localfs.asInstanceOf[ChecksumFileSystem])
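// Hedged illustration (hypothetical path, not in the original snippet): with handles
// on both filesystems, a checksum can be requested for an HDFS file and a local copy.
// Depending on Hadoop version and configuration, the local ChecksumFileSystem may
// return null or a checksum that is not directly comparable to the HDFS one.
val hdfsSum: FileChecksum = hdfs.getFileChecksum(new Path("/tmp/example-file"))
val localSum: FileChecksum = lfs.getFileChecksum(new Path("file:///tmp/example-file"))
println(s"hdfs: $hdfsSum, local: $localSum")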
# See "pyspark --help" if you need to run pyspark in YARN cluster mode for big data.
# By default it runs locally, but you can still access Hive and HDFS.
$ pyspark
...
>>> from pyspark.sql import HiveContext
>>> sqlContext = HiveContext(sc)
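>>> # Hedged continuation (not in the original snippet): a quick sanity query,
>>> # assuming the Hive metastore is reachable from this host.
>>> sqlContext.sql("SHOW DATABASES").show()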
// Files written with Spark 2.1
val input = "..."
val output = "hdfs://analytics-hadoop/tmp/otto/webrequest_sampled_1000_export_test"
val parquet = spark.read.parquet(input)
// Parquet
parquet.write.format("parquet").option("compression", "uncompressed").mode("overwrite").save(output + "/parquet")
parquet.write.format("parquet").option("compression", "snappy").mode("overwrite").save(output + "/parquet_snappy")
spark-submit \
--class org.wikimedia.analytics.refinery.job.JsonRefine \
./refinery-job/target/refinery-job-0.0.49-SNAPSHOT.jar \
--input-base-path /wmf/data/raw/eventlogging \
--database otto \
--output-base-path /user/otto/external/event02 \
--done-flag _REFINED \
--input-regex '.*eventlogging_(.+)/hourly/(\d+)/(\d+)/(\d+)/(\d+)' \
--input-capture 'table,year,month,day,hour' \
--table-whitelist '.*Popups' \
$ sudo rsync -av --update stat1003.eqiad.wmnet::home/ /home/
receiving incremental file list
./
awight/
bearloga/
bearloga/.bash_history
bearloga/R/x86_64-pc-linux-gnu-library/
bearloga/R/x86_64-pc-linux-gnu-library/3.2/
bearloga/R/x86_64-pc-linux-gnu-library/3.2/Rcpp/
#!/bin/bash
###########################################################
# PUPPET MANAGED #
# Do not edit this file on a server node unless you #
# are willing to have your changes overwritten by #
# Puppet. If you really want to change the contents #
# of this file, change it in the puppet subversion #
# repository and check it out on the ops server. #
###########################################################
@ottomata
ottomata / BannerImpressionStream.scala
Last active August 10, 2017 20:20
Version of Joseph's BannerImpressionStream that uses Tranquility to produce directly to Druid rather than Kafka. Doesn't work: https://phabricator.wikimedia.org/T168550#3517248
package org.wikimedia.analytics.refinery.job
import com.netaporter.uri.Uri
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.wikimedia.analytics.refinery.core.Webrequest
import scopt.OptionParser
import org.json4s._
// Support implicit conversion from regex string CLI opt to matching filter function.
implicit val scoptFilterRegexRead: scopt.Read[(String) => Boolean] = scopt.Read.reads { s =>
    val regex = s.r
    (string: String) => {
        string match {
            case regex(_*) => true
            case _ => false
        }
    }
}
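// Hedged usage sketch (hypothetical Params case class and option name, not part of the
// original job): with the implicit Read above in scope, scopt can parse a regex CLI
// option directly into a String => Boolean predicate.
case class Params(uriFilter: String => Boolean = (_ => true))

val argsParser = new OptionParser[Params]("BannerImpressionStream") {
    opt[String => Boolean]("uri-filter")
        .action((filterFn, params) => params.copy(uriFilter = filterFn))
        .text("Only URIs matching this regex will be kept.")
}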
mysql> select rev_page, page_title, rev_comment from page, revision where rev_page in(54073829,54112112,54112566,54166124,54169722,54179829,54188344,54193238,54200374,54204127,54257308,54280581,54280697,54283421) and rev_parent_id = 0 and revision.rev_page = page.page_id;
+----------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| rev_page | page_title | rev_comment |
+----------+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------