Skip to content

Instantly share code, notes, and snippets.

View erikbern's full-sized avatar

Erik Bernhardsson erikbern

View GitHub Profile
import subprocess, itertools, numpy
import matplotlib.pyplot as plt
# One "oneline" entry per non-merge commit, each followed by its --shortstat
# summary; --log-size adds a "log size <n>" line per commit.
command = 'git log --shortstat --log-size --format=oneline --no-merges'.split()
# universal_newlines=True makes check_output return text instead of bytes, so
# splitting on '\n' works on Python 3 as well as Python 2.
data = subprocess.check_output(command, universal_newlines=True).split('\n')
def read_groups():
buf = []
for line in data:
buf.append(line)
if (indices.size() <= (size_t)_K) {
for (size_t i = 0; i < indices.size(); i++)
m->children[i] = indices[i];
}
// Job body: totals the number of tab-separated fields across the whole input.
def tabCounter() = {
// NOTE(review): getInput() is not defined in this snippet; presumably it yields
// the records produced by the required task -- confirm against the framework.
implicit def input = getInput()
// Split each record on tabs, take the number of pieces, then sum those counts.
input.map(_.split('\t').size).reduce(_ + _)
}
// `task` requires MyTsvJob(buildId), declares the HdfsTarget "output", and is handed tabCounter via .do(...).
// NOTE(review): `do` is a reserved word in Scala, so a real API would need backticks or another name.
val task = LuigiTask().requires(MyTsvJob(buildId)).output(HdfsTarget("output")).do(tabCounter)
// `otherTask` requires `task` and declares the HdfsTarget "output-2"; somethingElse is not defined in this snippet.
val otherTask = LuigiTask().requires(task).output(HdfsTarget("output-2")).do(somethingElse)
otherTask.run() // schedule task and otherTask
@erikbern
erikbern / gist:fc05e8cccd64dccde630
Last active August 29, 2015 14:03
Generate Dirichlet distribution
import random, time
import pylab, numpy
def method1(n):
s = 1.0
r = []
for i in xrange(n):
t = s * (1 - random.random() ** (1.0 / (n - i)))
s -= t
r.append(t)
import os, shutil
import luigi
import sparkey
import random
class SparkeyTarget(luigi.Target):
def __init__(self, path=None, spi='data.spi', spl='data.spl', writer_cls=sparkey.HashWriter, reader_cls=sparkey.HashReader):
self.path = path
self.spi_path = spi
self.spl_path = spl
import luigi
# Here we import our own tasks, assuming they are
# arranged in a Python package (folder) named "components"
from components.SomeTaskA import SomeTaskA
from components.SomeTaskB import SomeTaskB
from components.SomeTaskC import SomeTaskC
# ------------------------------------------
# DEFINE THE MAIN WORKFLOW DEPENDENCY GRAPH
class BsddbTarget(luigi.LocalTarget):
    """A ``luigi.LocalTarget`` whose path is opened as a bsddb hash database."""

    def open(self, mode):
        """Open ``self.path`` with ``bsddb.hashopen`` and return the handle.

        ``mode`` is passed straight through as the flag argument of
        ``bsddb.hashopen``.
        """
        # TODO: make this atomic!!
        db_handle = bsddb.hashopen(self.path, mode)
        return db_handle
class TCTarget(luigi.LocalTarget):
open_tcs = {}
def open(self, mode):
if mode == 'r':
if self.path not in TCTarget.open_tcs: