Skip to content

Instantly share code, notes, and snippets.

import cascading.tuple.{Fields, TupleEntry}
import com.twitter.scalding._
import java.net.URLDecoder
import scala.util.matching.Regex
class BoomerangLogJob(args: Args) extends Job(args) {
val input = TextLine(args("input"))
val output = TextLine(args("output"))
val trap = Tsv(args("trap"))
Goal: Process the 12 million plus records
from: http://seer.cancer.gov/popdata/download.html
using: a Scala API atop Cascading, aka SCALDING ( Inventors: Avi Bryant, Oscar Boykin, Argyris )
to find:
THE FASTEST GROWING COUNTY IN THE UNITED STATES over the 1969-2011 timeframe.
-----------------------------------------------------------------------------
RESULTS: Scroll to the very bottom.
First, the scalding source...
---
import com.twitter.algebird.{Aggregator, Semigroup}
import com.twitter.scalding._
import scala.util.Random
/**
* This job is a tutorial of sorts for scalding's Execution[T] abstraction.
* It is a simple implementation of Lloyd's algorithm for k-means on 2D data.
*
* http://en.wikipedia.org/wiki/K-means_clustering
import org.parboiled.scala._
import org.parboiled.errors.{ErrorUtils, ParsingException}
/**
 * Value produced by the CSV parser: a header row plus the remaining rows.
 *
 * @param header cells of the header line; presumably empty when the input has no header
 *               (the parser treats the header as optional) — TODO confirm
 * @param body   the rows of the document, each row being the list of its cells
 */
case class Csv(header: List[String], body: List[List[String]])
object CsvParser extends Parser {
def inputLine = rule { csv ~ EOI ~~> Csv }
def csv = rule {
optional(header ~ CRLF) ~~> { _.flatten.toList } ~
@darkseed
darkseed / ip.scala
Last active August 29, 2015 14:13 — forked from timothyklim/ip.scala
// Alternative
// Dotted-quad string -> number: every octet is weighted by 256^position, counting positions
// from the rightmost octet, and the weighted values are summed ("1.2.3.4" yields 16909060).
"1.2.3.4".split("\\.").reverse.zipWithIndex.map(a=>a._1.toInt*math.pow(256,a._2).toLong).sum
// Number -> dotted-quad string: extracts the four base-256 digits, least significant first,
// then reverses them and joins them with "." (16909060L yields "1.2.3.4").
(0 until 4).map(a=>16909060L / math.pow(256, a).floor.toInt % 256).reverse.mkString(".")
/**
 * Converts an object (typically a case class instance) into a map from field name to
 * field value, using Java reflection.
 *
 * Only the fields declared directly on the runtime class of `cc` are visited, because that
 * is what `getDeclaredFields` returns; inherited fields are not included.
 *
 * @param cc the instance whose declared fields are read
 * @return an immutable map keyed by field name, holding the value of each field in `cc`
 */
def ccToMap(cc: AnyRef): Map[String, Any] =
  cc.getClass.getDeclaredFields.foldLeft(Map.empty[String, Any]) { (acc, field) =>
    // Enable reflective access so that non-public fields can be read as well.
    field.setAccessible(true)
    acc + (field.getName -> field.get(cc))
  }
// Usage
case class Column(name: String,
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
object JsonUtil {
val mapper = new ObjectMapper with ScalaObjectMapper
mapper.registerModule(DefaultScalaModule)
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
def toJson(value: Map[Symbol, Any]): String = {
@darkseed
darkseed / fsm.py
Last active August 29, 2015 14:07 — forked from rca/fsm.py
"""
Finite State Machine
This FSM implementation is extracted from the django-fsm package and licensed
under the same BSD-like license at:
https://github.com/kmmbvnr/django-fsm/blob/master/LICENSE
Basic usage:
#!/bin/bash
#
# Initialize new virtual server using LXC and set up networking and HTTP proxy
#
# Written by: Deni Bertovic <[email protected]>
#
# Released into Public Domain. You may use, modify and distribute it as you
# see fit.
#
# This script will:
# Tail-type input for the HAProxy access log; the named groups in `format` become the
# fields of each record.
# NOTE(review): the `format` regex below is spread over three physical lines, breaking in the
# middle of a `\d+` token and of the `srv_queue` group name. This looks like hard-wrapping
# from a paste; presumably the pattern has to sit on a single line to be parsed — confirm
# before deploying.
<source>
type tail
format /(?<c_ip>[\w\.]+):(?<c_port>\d+) \[(?<a_date>.+)\] (?<f_end>[\w-]+) (?<b_end>[\w-]+)\/(?<b_server>[\w-]+) (?<tw>\d+)\/(?<tc>\d
+)\/(?<tt>\d+) (?<bytes>\d+) (?<t_state>[\w-]+) (?<actconn>\d+)\/(?<feconn>\d+)\/(?<beconn>\d+)\/(?<srv_conn>\d+)\/(?<retries>\d+) (?<
srv_queue>\d+)\/(?<backend_queue>\d+)/
time_format %d/%B/%Y:%H:%M:%S
path /var/log/haproxy/haproxy_access.log
pos_file /opt/fluentd/var/pos/haproxy_access.pos
tag haproxy.access
</source>