Skip to content

Instantly share code, notes, and snippets.

View natbusa's full-sized avatar

Nate Busa natbusa

View GitHub Profile
@natbusa
natbusa / mapper.py
Last active November 2, 2024 11:12
word count: mapper and reducer in python using hadoop streaming
#!/usr/bin/env python
import sys
# input comes from STDIN (standard input)
for line in sys.stdin:
#clean and split in words
linechars = [c for c in line.lower() if c.isalpha() or c==' ']
words = ''.join(linechars).strip().split()
#emit the key-value pairs
@natbusa
natbusa / wc.cascading.java
Created March 22, 2014 11:00
Word count in Hadoop Cascading
class ScrubFunction extends BaseOperation implements Function
{
public ScrubFunction( Fields fieldDeclaration )
{
super( 1, fieldDeclaration );
}
public void operate( FlowProcess flowProcess, FunctionCall functionCall )
{
TupleEntry argument = functionCall.getArguments();
@natbusa
natbusa / beam2hdfs.sh
Created March 24, 2014 13:08
Beam up the lorem.txt to hdfs
#!/bin/sh
#check if the directory exists on hdfs
$HADOOP_HOME/bin/hadoop fs -ls wordcount-input
if [ $? -ne 0 ]
then $HADOOP_HOME/bin/hadoop fs -mkdir wordcount-input/
fi
#check if the lorem.txt exists on hdfs
$HADOOP_HOME/bin/hadoop fs -ls wordcount-input/lorem.txt
@natbusa
natbusa / wc.pig
Last active August 29, 2015 13:57
Word count: in pig
-- Word count over 'wordcount-input/lorem.txt'; the (word, count) pairs are
-- stored in 'wordcount-pig-output'.

-- Load the input as one chararray record per line of text.
lines = load 'wordcount-input/lorem.txt' as (line:chararray);
-- Break every line into tokens and flatten the bag to one token per record.
tokens = foreach lines generate FLATTEN(TOKENIZE(line)) as word;
-- Normalize each token: remove runs of non-word characters, then lower-case.
cleaned = foreach tokens generate LOWER(REPLACE(word,'\\W+','')) as word;
-- A token made only of punctuation is an empty string after the REPLACE above;
-- drop those (and nulls) so they are not grouped and counted as a "word".
words = filter cleaned by word is not null and word != '';
-- Group identical words and count the members of each group.
grouped = group words by word;
counts = foreach grouped generate group, COUNT(words);
store counts into 'wordcount-pig-output';
@natbusa
natbusa / main.scala
Last active August 29, 2015 13:57
Word count in Scalding
/**
 * Job that counts how often each word occurs in a text input.
 *
 * Reads the source named by the `input` argument as lines of text, splits every
 * line into whitespace-separated words, groups the words and writes each word
 * together with its group size to the `Tsv` sink named by the `output` argument.
 *
 * @param args job arguments; `input` and `output` are read here
 */
class WordCount(args : Args) extends Job(args) {
  TextLine(args("input"))
    .read
    // Split on runs of whitespace and drop empty tokens: splitting on a single
    // whitespace character yields "" for consecutive or leading whitespace,
    // which would otherwise be counted as a word of its own.
    .flatMap('line -> 'word){ line : String => line.split("\\s+").filter(_.nonEmpty) }
    // One output row per distinct word, carrying the number of occurrences.
    .groupBy('word){group => group.size}
    .write(Tsv(args("output")))
}
@natbusa
natbusa / actor.scala
Created May 3, 2014 22:43
Simple example of web api with statistical RPC in R and persistence in cassandra
class AnalyticsActor extends Actor {
def actorRefFactory = context
val dataActor = actorRefFactory.actorOf(Props[NoSqlActor], "cassandra-client")
val statActor = actorRefFactory.actorOf(Props[StatActor], "statistical-engine")
def receive = {
case (a: String, c: String, ctx: RequestContext) =>
val f:Future[Result] =
@natbusa
natbusa / flask.api.py
Last active August 29, 2015 14:02
wikipedia live search demo
@app.route('/word/<keyword>')
def fetch_word(keyword):
db = get_cassandra()
pages = []
results = db.fetchWordResults(keyword)
for hit in results:
pages.append(db.fetchPageDetails(hit["url"]))
return Response(json.dumps(pages), status=200,
@natbusa
natbusa / all.js
Last active August 29, 2015 14:02
Get all you can from the browser
var all = {
//screen
'screen.width' : screen.width,
'screen.height' : screen.height,
'screen.availWidth' : screen.availWidth,
'screen.availHeight' : screen.availHeight,
'screen.colorDepth' : screen.colorDepth,
'screen.pixelDepth' : screen.pixelDepth,
//location
'location.href' : location.href,
@natbusa
natbusa / linux pipes
Created March 19, 2015 10:13
scaling streaming computing
$> cat data.txt | grep "streming is awesome" > results.txt
@natbusa
natbusa / flow.py
Created April 16, 2015 10:45
actor httpclient syntax/semantics ideas
from httpMethods import *
# Create the graph (profiling tags)
# get (as a http client) every 10 seconds json and emit it on
post('/api/actors',
{
"type":"httpclient",
"trigger": null, # can also be omitted altogether
"collect":null, # can also be omitted altogether