Skip to content

Instantly share code, notes, and snippets.

View enachb's full-sized avatar

Erich Nachbar enachb

  • Stuffy Lab
  • San Francisco
View GitHub Profile
@enachb
enachb / use_distributed_cache_in_cascading.java
Created September 3, 2009 00:04
fixed error returning HDFS instead of local path
// Stage the two list files under /discovery/UrlDb on the job's FileSystem and
// then register the staged copies with the DistributedCache for this job.
FileSystem fs = FileSystem.get(jobConf);
final String cacheDir = "/discovery/UrlDb";
final String[] listNames = {"gsb.blacklist", "gsb.malwarelist"};
final Path cacheDirPath = new Path(cacheDir);
fs.mkdirs(cacheDirPath);
// Upload each list from the local directory named by args[3].
for (String listName : listNames) {
    fs.copyFromLocalFile(new Path(args[3] + "/" + listName), cacheDirPath);
}
// Register the staged copies only after both uploads have been issued.
for (String listName : listNames) {
    DistributedCache.addCacheFile(new URI(cacheDir + "/" + listName), jobConf);
}
// Tell the RecordArchiver about our entries, so we can archive them later.
RecordArchive ra = new RecordArchive(fs, inUrlsHDFS + "/../archive/incomingUrls");
ra.addJobPath(new Path(inUrlsHDFS));
Tap[] sourceFiles = new Tap[ra.getJobFiles().length];
int i = 0;
// Add only those files to the job, so we can accept more, while the job is running
for (FileStatus f : ra.getJobFiles()) {
LOG.info("Adding input files: " + f.getPath().getName());
sourceFiles[i++] = new Hfs(new FlatfileSequenceFile(new Fields("key", "value")), f.getPath().toString());
// sourceFiles[i++] = new Hfs(new TextLine(new Fields("value")), f.getPath().toString());
package com.bebo.hadoop.cascading.phishing;
import java.io.IOException;
import java.net.URISyntaxException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Vector;
import com.sun.istack.internal.Builder;
def writer = new StringWriter()
writer.append '<?xml version="1.0" encoding="UTF-8"?>'
def builder = new groovy.xml.MarkupBuilder(writer)
builder.urlset(xmlns:"http://www.sitemaps.org/schemas/sitemap/0.9") {
url(){
loc("http://sguys.com/MyManualLink1")
changefreq("weekkly")
public class SiteMap {
static void main(String[] args) {
def binding = new Binding();
def products = [new Product(name:"ProdName1",productId:"P1000"), new Product(name:"ProdName2",productId:"P1001")]
binding.setVariable "products", products
def shell = new GroovyShell(binding)
String siteMap = shell.evaluate(new File('grails-app/conf/SiteMapConfig.groovy'))
# Export GRAILS_VERSION as the value of the "grails.version" property in
# $GRAILS_HOME/build.properties, with any CR/LF characters stripped.
# Older alternative, kept for reference (derives the version from ~/.grails):
#export GRAILS_VERSION="$(ls -lhr $HOME/.grails | egrep -i '1\.' | head -1 | gawk '{print $9 }')"
# The path is quoted so a GRAILS_HOME containing spaces is not word-split.
export GRAILS_VERSION="$(grep '^grails\.version=' "$GRAILS_HOME/build.properties" | awk -F= '{ print $2 }' | tr -d '\r\n')"
# Print the Groovy files under ./grails-app/domain as dotted class names
# (e.g. ./grails-app/domain/foo/Bar.groovy -> foo.Bar), all on one line,
# each followed by a space. Prints nothing when the directory is missing,
# because find's error output is discarded.
_get_domain_classes(){
    # The pattern is quoted so find receives it verbatim; unquoted, the shell
    # would expand it against any .groovy files in the current directory.
    find ./grails-app/domain -iname '*.groovy' 2> /dev/null | tr \\n ' ' | sed 's/\.groovy//g' | sed 's/\.\/grails-app\/domain\///g' | tr '/' \.
}
# Print one dotted test name per line for the Groovy files under ./test:
# the ./test/integration/ prefix and the trailing "Tests.groovy" are removed
# and the remaining slashes become dots
# (e.g. ./test/integration/foo/BarTests.groovy -> foo.Bar).
_get_tests(){
    # The find pattern is quoted so the shell cannot expand it against .groovy
    # files in the current directory. The sed suffix pattern escapes the dot
    # and drops the stray "\T" escape, whose meaning POSIX leaves undefined.
    find ./test -iname '*.groovy' 2> /dev/null | sed 's/\.\/test\/integration\///g' | sed 's/Tests\.groovy$//' | tr '/' \.
}
@enachb
enachb / jsonFilter.py
Created January 16, 2013 06:57
read input files from command line and filter json
# Read newline-delimited JSON from the files named on the command line (or
# stdin when none are given) and print the "utc" field of every record whose
# "pageId" is "AmericanIdol".
import sys,json,fileinput

for line in fileinput.input():
    obj = json.loads(line)
    if obj["pageId"] == "AmericanIdol":
        # Parenthesised form prints the same on Python 2 and Python 3.
        print(obj["utc"])
@enachb
enachb / json_prefix.py
Created January 16, 2013 08:48
Usage: python ~/json_utc_prefix.py part-* | sort -nr -k 1,1 | less (i.e., show all posts, latest first)
# Read newline-delimited JSON from the files named on the command line (or
# stdin when none are given) and print "<utc> <pageId>" for every record, so
# the output can be sorted numerically on the first column.
import sys,json,fileinput

for line in fileinput.input():
    obj = json.loads(line)
    # Optional filter, left disabled:
    #   if obj["pageId"] == "AmericanIdol":
    # Parenthesised form prints the same on Python 2 and Python 3.
    print("%s %s" % (obj["utc"],obj["pageId"]))
# Read newline-delimited JSON from the files named on the command line (or
# stdin when none are given) and print "<pageUrl> <time>" for every record.
import sys,json,fileinput

for line in fileinput.input():
    obj = json.loads(line)
    # Optional filter, left disabled:
    #   if obj["pageId"] == "AmericanIdol":
    # Parenthesised form prints the same on Python 2 and Python 3.
    print("%s %s" % (obj["pageUrl"], obj["time"]))
----------------------------------------------------------------
---
# Playbook for the "proxmox" host group, run as root: adds the
# squeeze-backports APT source, installs a pinned collectd-utils package from
# it, and copies a collectd configuration file into place.
- hosts: proxmox
  user: root
  tasks:
    # Ensure the backports source line is present, creating the list file if needed.
    - name: Add backports repo
      action: lineinfile dest='/etc/apt/sources.list.d/squeeze-backports.list' regexp='deb http://backports.debian.org/debian-backports squeeze-backports main' line='deb http://backports.debian.org/debian-backports squeeze-backports main' state=present create=true
    # Refresh the APT cache, then install the pinned version from squeeze-backports.
    # NOTE(review): the task name says collectd but the package is collectd-utils — confirm this is intended.
    - name: Install collectd 5.1
      action: apt pkg=collectd-utils=5.1.0-3~bpo60+2 update-cache=yes state=installed default-release=squeeze-backports
    # Copy the config to /etc/collectd/collectd.conf, root-owned, mode 644, with backup=yes.
    - name: Copy collectd.conf over
      action: copy src=qf/files/collectd.conf dest=/etc/collectd/collectd.conf owner=root group=root mode=644 backup=yes