neilkod’s gists

neilkod / reducer3.py

Created July 30, 2010 22:10

	#!/usr/bin/python
	# Streaming reducer fragment: tallies how many input lines carry each
	# (item1, item2) pair read from standard input.
	import sys
	from collections import defaultdict
	# Maps an (int, int) pair to the number of input lines seen for it.
	cnts = defaultdict(int)
	for line in sys.stdin:
	    # Each record is "<item1>,<item2>\t<value>"; a line without exactly
	    # one tab, or a key without exactly one comma, raises ValueError.
	    line = line.strip()
	    k, v = line.split('\t')
	    (item1, item2) = k.split(',')
	    # The value column is not read: every line adds exactly one to its
	    # pair's tally.
	    cnts[(int(item1), int(item2))] += 1

neilkod / reducer.py

Created August 3, 2010 21:45

	# reducer.py
	# input data looks like
	# (1,2) 1
	# (2,3) 1
	# (1,3) 1
	# (1,3) 1

	# need to count each group and sum the totals.

	#!/usr/bin/python

neilkod / fwordhashtags.pig

Created August 9, 2010 16:13

	-- Counts occurrences of each f-word hashtag (case-insensitively) in the
	-- tab-separated tweets under 'parsed/'.
	register piggybank.jar
	raw = LOAD 'parsed/' USING PigStorage('\t') AS (id:chararray,timestamp:chararray,screenname:chararray,tweet:chararray);
	-- filter the tweets to only the ones that have a hashtag containing the f-word.
	-- 'matches' must match the whole tweet, so the pattern needs a leading '.*';
	-- '\\p{Alpha}*' allows letters between the '#' and the f-word.
	-- NOTE(review): the '*' quantifiers appear to have been stripped from the
	-- published copy of this script; confirm against the original gist.
	fltr = FILTER raw BY tweet matches '.*\\#\\p{Alpha}*[Ff][Uu][Cc][Kk].*?';
	--extract the actual regex-matched hashtag (capture group 1). note-there has to be a better way to do this
	extrctd = FOREACH fltr GENERATE FLATTEN(org.apache.pig.piggybank.evaluation.string.RegexExtract(tweet,'.*(\\#\\p{Alpha}*[Ff][Uu][Cc][Kk].*?\\b)',1)) as (tweet:chararray);
	--convert to lowercase and group by the lowercased hashtag.
	lowrd = FOREACH extrctd GENERATE FLATTEN(org.apache.pig.piggybank.evaluation.string.LOWER(tweet));
	grpd = GROUP lowrd by $0;
	-- NOTE(review): despite its name, theHour holds the group key, i.e. the
	-- lowercased hashtag; cnt is the number of tweets in that group.
	cntd = FOREACH grpd GENERATE $0 as theHour,COUNT(lowrd) as cnt;

neilkod / overlap.py

Created August 10, 2010 16:08

	192:commonTweeps nkodner$ ./overlap.py
	mrflip has 881 friends and 1612 followers
	thedatachef has 26 friends and 78 followers

	--------

	mrflip and thedatachef have 21 friends in common:
	[u'wattsteve', u'dhruvbansal', u'flowingdata', u'kevinweil', u'peteskomoroch', u'squarecog', u'jessecrouch', u'neilkod', u'dataspora', u'nickducoff', u'nicktj', u'jeromatron', u'doncarlo', u'habcous', u'mat_kelcey', u'sarahnordquist', u'mndoci', u'josephkelly', u'fricanosdeli', u'steveodom', u'infochimps']

	--------

neilkod / gist:518286

Created August 11, 2010 00:38

	192:commonTweeps nkodner$ time ./overlap.py
	drewconway has 171 friends and 1141 followers
	datajunkie has 381 friends and 881 followers

	--------

	drewconway and datajunkie have 35 friends in common:
	[u'lewisshepherd', u'mrflip', u'gappy3000', u'hamiltonulmer', u'ChrisDiehl', u'liebke', u'jakehofman', u'ogrisel', u'mjbommar', u'neilkod', u'floleibert', u'daniel_levine', u'CMastication', u'nikete', u'johnmyleswhite', u'mmparker', u'communicating', u'dataspora', u'revodavid', u'hmason', u'ianpcook', u'statalgo', u'bradfordcross', u'algoriffic', u'parry_joe', u'JeromyAnglim', u'hadleywickham', u'vsbuffalo', u'i2pi', u'gaygoygourmet', u'peteskomoroch', u'wahalulu', u'gutelius', u'mattrepl', u'jeffreyhorner']

	--------

neilkod / overlappers.py

Created August 11, 2010 01:59

	morewillie has 887 friends and 6949 followers
	joelkodner has 990 friends and 1586 followers

	--------

	morewillie and joelkodner have 111 friends in common:
	[u'vinnie', u'wordcampmiami', u'mklopez', u'HazyIT', u'journeyofnow', u'RebekahMonson', u'economist', u'smach241', u'chrisfullman', u'AgustinaP', u'vicequeenmaria', u'bocamike', u'Angiedi', u'pbarbanes', u'malcolli', u'yvetteferry', u'SeaWorld_Parks', u'AlishaVera', u'PalmBchGirl', u'FloriDUHgal', u'KAYENW', u'subeehonee', u'mephjeff', u'vitalex', u'green_architect', u'AllisonNazarian', u'313Nick', u'deadbabydaily', u'Lapp', u'sailgirlcurl', u'amandastewart', u'miamiherald', u'PRDivaRach', u'MisterHirsch', u'bsoler', u'TheTinyJEWELBox', u'UlisesOrozco', u'brianbreslin', u'SternalPR', u'FloridaEats', u'bulabizarro', u'OwenO', u'zsazsaandco', u'andresdavid', u'thewaffle', u'fsutoby', u'ShervinBain', u'BrowardNet', u'elevenser', u'BohoPoetGirl', u'cristnabls', u'Murrayiz', u'johnnybond86', u'jarret23', u'neilkod', u'LoreLama', u'designchicklet', u'moduli

neilkod / top3salariesbydepartment.pig

Created August 23, 2010 18:26

	-- load the raw data: one tab-separated employee record per line
	-- (Pig Latin comments start with '--'; '#' is not a comment marker)

	raw = load 'emp.txt' using PigStorage('\t') as (empno:int,ename:chararray,job:chararray,sal:int,deptno:int);

	-- group the raw data by deptno. There are only 3 departments(10,20,30)
	grpd = group raw by deptno;

	-- for each deptno(grpd), sort the data by sal in descending order, then limit
	-- to 3 rows and return the output.

neilkod / thankspython.py

Created August 27, 2010 12:50

	>>> foo = 'hello'
	>>> print foo % 'neil'
	Traceback (most recent call last):
	File "<stdin>", line 1, in <module>
	TypeError: not all arguments converted during string formatting
	>>> foo = 'hello %s'
	>>> print foo % 'neil'
	hello neil
	>>>

neilkod / canabalt.pig

Created August 29, 2010 13:48

canabalt scores

	-- load one day's worth of tweets and find canabalt scores

	register piggybank-0.3-amzn.jar
	DEFINE EXTRACT org.apache.pig.piggybank.evaluation.string.EXTRACT();
	raw = LOAD '20100624.txt' USING PigStorage('\t') AS (id:chararray,timestamp:chararray,screenname:chararray,tweet:chararray);
	-- keep only the game's auto-generated score tweets; 'matches' must match
	-- the whole tweet, so the tweet has to end with the canabalt URL.
	-- NOTE(review): the '*' quantifiers appear to have been stripped from the
	-- published copy of this script; confirm against the original gist.
	fltr = FILTER raw BY tweet matches 'I ran \\d*?m before.* on my.*http://www.canabalt.com/';
	-- capture groups: (1) the digits before 'm', (2) the text after 'before',
	-- (3) the text starting with 'i' after 'on my', up to the final '. '
	thedata = FOREACH fltr GENERATE screenname,timestamp,FLATTEN(EXTRACT(tweet,'I ran (\\d*?)m before (.*) .*on my (i.*)\\. .*')),id;
	dump thedata;

	-- sample tweet: http://twitter.com/sfbayrealtor/status/16905379203

neilkod / parameter.py

Created September 3, 2010 18:12

	#!/usr/bin/python
	""" prints hello followed by the value of the first command-line parameter """
	import sys
	try:
	    # sys.argv[0] is the script name; sys.argv[1] is the first argument and
	    # raises IndexError when the script is run with none.
	    # A single parenthesized argument prints the same text under the
	    # Python 2 print statement and the Python 3 print function.
	    print("hello %s" % sys.argv[1])
	except IndexError:
	    print("usage: parameter.py name")
	MacBookPro:commonTweeps nkodner$ ./parameter.py
	usage: parameter.py name
	MacBookPro:commonTweeps nkodner$ ./parameter.py neil

neil kodner neilkod