Skip to content

Instantly share code, notes, and snippets.

View neilkod's full-sized avatar

neil kodner neilkod

View GitHub Profile
#!/usr/bin/python
# Tally how many times each (item1, item2) pair appears on stdin.
#
# Each record is expected to look like "item1,item2<TAB>value". The value
# field is required (exactly one tab per record) but is otherwise ignored:
# every record contributes a count of one for its pair.
import sys
from collections import defaultdict

# Module-level tally keyed by the (int, int) pair.
cnts = defaultdict(int)


def count_pairs(lines, counts=None):
    """Add one to counts[(item1, item2)] for every record in *lines*.

    lines  -- iterable of strings, one record per string.
    counts -- mapping to update in place; a fresh defaultdict(int) is
              created when omitted.

    Returns the mapping that was updated.

    Raises ValueError when a non-blank record does not contain exactly one
    tab, when its key does not contain exactly one comma, or when either
    item is not an integer.
    """
    # "is None" rather than truthiness: an empty mapping passed by the
    # caller must still be the one that gets filled.
    if counts is None:
        counts = defaultdict(int)
    for line in lines:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of input) carry
            # no record; skip them instead of failing on the unpack below.
            continue
        k, v = line.split('\t')
        item1, item2 = k.split(',')
        counts[(int(item1), int(item2))] += 1
    return counts


if __name__ == '__main__':
    count_pairs(sys.stdin, cnts)
# reducer.py
# input data looks like
# (1,2) 1
# (2,3) 1
# (1,3) 1
# (1,3) 1
# need to count each group and sum the totals.
#!/usr/bin/python
-- Make the piggybank UDFs referenced below (RegexExtract, LOWER) available.
register piggybank.jar
-- Load tab-delimited records from 'parsed/'; all four fields are read as chararray.
raw = LOAD 'parsed/' USING PigStorage('\t') AS (id:chararray,timestamp:chararray,screenname:chararray,tweet:chararray);
-- filter the tweets to only the ones that have a hashtag containing the f-word
fltr = FILTER raw BY tweet matches '.*\\#\\p{Alpha}*[Ff][Uu][Cc][Kk].*?';
-- Extract the actual regex-matched hashtag; the trailing 1 presumably selects
-- the first capture group (confirm against the piggybank RegexExtract docs).
-- NOTE: there has to be a better way to do this.
extrctd = FOREACH fltr GENERATE FLATTEN(org.apache.pig.piggybank.evaluation.string.RegexExtract(tweet,'.*(\\#\\p{Alpha}*[Ff][Uu][Cc][Kk].*?\\b)',1)) as (tweet:chararray);
-- Convert to lowercase, group on the lowercased hashtag, and count each group.
-- NOTE(review): a sort is mentioned for this step but is not visible in this section.
lowrd = FOREACH extrctd GENERATE FLATTEN(org.apache.pig.piggybank.evaluation.string.LOWER(tweet));
grpd = GROUP lowrd by $0;
-- NOTE(review): $0 here is the group key (the lowercased hashtag), so the alias
-- `theHour` looks like a leftover name; left unchanged in case later statements use it.
cntd = FOREACH grpd GENERATE $0 as theHour,COUNT(lowrd) as cnt;
192:commonTweeps nkodner$ ./overlap.py
mrflip has 881 friends and 1612 followers
thedatachef has 26 friends and 78 followers
--------
mrflip and thedatachef have 21 friends in common:
[u'wattsteve', u'dhruvbansal', u'flowingdata', u'kevinweil', u'peteskomoroch', u'squarecog', u'jessecrouch', u'neilkod', u'dataspora', u'nickducoff', u'nicktj', u'jeromatron', u'doncarlo', u'habcous', u'mat_kelcey', u'sarahnordquist', u'mndoci', u'josephkelly', u'fricanosdeli', u'steveodom', u'infochimps']
--------
192:commonTweeps nkodner$ time ./overlap.py
drewconway has 171 friends and 1141 followers
datajunkie has 381 friends and 881 followers
--------
drewconway and datajunkie have 35 friends in common:
[u'lewisshepherd', u'mrflip', u'gappy3000', u'hamiltonulmer', u'ChrisDiehl', u'liebke', u'jakehofman', u'ogrisel', u'mjbommar', u'neilkod', u'floleibert', u'daniel_levine', u'CMastication', u'nikete', u'johnmyleswhite', u'mmparker', u'communicating', u'dataspora', u'revodavid', u'hmason', u'ianpcook', u'statalgo', u'bradfordcross', u'algoriffic', u'parry_joe', u'JeromyAnglim', u'hadleywickham', u'vsbuffalo', u'i2pi', u'gaygoygourmet', u'peteskomoroch', u'wahalulu', u'gutelius', u'mattrepl', u'jeffreyhorner']
--------
morewillie has 887 friends and 6949 followers
joelkodner has 990 friends and 1586 followers
--------
morewillie and joelkodner have 111 friends in common:
[u'vinnie', u'wordcampmiami', u'mklopez', u'HazyIT', u'journeyofnow', u'RebekahMonson', u'economist', u'smach241', u'chrisfullman', u'AgustinaP', u'vicequeenmaria', u'bocamike', u'Angiedi', u'pbarbanes', u'malcolli', u'yvetteferry', u'SeaWorld_Parks', u'AlishaVera', u'PalmBchGirl', u'FloriDUHgal', u'KAYENW', u'subeehonee', u'mephjeff', u'vitalex', u'green_architect', u'AllisonNazarian', u'313Nick', u'deadbabydaily', u'Lapp', u'sailgirlcurl', u'amandastewart', u'miamiherald', u'PRDivaRach', u'MisterHirsch', u'bsoler', u'TheTinyJEWELBox', u'UlisesOrozco', u'brianbreslin', u'SternalPR', u'FloridaEats', u'bulabizarro', u'OwenO', u'zsazsaandco', u'andresdavid', u'thewaffle', u'fsutoby', u'ShervinBain', u'BrowardNet', u'elevenser', u'BohoPoetGirl', u'cristnabls', u'Murrayiz', u'johnnybond86', u'jarret23', u'neilkod', u'LoreLama', u'designchicklet', u'moduli
# load the raw data
-- Load the raw employee data: tab-delimited rows of
-- (empno, ename, job, sal, deptno).
raw = load 'emp.txt' using PigStorage('\t') as (empno:int,ename:chararray,job:chararray,sal:int,deptno:int);
-- Group the raw data by deptno. There are only 3 departments (10, 20, 30).
grpd = group raw by deptno;
-- For each deptno (grpd), sort the data by sal in descending order, then limit
-- to 3 rows and return the output.
>>> foo = 'hello'
>>> print foo % 'neil'
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: not all arguments converted during string formatting
>>> foo = 'hello %s'
>>> print foo % 'neil'
hello neil
>>>
@neilkod
neilkod / canabalt.pig
Created August 29, 2010 13:48
canabalt scores
-- load one day's worth of tweets and find canabalt scores
-- The registered jar supplies the piggybank EXTRACT UDF aliased on the next line.
register piggybank-0.3-amzn.jar
DEFINE EXTRACT org.apache.pig.piggybank.evaluation.string.EXTRACT();
-- Tab-delimited input; every field is read as chararray.
raw = LOAD '20100624.txt' USING PigStorage('\t') AS (id:chararray,timestamp:chararray,screenname:chararray,tweet:chararray);
-- Keep only tweets shaped like "I ran <digits>m before ... on my ... http://www.canabalt.com/".
fltr = FILTER raw BY tweet matches 'I ran \\d*?m before.* on my.*http://www.canabalt.com/';
-- Capture groups, in order: the digits before "m", the text after "before",
-- and the text starting with "i" after "on my".
-- Presumably EXTRACT returns the groups as a tuple that FLATTEN spreads into
-- columns -- confirm against the piggybank docs.
thedata = FOREACH fltr GENERATE screenname,timestamp,FLATTEN(EXTRACT(tweet,'I ran (\\d*?)m before (.*) .*on my (i.*)\\. .*')),id;
-- NOTE(review): this statement has no terminating ';' -- confirm the script runs as intended.
dump thedata
sample tweet: http://twitter.com/sfbayrealtor/status/16905379203
#!/usr/bin/python
"""Print "hello <name>", where <name> is the first command-line parameter."""
import sys


def build_message(argv):
    """Return the line this script prints for the argument list *argv*.

    argv -- list shaped like sys.argv (argv[0] is the script name).

    Returns "hello <argv[1]>", or the usage text when argv[1] is missing.
    """
    # Keep the try body to the one lookup that can raise IndexError.
    try:
        name = argv[1]
    except IndexError:
        return "usage: parameter.py name"
    return "hello %s" % name


if __name__ == "__main__":
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # unlike the bare print statement, which is a SyntaxError on Python 3.
    print(build_message(sys.argv))
MacBookPro:commonTweeps nkodner$ ./parameter.py
usage: parameter.py name
MacBookPro:commonTweeps nkodner$ ./parameter.py neil