This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# Counts how many input lines carry each (item1, item2) pair.
# Every non-blank stdin line is expected to look like "item1,item2<TAB>value".
import sys
from collections import defaultdict

# Maps an (int, int) pair to the number of input lines seen for that pair.
cnts = defaultdict(int)

for line in sys.stdin:
    line = line.strip()
    if not line:
        # A blank line has no tab-separated fields and would otherwise
        # crash the unpacking below.
        continue
    # Only the key is used; each line counts as one occurrence regardless
    # of the value column.
    k, _ = line.split('\t')
    (item1, item2) = k.split(',')
    cnts[(int(item1), int(item2))] += 1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# reducer.py
# Input data looks like:
# (1,2) 1
# (2,3) 1
# (1,3) 1
# (1,3) 1
# We need to count each group and sum the totals.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
192:commonTweeps nkodner$ ./overlap.py | |
mrflip has 881 friends and 1612 followers | |
thedatachef has 26 friends and 78 followers | |
-------- | |
mrflip and thedatachef have 21 friends in common: | |
[u'wattsteve', u'dhruvbansal', u'flowingdata', u'kevinweil', u'peteskomoroch', u'squarecog', u'jessecrouch', u'neilkod', u'dataspora', u'nickducoff', u'nicktj', u'jeromatron', u'doncarlo', u'habcous', u'mat_kelcey', u'sarahnordquist', u'mndoci', u'josephkelly', u'fricanosdeli', u'steveodom', u'infochimps'] | |
-------- |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
192:commonTweeps nkodner$ time ./overlap.py | |
drewconway has 171 friends and 1141 followers | |
datajunkie has 381 friends and 881 followers | |
-------- | |
drewconway and datajunkie have 35 friends in common: | |
[u'lewisshepherd', u'mrflip', u'gappy3000', u'hamiltonulmer', u'ChrisDiehl', u'liebke', u'jakehofman', u'ogrisel', u'mjbommar', u'neilkod', u'floleibert', u'daniel_levine', u'CMastication', u'nikete', u'johnmyleswhite', u'mmparker', u'communicating', u'dataspora', u'revodavid', u'hmason', u'ianpcook', u'statalgo', u'bradfordcross', u'algoriffic', u'parry_joe', u'JeromyAnglim', u'hadleywickham', u'vsbuffalo', u'i2pi', u'gaygoygourmet', u'peteskomoroch', u'wahalulu', u'gutelius', u'mattrepl', u'jeffreyhorner'] | |
-------- |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
morewillie has 887 friends and 6949 followers | |
joelkodner has 990 friends and 1586 followers | |
-------- | |
morewillie and joelkodner have 111 friends in common: | |
[u'vinnie', u'wordcampmiami', u'mklopez', u'HazyIT', u'journeyofnow', u'RebekahMonson', u'economist', u'smach241', u'chrisfullman', u'AgustinaP', u'vicequeenmaria', u'bocamike', u'Angiedi', u'pbarbanes', u'malcolli', u'yvetteferry', u'SeaWorld_Parks', u'AlishaVera', u'PalmBchGirl', u'FloriDUHgal', u'KAYENW', u'subeehonee', u'mephjeff', u'vitalex', u'green_architect', u'AllisonNazarian', u'313Nick', u'deadbabydaily', u'Lapp', u'sailgirlcurl', u'amandastewart', u'miamiherald', u'PRDivaRach', u'MisterHirsch', u'bsoler', u'TheTinyJEWELBox', u'UlisesOrozco', u'brianbreslin', u'SternalPR', u'FloridaEats', u'bulabizarro', u'OwenO', u'zsazsaandco', u'andresdavid', u'thewaffle', u'fsutoby', u'ShervinBain', u'BrowardNet', u'elevenser', u'BohoPoetGirl', u'cristnabls', u'Murrayiz', u'johnnybond86', u'jarret23', u'neilkod', u'LoreLama', u'designchicklet', u'moduli |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Load the raw data (tab-separated fields).
raw = load 'emp.txt' using PigStorage('\t') as (empno:int,ename:chararray,job:chararray,sal:int,deptno:int);
-- Group the raw data by deptno. There are only 3 departments (10, 20, 30).
grpd = group raw by deptno;
-- For each deptno (grpd), sort the data by sal in descending order, then limit
-- to 3 rows and return the output.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> foo = 'hello' | |
>>> print foo % 'neil' | |
Traceback (most recent call last): | |
File "<stdin>", line 1, in <module> | |
TypeError: not all arguments converted during string formatting | |
>>> foo = 'hello %s' | |
>>> print foo % 'neil' | |
hello neil | |
>>> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Load one day's worth of tweets and find canabalt scores.
register piggybank-0.3-amzn.jar
DEFINE EXTRACT org.apache.pig.piggybank.evaluation.string.EXTRACT();
raw = LOAD '20100624.txt' USING PigStorage('\t') AS (id:chararray,timestamp:chararray,screenname:chararray,tweet:chararray);
-- Keep only tweets of the form "I ran <digits>m before ... on my ... http://www.canabalt.com/".
fltr = FILTER raw BY tweet matches 'I ran \\d*?m before.* on my.*http://www.canabalt.com/';
-- EXTRACT returns the three capture groups: the digits before "m before",
-- the text following "before", and the text after "on my" that starts with "i"
-- (up to a literal period). FLATTEN turns them into top-level fields.
thedata = FOREACH fltr GENERATE screenname,timestamp,FLATTEN(EXTRACT(tweet,'I ran (\\d*?)m before (.*) .*on my (i.*)\\. .*')),id;
dump thedata
-- sample tweet: http://twitter.com/sfbayrealtor/status/16905379203
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
""" prints hello followed by the value of the first command-line parameter """
import sys

try:
    # A parenthesised single-argument print produces the same output on
    # Python 2 (print statement) and Python 3 (print function).
    print("hello %s" % sys.argv[1])
except IndexError:
    # sys.argv holds only the script name when no parameter was given.
    print("usage: parameter.py name")
MacBookPro:commonTweeps nkodner$ ./parameter.py | |
usage: parameter.py name | |
MacBookPro:commonTweeps nkodner$ ./parameter.py neil |