Instantly share code, notes, and snippets.
Created
August 13, 2013 21:17
-
Star
0
(0)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
Save daviddahl/6225790 to your computer and use it in GitHub Desktop.
Processing code for cookie study stored in Hadoop
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
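# A=cookiemonsterdata; hdfs dfs -text "$A"/*snappy | cut -f3- | process_cookie_data.py > cookie_data.json
# NOTE: the aggregated JSON is written to ./output.json by write_output(), not to
# stdout, so the redirect above does not capture the results.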
import sys | |
import json | |
# Module-level accumulator. The handlers below mutate it in place and the
# script serializes it as JSON once all input has been consumed.
data = {
    # --- set-cookie events -------------------------------------------------
    # int(event["count"]) for every set-cookie event.
    "set_cookie_count": [],
    # int(event["maxage"]) for every event carrying a truthy "maxage".
    "maxage": [],

    # --- distinct domains --------------------------------------------------
    # index -> domain, filled by domain_map().
    "domain_map": {},
    # Distinct domains in first-seen order (used for the membership test).
    "domains": [],
    # Counter read by domain_map() to key new "domain_map" entries.
    "domain_map_i": 0,

    # --- domain/referrer pairs ---------------------------------------------
    # index -> "domain&referrer", filled by domain_pair_map().
    "dp_map": {},
    # Distinct "domain&referrer" strings in first-seen order.
    "pairs": [],
    # "domain&referrer" -> number of events seen with that pair.
    "pair_counts": {},
    # Counter read by domain_pair_map() to key new "dp_map" entries.
    "domain_pair_map_i": 0,

    # --- SOCIAL_WIDGET_LOADED events ---------------------------------------
    # widget -> 1 for every widget seen (set once, not a running count).
    "domain_widget_map": {},
    # [{"widget": <name>, "value": <occurrences>}, ...]
    "domain_widgets": [],
    # Distinct widget names in first-seen order.
    "widgets": [],

    # --- SHARE_URL_LOADED events -------------------------------------------
    # shareURL -> 1 for every share URL seen (set once, not a running count).
    "share_widget_map": {},
    # [{"shareURL": <url>, "value": <occurrences>}, ...]
    "share_domain_widgets": [],
    # Distinct share URLs in first-seen order.
    "share_widgets": [],
}
def set_cookie_count(js):
    """Record one set-cookie event and feed it to the related aggregators.

    Args:
        js: Event payload dict (the "data" member of an input record). Its
            "eventType" and "count" members are read here; the payload is
            then handed to expiry_data(), domain_map() and
            domain_pair_map().

    A payload missing "eventType" or "count" is reported on stderr and
    contributes nothing to "set_cookie_count"; the three follow-up
    aggregators still run for it.
    """
    try:
        if js["eventType"] == "set-cookie":
            data["set_cookie_count"].append(int(js["count"]))
    except KeyError:
        # Newline-terminated so consecutive diagnostics stay on separate lines.
        sys.stderr.write("KeyError in line: %s\n" % json.dumps(js))
    # Process additional data that relates to set-cookie:
    expiry_data(js)
    domain_map(js)
    domain_pair_map(js)
def process_data(js):
    """Dispatch one decoded input record to the handler for its event type.

    Args:
        js: Decoded input record. Its "data" member is the event payload,
            and the payload's "eventType" selects the handler.

    Raises:
        KeyError: If the record has no "data" member or the payload has no
            "eventType" member.
    """
    event = js["data"]

    def not_found(payload):
        # Fallback for event types without a dedicated handler.
        sys.stderr.write("%s action not found." % payload["eventType"])

    handlers = {
        'set-cookie': set_cookie_count,
        'SHARE_URL_LOADED': share_widgets,
        'SOCIAL_WIDGET_LOADED': social_widgets,
    }
    handler = handlers.get(event["eventType"], not_found)
    handler(event)
def write_output(data, path="output.json"):
    """Serialize *data* as JSON and write it to *path*.

    Args:
        data: JSON-serializable object to write.
        path: Destination file. Defaults to "output.json" in the current
            working directory; an existing file is overwritten.

    Raises:
        TypeError: If *data* contains values json.dumps() cannot serialize.
        IOError/OSError: If the file cannot be opened or written.
    """
    # Serialize first so a serialization error does not truncate an
    # existing output file.
    payload = json.dumps(data)
    # "w" is the portable mode string; the context manager closes (and
    # thereby flushes) the file even if the write raises.
    with open(path, "w") as out:
        out.write(payload)
def expiry_data(js):
    """Record the max-age of one event when it carries a usable value.

    Args:
        js: Event payload dict. A truthy "maxage" member is converted with
            int() and appended to data["maxage"].

    Payloads with a missing, falsy or non-numeric "maxage" are skipped
    silently; this aggregator is best-effort.
    """
    try:
        maxage = js["maxage"]
        if maxage:
            data["maxage"].append(int(maxage))
    except (KeyError, TypeError, ValueError, OverflowError):
        # KeyError: no "maxage"; TypeError/ValueError: not convertible to
        # int; OverflowError: infinite float. Anything else (including
        # KeyboardInterrupt) is no longer swallowed.
        pass
def domain_pair_map(js):
    """Count how often each domain/referrer pair occurs.

    Args:
        js: Event payload dict with string "domain" and "referrer" members.

    The pair is keyed as "domain&referrer". The first time a pair is seen
    it is appended to data["pairs"], stored in data["dp_map"] under the next
    free index and given a count of 1 in data["pair_counts"]; later
    occurrences only increment the count. Payloads missing either member,
    or carrying non-string values, are reported on stderr and skipped.
    """
    try:
        pair = "&".join([js["domain"], js["referrer"]])
    except (KeyError, TypeError) as e:
        sys.stderr.write("domain_pair_map: %s: %s\n" % (type(e).__name__, e))
        return

    counts = data["pair_counts"]
    # "pair_counts" holds exactly the pairs listed in "pairs", so the dict
    # gives the same answer as scanning the list, without the linear search.
    if pair in counts:
        counts[pair] += 1
        return

    index = data["domain_pair_map_i"]
    data["pairs"].append(pair)
    data["dp_map"][index] = pair
    counts[pair] = 1
    # Persist the advanced index; otherwise every new pair would be stored
    # under the same "dp_map" key and overwrite the previous one.
    data["domain_pair_map_i"] = index + 1
def domain_map(js):
    """Map out the domains so we can reference them by number.

    Args:
        js: Event payload dict with a "domain" member.

    Each domain not yet listed in data["domains"] is appended there and
    stored in data["domain_map"] under the next free index. Payloads without
    a "domain" member are reported on stderr and skipped.
    """
    try:
        domain = js["domain"]
    except (KeyError, TypeError) as e:
        sys.stderr.write("domain_map: %s: %s\n" % (type(e).__name__, e))
        return

    if domain in data["domains"]:
        return

    index = data["domain_map_i"]
    data["domain_map"][index] = domain
    data["domains"].append(domain)
    # Persist the advanced index; otherwise every new domain would be stored
    # under the same "domain_map" key and overwrite the previous one.
    data["domain_map_i"] = index + 1
def social_widgets(js):
    """Make SOCIAL_WIDGET data pie-chartable.

    Args:
        js: Event payload dict with a "widget" member.

    The first time a widget is seen it is marked with 1 in
    data["domain_widget_map"], appended to data["widgets"], and given a
    {"widget": <name>, "value": 1} entry in data["domain_widgets"]. Later
    occurrences increment that entry's "value". Payloads without a usable
    "widget" member are reported on stderr and skipped.
    """
    try:
        widget = js["widget"]
        seen = data["widgets"]
        slices = data["domain_widgets"]
        if widget not in seen:
            # Write the map first: it is the only step that can reject the
            # value (unhashable), so a failure leaves nothing half-recorded.
            data["domain_widget_map"][widget] = 1
            slices.append({"widget": widget, "value": 1})
            seen.append(widget)
            return
        for entry in slices:
            if entry["widget"] == widget:
                entry["value"] += 1
                break  # each widget has exactly one entry
    except (KeyError, TypeError) as e:
        sys.stderr.write("social_widgets: %s: %s\n" % (type(e).__name__, e))
def share_widgets(js):
    """Make SHARE_WIDGET data pie-chartable.

    Args:
        js: Event payload dict. Only payloads whose "eventType" is
            "SHARE_URL_LOADED" are counted; their "shareURL" member is the
            key.

    The first time a share URL is seen it is marked with 1 in
    data["share_widget_map"], appended to data["share_widgets"], and given a
    {"shareURL": <url>, "value": 1} entry in data["share_domain_widgets"].
    Later occurrences increment that entry's "value". Payloads missing a
    required member are reported on stderr and skipped.
    """
    try:
        if js["eventType"] != "SHARE_URL_LOADED":
            return
        url = js["shareURL"]
        seen = data["share_widgets"]
        slices = data["share_domain_widgets"]
        if url not in seen:
            # Write the map first: it is the only step that can reject the
            # value (unhashable), so a failure leaves nothing half-recorded.
            data["share_widget_map"][url] = 1
            slices.append({"shareURL": url, "value": 1})
            seen.append(url)
            return
        for entry in slices:
            if entry["shareURL"] == url:
                entry["value"] += 1
                break  # each share URL has exactly one entry
    except (KeyError, TypeError) as e:
        # Diagnostics go to stderr like the other handlers; stdout may be
        # redirected to a data file by the caller.
        sys.stderr.write("share_widgets: %s: %s\n" % (type(e).__name__, e))
if __name__ == "__main__":
    # Each non-blank stdin line is one JSON record; aggregate them all, then
    # write the accumulated module-level `data` out once.
    for line in sys.stdin:
        if not line.strip():
            # A blank line (e.g. a trailing newline in the stream) is not
            # valid JSON and would otherwise abort the run before any
            # output is written.
            continue
        js = json.loads(line)
        process_data(js)
    write_output(data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment