import sys
import time
import json
from glob import glob
from urlparse import urlparse
from itertools import groupby
from collections import defaultdict

DAY = 15
fileName = "usagov_bitly"
def parse_dayslice(dataslice_file_name):
    """Parse one day-slice file of usa.gov/bit.ly click data (one JSON record per line).

    Returns (all_data, domain_count, urls_for_domain), where each all_data row is
    [timestamp, root_domain, url, lat, lon, browsval].
    """
    all_data = []
    with open(dataslice_file_name) as f:
        datapoints_dayslice = f.read().split("\n")
    win, fail = 0, 0
    domain_count = defaultdict(int)
    urls_for_domain = defaultdict(list)
    for datapoint in datapoints_dayslice:
        try:
            entry = json.loads(datapoint)
            url = entry["u"]
            brows = entry["a"]
            timestamp = entry["t"]
            try:
                lat, lon = entry["ll"]
                win += 1
            except (KeyError, TypeError, ValueError):
                lat, lon = 9999, 9999  # sentinel value for entries with no lat/lon
                fail += 1
            # collapse the user-agent string into a single integer
            browsval = sum(ord(c) for c in brows)
            parsed_result = urlparse(url)
            root_domain = ".".join(parsed_result.netloc.split(".")[-2:])
            domain_count[root_domain] += 1
            urls_for_domain[root_domain + "__urls"].append(url)
            all_data.append([float(timestamp), root_domain, url, lat, lon, browsval])
            #print timestamp, lat, lon, browsval
        except (ValueError, KeyError):
            pass  #print "Failed to parse:\n", datapoint
    #print "win, fail: ", win, fail
    return all_data, domain_count, urls_for_domain
def _ordered_urls_for_root_domain(root_domain, data):
    """Return only the rows for `root_domain`, ordered by timestamp."""
    result = []
    for (timestamp, root, url, lat, lon, browsval) in sorted(data, key=lambda x: x[0]):
        if root == root_domain:
            result.append([timestamp, root, url, lat, lon, browsval])
    return result
def full_day_ordered_urls_for_root_domain(root_domain, full_day_datafile_regex):
    """All clicks for `root_domain` across a full day of slice files, ordered by timestamp.

    `full_day_datafile_regex` is a glob pattern matching the day-slice files.
    """
    result = []
    all_dataslice_files = sorted(glob(full_day_datafile_regex))
    for filename in all_dataslice_files:
        print "PROCESSING: ", filename, "..."
        slice_data, domain_count, urls_for_domain = parse_dayslice(filename)
        slice_data_ordered = _ordered_urls_for_root_domain(root_domain, slice_data)
        result.extend(slice_data_ordered)
    return result
def top_urls_for_day_and_root_domain(data, topn=10):
    """Return up to `topn` (url, click_count) pairs, most-clicked first."""
    only_urls = [e[2] for e in data]
    unique_urls = set(only_urls)
    counts = [(e, only_urls.count(e)) for e in unique_urls]
    result = sorted(counts, key=lambda x: x[1], reverse=True)
    return result[:topn]
def bin_data(full_day_data_topn_only, topn_urls, bins):
    """Split the day into `bins` equal time windows and count clicks per URL in each."""
    result = defaultdict(list)
    delta = (24 * 60 * 60) / bins  # bin width in seconds (~21s when bins == 4096)
    start_timestamp = full_day_data_topn_only[0][0]
    current_bin_max_timestamp = start_timestamp + delta
    url_count_for_bin = dict([(url, 0) for url in topn_urls])  # reset
    for current_data_element in full_day_data_topn_only:
        current_data_timestamp = current_data_element[0]
        while current_data_timestamp > current_bin_max_timestamp:
            # close out the current bin and advance to the next one
            current_bin_max_timestamp += delta
            for (url, count) in url_count_for_bin.iteritems():
                result[url].append(count)
            url_count_for_bin = dict([(url, 0) for url in topn_urls])  # reset
        url = current_data_element[2]
        url_count_for_bin[url] += 1
    # flush the final (possibly partial) bin
    for (url, count) in url_count_for_bin.iteritems():
        result[url].append(count)
    return result
def write_csound_files(data, root_domain, date_str, bins):
    """Write one Csound score file per URL: an f-statement (GEN02) holding the
    normalized per-bin click counts."""
    for (url, clicks) in data.iteritems():
        fname = date_str + "-" + root_domain + "-" + url.replace("/", "_") + ".cso"
        max_clicks = max(clicks)
        normalized_clicks = [str(float(click) / float(max_clicks)) for click in clicks]
        clicks_str = " ".join(normalized_clicks)
        with open(fname, "w") as fhandle:
            fhandle.write("f 1 0 %d 2 %s" % (bins, clicks_str))
if __name__ == "__main__":
    print sys.argv
    if len(sys.argv) == 5:
        root_domain, topn, date_str, bins = sys.argv[1:]
        print "Using root_domain='%s', topn='%s', date_str='%s' bins='%s'" % (root_domain, topn, date_str, bins)
    else:
        root_domain, topn, date_str, bins = "nasa.gov", "5", "2011-07-25", "4096"
        print "[DEFAULT] Using root_domain='%s', topn='%s', date_str='%s' bins='%s'" % (root_domain, topn, date_str, bins)
    topn, bins = int(topn), int(bins)
    time.sleep(2)
    glob_str = "usagov_bitly_data%s-*" % date_str
    _full_day_data = full_day_ordered_urls_for_root_domain(root_domain, glob_str)
    _top_urls_count = top_urls_for_day_and_root_domain(_full_day_data, topn=topn)
    _top_urls = [e[0] for e in _top_urls_count]
    _full_day_data_topn_only = []
    print "top_urls: ", _top_urls, "\n"
    for e in _full_day_data:
        if e[2] in _top_urls:
            _full_day_data_topn_only.append(e)
    print "full_day_data length: ", len(_full_day_data)
    print "full_day_data_topn_only length: ", len(_full_day_data_topn_only)
    time.sleep(1)
    final_data = bin_data(_full_day_data_topn_only, _top_urls, bins)
    write_csound_files(final_data, root_domain, date_str, bins)
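
# ---------------------------------------------------------------------------
# Usage sketch (assumptions, not part of the original gist): the day-slice
# files are assumed to sit in the working directory with names matching
# "usagov_bitly_data<date_str>-*", and a Python 2 interpreter is assumed,
# since the script uses `print` statements, `urlparse`, and dict.iteritems().
# The script file name below is only illustrative.
#
#   python usagov_bitly_to_csound.py nasa.gov 5 2011-07-25 4096
#
# Each resulting .cso file contains a single Csound score f-statement whose
# GEN02 values are the click counts per time bin, normalized to [0, 1], e.g.
# (values illustrative only):
#
#   f 1 0 4096 2 0.25 1.0 0.5 ...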