Created
May 8, 2012 12:37
-
-
Save jacobian/2634610 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import gzip | |
def top_paths(filename):
    """Count how often each path occurs in a single gzipped log file.

    Each line is split on whitespace and the seventh field (index 6) is
    used as the path.  NOTE(review): this presumably matches the request
    path position in a common/combined access-log line -- confirm against
    the actual log format.

    Args:
        filename: Path to a gzip-compressed, line-oriented log file.

    Returns:
        A dict mapping each path (``str``) to its number of occurrences.
    """
    counts = {}
    # Open in text mode so the keys are str rather than bytes; undecodable
    # bytes are replaced instead of aborting the whole run.
    with gzip.open(filename, "rt", encoding="utf-8", errors="replace") as f:
        for line in f:
            fields = line.split()
            # Blank or truncated lines have no seventh field; skip them
            # rather than failing with IndexError.
            if len(fields) <= 6:
                continue
            path = fields[6]
            counts[path] = counts.get(path, 0) + 1
    return counts
# Create a grand total by summing the subtotals from each
# file in Data/logs.
totals = {}
for filename in glob.glob("Data/logs/*.gz"):
    for path, count in top_paths(filename).items():
        totals[path] = totals.get(path, 0) + count

# We can't sort a dictionary, so to sort we have to
# convert this structure into a better format.
# totals.items() can turn the dict into tuples, but
# those are (url, count) pairs. list.sort() will
# sort that, but it'll sort by the first item (the url);
# we want to sort by the count. So we build
# (count, url) pairs instead:
totals_by_count = [(count, path) for path, count in totals.items()]

# Now we can sort - remember to reverse so bigger numbers
# come first. Equal counts are ordered by url, also descending.
totals_by_count.sort(reverse=True)

# And display the top 20 items. print is called as a function with a
# single pre-formatted argument, which runs on Python 3 (and on Python 2).
for count, path in totals_by_count[:20]:
    print("%10s %s" % (count, path))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import gzip | |
import collections | |
def top_paths(filename):
    """Count how often each path occurs in a single gzipped log file.

    Each line is split on whitespace and the seventh field (index 6) is
    used as the path.  NOTE(review): this presumably matches the request
    path position in a common/combined access-log line -- confirm against
    the actual log format.

    Args:
        filename: Path to a gzip-compressed, line-oriented log file.

    Returns:
        A ``collections.Counter`` mapping each path (``str``) to its
        number of occurrences.
    """
    counts = collections.Counter()
    # Open in text mode so the keys are str rather than bytes; undecodable
    # bytes are replaced instead of aborting the whole run.
    with gzip.open(filename, "rt", encoding="utf-8", errors="replace") as f:
        # Blank or truncated lines have no seventh field; they are skipped
        # rather than failing with IndexError.
        counts.update(
            fields[6]
            for fields in (line.split() for line in f)
            if len(fields) > 6
        )
    return counts
# Merge the per-file counters into one grand total across every
# gzipped log in Data/logs.
totals = collections.Counter()
for filename in glob.glob("Data/logs/*.gz"):
    totals.update(top_paths(filename))

# Show the 20 most frequent paths, highest count first. print is called
# as a function with a single pre-formatted argument, which runs on
# Python 3 (and on Python 2).
for path, count in totals.most_common(n=20):
    print("%10s %s" % (count, path))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment