Skip to content

Instantly share code, notes, and snippets.

@jacobian
Created May 8, 2012 12:37
Show Gist options
  • Save jacobian/2634610 to your computer and use it in GitHub Desktop.
Save jacobian/2634610 to your computer and use it in GitHub Desktop.
import glob
import gzip
def top_paths(filename):
    """Count how often each path occurs in a single gzipped log file.

    Every line is split on whitespace and its seventh field (index 6) is
    taken as the path. Presumably the input is a web-server access log in
    which that field is the request path -- confirm against the real data.

    Args:
        filename: Path of the gzip-compressed log file to read.

    Returns:
        A dict mapping each path to the number of lines it appeared on.
        The file is opened with gzip's default mode, so the keys have
        whatever type iterating that file object yields.
    """
    counts = {}
    with gzip.open(filename) as f:
        for line in f:
            fields = line.split()
            # Blank or truncated lines have no seventh field; skip them
            # instead of aborting the whole run with an IndexError.
            if len(fields) < 7:
                continue
            path = fields[6]
            counts[path] = counts.get(path, 0) + 1
    return counts
# Create a grand total by summing the subtotals from each
# gzipped file in Data/logs.
totals = {}
for filename in glob.glob("Data/logs/*.gz"):
    for path, count in top_paths(filename).items():
        totals[path] = totals.get(path, 0) + count

# A dict cannot be sorted directly, and totals.items() yields
# (path, count) pairs, which would sort by path. Flip each pair into
# (count, path) so that an ordinary sort orders by count instead.
totals_by_count = [(count, path) for path, count in totals.items()]

# Reverse the sort so the biggest counts come first.
totals_by_count.sort(reverse=True)

# Display the top 20 items. The line is formatted first and handed to
# print as one parenthesised value, which behaves the same whether print
# is the Python 2 statement or the Python 3 function.
for count, path in totals_by_count[:20]:
    print("%10s %s" % (count, path))
import glob
import gzip
import collections
def top_paths(filename):
    """Count how often each path occurs in a single gzipped log file.

    Every line is split on whitespace and its seventh field (index 6) is
    taken as the path. Presumably the input is a web-server access log in
    which that field is the request path -- confirm against the real data.

    Args:
        filename: Path of the gzip-compressed log file to read.

    Returns:
        A collections.Counter mapping each path to the number of lines it
        appeared on.
    """
    counts = collections.Counter()
    with gzip.open(filename) as f:
        split_lines = (line.split() for line in f)
        # Blank or truncated lines have no seventh field; leave them out
        # instead of aborting the whole run with an IndexError.
        counts.update(
            fields[6] for fields in split_lines if len(fields) > 6
        )
    return counts
# Merge the per-file counters from every gzipped log in Data/logs into
# one grand total.
totals = collections.Counter()
for filename in glob.glob("Data/logs/*.gz"):
    totals.update(top_paths(filename))

# Display the 20 most common paths. The line is formatted first and handed
# to print as one parenthesised value, which behaves the same whether
# print is the Python 2 statement or the Python 3 function.
for path, count in totals.most_common(n=20):
    print("%10s %s" % (count, path))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment