Skip to content

Instantly share code, notes, and snippets.

@t3rmin4t0r
Created July 27, 2015 23:52
Show Gist options
  • Save t3rmin4t0r/aff31f525a2c26231aeb to your computer and use it in GitHub Desktop.
Save t3rmin4t0r/aff31f525a2c26231aeb to your computer and use it in GitHub Desktop.
Extract the list of files created per application from the HDFS NameNode (NN) logs
import re
import sys, math, os.path
from glob import glob
from itertools import groupby,chain
from collections import defaultdict
import re
def parse(f):
    """Yield ``(client, path)`` pairs for each completed file logged in *f*.

    The file is scanned line by line for entries of the form
    ``DIR* completeFile: <path> is closed by <client>``.

    Args:
        f: Path of the log file to read.

    Yields:
        ``(client, path)`` tuples: the name the file was closed by,
        followed by the path of the completed file.

    Side effects (performed lazily, as the generator is consumed):
        * writes a ``Reading <f>`` progress line to stderr;
        * echoes to stdout every line that mentions ``StateChange: DIR``
          but does not match the ``completeFile`` pattern.
    """
    pattern = re.compile(r'DIR\* completeFile: ([^ ]*) is closed by ([^ ]*)')
    sys.stderr.write("Reading %s\n" % f)
    with open(f) as fp:
        for line in fp:
            m = pattern.search(line.strip())
            if m:
                # Group 2 is the closing client, group 1 the file path.
                yield (m.group(2), m.group(1))
            elif "StateChange: DIR" in line:
                # Single-argument print(...) behaves identically on
                # Python 2 and 3; the line keeps its own newline, so a
                # blank line follows it, exactly as `print l` produced.
                print(line)
def main(args):
    """Print the 20 applications that completed the most files.

    Args:
        args: Log file paths to scan. When empty, every file matching
            ``/var/log/hadoop/hdfs/hadoop-hdfs-namenode-*.log`` is used.

    Output:
        One ``<application id> <count>`` line per application on stdout,
        highest count first (ties ordered by application id, descending).
        Only clients whose name starts with ``DFSClient_attempt`` are
        counted; the application id is built from the third and fourth
        ``_``-separated fields of the client name.
    """
    if not args:
        files = glob("/var/log/hadoop/hdfs/hadoop-hdfs-namenode-*.log")
    else:
        files = args

    # Stream the (client, path) pairs instead of materializing them all:
    # only the per-application counts are needed.
    counts = defaultdict(int)
    for client, _path in chain.from_iterable(parse(f) for f in files):
        if client.startswith("DFSClient_attempt"):
            app_id = "application_" + "_".join(client.split("_")[2:4])
            counts[app_id] += 1

    # Sorting (count, app_id) tuples in descending order and taking the
    # first 20 is the same ranking as ascending sort + last 20 + reverse.
    ranked = sorted(((n, app_id) for app_id, n in counts.items()),
                    reverse=True)
    for n, app_id in ranked[:20]:
        # %-formatting keeps the "<app> <count>" output identical on
        # Python 2 and Python 3.
        print("%s %s" % (app_id, n))
# Script entry point: pass the command-line arguments (log file paths,
# possibly none) straight to main().
if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment