Created
September 10, 2012 11:17
-
-
Save sergray/3690384 to your computer and use it in GitHub Desktop.
Script for combining access logs and outputting their records sorted by time
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
Combines provided access logs and outputs records sorted by
time of record.
Time of the record is expected to be the fourth element in the
record and must have +0000 timezone.
"""
import logging
import sys
from datetime import datetime
# strptime() pattern for the bracketed timestamp of a record. The offset is
# matched literally, so only timestamps written with +0000 will parse.
TIME_FORMAT = '%d/%b/%Y:%H:%M:%S +0000'
TIME_IDX = 3  # zero-based index of datetime in access log record
# Opening delimiter of a multi-word field -> the character that closes it.
NEXT_CHAR = {
    '[': ']',
    '"': '"',
}
def parse_record(log_record):
    """Returns parts of log_record string as tuple.

    The record is split on single spaces. A field that opens with one of
    the NEXT_CHAR keys (``[`` or ``"``) extends up to the first token that
    ends with the matching closing character; the delimiters are removed
    and the inner spaces are kept. A ``[...]`` field that spans several
    tokens is converted to ``datetime`` using TIME_FORMAT.

    A trailing line terminator is ignored and empty tokens outside of a
    delimited field (blank line, repeated spaces) are skipped.

    Does not make any validation of provided log_record: a delimited field
    that is never closed is dropped, and ``datetime.strptime`` raises
    ValueError when a multi-token ``[...]`` field does not match
    TIME_FORMAT.
    """
    parts = []
    acc = []  # tokens of the delimited field currently being collected
    for part in log_record.rstrip('\r\n').split(' '):
        if acc:
            # Inside a delimited field: every token belongs to it, even one
            # that itself starts with '[' or '"' (e.g. "Mozilla/4.08 [en]").
            opener = acc[0][0]
            acc.append(part)
            if part.endswith(NEXT_CHAR[opener]):
                val = ' '.join(acc)[1:-1]
                if opener == '[':
                    val = datetime.strptime(val, TIME_FORMAT)
                parts.append(val)
                acc = []
        elif not part:
            # Produced by consecutive spaces or an empty line; there is no
            # first character to inspect, so skip it.
            continue
        elif part[0] in NEXT_CHAR:
            # A lone delimiter character only opens a field; it must not be
            # mistaken for an already closed one.
            if len(part) > 1 and part[-1] == NEXT_CHAR[part[0]]:
                parts.append(part[1:-1])
            else:
                acc.append(part)
        else:
            parts.append(part)
    return tuple(parts)
def merged(*sources):
    """Generator of merged access log records sorted by time.

    Accepts file-like objects (any iterables of record lines) as positional
    arguments. Only the head records of the sources are compared, so the
    output is ordered by time when every source is itself ordered by time.

    Yields tuples with index of the source and its line, the line being
    stripped of trailing newline characters. While the current source's
    record is not later than the head record of the next source, the
    current source keeps being consumed.

    Every compared line is parsed with parse_record, so errors raised by it
    propagate, as does IndexError for a record with fewer than
    TIME_IDX + 1 fields.
    """
    def record_time(entry):
        # entry is a (parsed record, source index) pair
        return entry[0][TIME_IDX]

    iterators = [iter(src) for src in sources]
    pending = {}  # source index -> raw line read but not yet yielded
    queue = []    # (parsed record, source index), kept ordered by time
    for idx, src in enumerate(iterators):
        line = next(src, None)
        if line is None:
            continue  # empty source contributes nothing
        pending[idx] = line
        queue.append((parse_record(line), idx))
    queue.sort(key=record_time)

    while queue:
        curr_rec, curr_idx = queue[0]
        src = iterators[curr_idx]
        if len(queue) == 1:
            # Last live source: emit the line already read from it, then
            # stream the remainder without parsing.
            yield curr_idx, pending[curr_idx].rstrip('\n')
            for line in src:
                yield curr_idx, line.rstrip('\n')
            return
        next_time = record_time(queue[1])
        while curr_rec[TIME_IDX] <= next_time:
            yield curr_idx, pending[curr_idx].rstrip('\n')
            line = next(src, None)
            if line is None:
                # Current source is exhausted, drop it from the queue.
                del queue[0]
                break
            curr_rec = parse_record(line)
            pending[curr_idx] = line
            queue[0] = (curr_rec, curr_idx)
        # The head entry may now be later than the others (or be gone).
        queue.sort(key=record_time)
if __name__ == '__main__':
    # Usage: script LOG [LOG ...]
    # Prints every record prefixed with the path of the log it came from.
    sources = []
    opened_paths = []  # opened_paths[i] is the path of sources[i]
    paths = sys.argv[1:]
    for path in paths:
        try:
            fobj = open(path)
        except IOError as exc:
            logging.error('Ignore %s: %s', path, exc)
        else:
            sources.append(fobj)
            opened_paths.append(path)
    try:
        # merged() reports indexes into `sources`; unreadable paths are not
        # part of it, so the label must come from opened_paths, not paths.
        for idx, rec in merged(*sources):
            print("%s:%s" % (opened_paths[idx], rec))
    finally:
        for fobj in sources:
            fobj.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment