sergray · September 10, 2012 11:17
diff --git a/merge_logs.py b/merge_logs.py
 #!/usr/bin/env python
 """
 Combines provided access logs and outputs records sorted by
 time of record.

 Time of the record is expected to be the fourth element in the
 record and must have +0000 timezone.
 """

 import logging
 from datetime import datetime

 # strptime() pattern for the bracketed timestamp. The literal "+0000" means
 # only records stamped in UTC can be parsed.
 TIME_FORMAT = '%d/%b/%Y:%H:%M:%S +0000'

 TIME_IDX = 3  # zero-base index of datetime in access log record

 # Maps the character that opens a multi-word field to the one closing it.
 NEXT_CHAR = {
    '[': ']',
    '"': '"',
 }


 def parse_record(log_record):
    """Return the fields of an access log line as a tuple.

    The line is split on single spaces.  A field wrapped in ``[...]`` or
    ``"..."`` may span several space-separated tokens; it is returned as
    one string without its delimiters.  A bracketed field that spans more
    than one token is parsed with TIME_FORMAT and returned as a
    ``datetime``; a bracketed field made of a single token stays a string.

    A trailing line ending is ignored, and empty tokens produced by
    repeated spaces outside a quoted field are skipped.  Tokens of a field
    that is opened but never closed are dropped.

    Does not make any validation of provided log_record.

    Raises:
        ValueError: a multi-token bracketed field does not match
            TIME_FORMAT.
    """
    parts = []
    acc = []  # tokens of the delimited field currently being collected
    # Drop the line ending first, otherwise the closing delimiter of the
    # last field is never seen and that field is lost.
    for part in log_record.rstrip('\r\n').split(' '):
        if acc:
            # An open field takes precedence: tokens such as "[en]" inside
            # a quoted user agent belong to it and must not start a field.
            acc.append(part)
            opener = acc[0][0]
            if part.endswith(NEXT_CHAR[opener]):
                val = ' '.join(acc)[1:-1]
                if opener == '[':
                    val = datetime.strptime(val, TIME_FORMAT)
                parts.append(val)
                acc = []
        elif not part:
            # Repeated spaces between fields yield empty tokens.
            continue
        elif part[0] in NEXT_CHAR:
            # A lone delimiter character only opens a field; it cannot be
            # both the opener and the closer.
            if len(part) > 1 and part[-1] == NEXT_CHAR[part[0]]:
                parts.append(part[1:-1])
            else:
                acc.append(part)
        else:
            parts.append(part)
    return tuple(parts)


 def merged(*sources):
    """Generator of merged access log records sorted by time.

    Accepts file-like objects (or any iterables of lines) as positional
    arguments.  Each source is expected to be sorted by time already.

    Yields tuples with index of the source and its line, the line having
    its trailing newline removed.  Records with equal timestamps are
    emitted in order of source position.  Sources without any line are
    skipped.

    Every line is parsed with parse_record() to read its timestamp, except
    the lines left in the last remaining source, which are passed through
    unparsed.
    """
    import heapq  # local import: only this generator needs it

    streams = [iter(src) for src in sources]

    # One heap entry per source that still has data:
    # (timestamp, source index, raw line).  The index is unique, so the raw
    # line is never compared when timestamps are equal.
    heap = []
    for idx, stream in enumerate(streams):
        line = next(stream, None)
        if line is None:
            continue
        heap.append((parse_record(line)[TIME_IDX], idx, line))
    heapq.heapify(heap)

    while heap:
        _, idx, line = heapq.heappop(heap)
        yield idx, line.rstrip('\n')
        stream = streams[idx]
        if not heap:
            # Nothing left to interleave with: drain the last source
            # without parsing its remaining lines.
            for line in stream:
                yield idx, line.rstrip('\n')
            return
        line = next(stream, None)
        if line is not None:
            heapq.heappush(heap, (parse_record(line)[TIME_IDX], idx, line))

 if __name__ == '__main__':
    import sys

    # merged() reports positions within `sources`, which leaves out files
    # that could not be opened, so keep a list of names in the same order.
    opened_paths = []
    sources = []
    for path in sys.argv[1:]:
        try:
            fobj = open(path)
        except IOError as exc:
            logging.error('Ignore %s: %s', path, exc)
        else:
            opened_paths.append(path)
            sources.append(fobj)
    try:
        for idx, rec in merged(*sources):
            print("%s:%s" % (opened_paths[idx], rec))
    finally:
        # Release the file handles even if merging fails part way.
        for fobj in sources:
            fobj.close()
	#!/usr/bin/env python
	"""
	Combines provided access logs and outputs records sorted by
	time of record.

	Time of the record is expected to be the fourth element in the
	record and must have +0000 timezone.
	"""

	import logging
	from datetime import datetime

	# strptime() pattern for the bracketed timestamp. The literal "+0000" means
	# only records stamped in UTC can be parsed.
	TIME_FORMAT = '%d/%b/%Y:%H:%M:%S +0000'

	TIME_IDX = 3  # zero-base index of datetime in access log record

	# Maps the character that opens a multi-word field to the one closing it.
	NEXT_CHAR = {
	    '[': ']',
	    '"': '"',
	}


	def parse_record(log_record):
	    """Return the fields of an access log line as a tuple.

	    The line is split on single spaces.  A field wrapped in ``[...]`` or
	    ``"..."`` may span several space-separated tokens; it is returned as
	    one string without its delimiters.  A bracketed field that spans more
	    than one token is parsed with TIME_FORMAT and returned as a
	    ``datetime``; a bracketed field made of a single token stays a string.

	    A trailing line ending is ignored, and empty tokens produced by
	    repeated spaces outside a quoted field are skipped.  Tokens of a field
	    that is opened but never closed are dropped.

	    Does not make any validation of provided log_record.

	    Raises:
	        ValueError: a multi-token bracketed field does not match
	            TIME_FORMAT.
	    """
	    parts = []
	    acc = []  # tokens of the delimited field currently being collected
	    # Drop the line ending first, otherwise the closing delimiter of the
	    # last field is never seen and that field is lost.
	    for part in log_record.rstrip('\r\n').split(' '):
	        if acc:
	            # An open field takes precedence: tokens such as "[en]" inside
	            # a quoted user agent belong to it and must not start a field.
	            acc.append(part)
	            opener = acc[0][0]
	            if part.endswith(NEXT_CHAR[opener]):
	                val = ' '.join(acc)[1:-1]
	                if opener == '[':
	                    val = datetime.strptime(val, TIME_FORMAT)
	                parts.append(val)
	                acc = []
	        elif not part:
	            # Repeated spaces between fields yield empty tokens.
	            continue
	        elif part[0] in NEXT_CHAR:
	            # A lone delimiter character only opens a field; it cannot be
	            # both the opener and the closer.
	            if len(part) > 1 and part[-1] == NEXT_CHAR[part[0]]:
	                parts.append(part[1:-1])
	            else:
	                acc.append(part)
	        else:
	            parts.append(part)
	    return tuple(parts)


	def merged(*sources):
	    """Generator of merged access log records sorted by time.

	    Accepts file-like objects (or any iterables of lines) as positional
	    arguments.  Each source is expected to be sorted by time already.

	    Yields tuples with index of the source and its line, the line having
	    its trailing newline removed.  Records with equal timestamps are
	    emitted in order of source position.  Sources without any line are
	    skipped.

	    Every line is parsed with parse_record() to read its timestamp, except
	    the lines left in the last remaining source, which are passed through
	    unparsed.
	    """
	    import heapq  # local import: only this generator needs it

	    streams = [iter(src) for src in sources]

	    # One heap entry per source that still has data:
	    # (timestamp, source index, raw line).  The index is unique, so the raw
	    # line is never compared when timestamps are equal.
	    heap = []
	    for idx, stream in enumerate(streams):
	        line = next(stream, None)
	        if line is None:
	            continue
	        heap.append((parse_record(line)[TIME_IDX], idx, line))
	    heapq.heapify(heap)

	    while heap:
	        _, idx, line = heapq.heappop(heap)
	        yield idx, line.rstrip('\n')
	        stream = streams[idx]
	        if not heap:
	            # Nothing left to interleave with: drain the last source
	            # without parsing its remaining lines.
	            for line in stream:
	                yield idx, line.rstrip('\n')
	            return
	        line = next(stream, None)
	        if line is not None:
	            heapq.heappush(heap, (parse_record(line)[TIME_IDX], idx, line))


	if __name__ == '__main__':
	    import sys

	    # merged() reports positions within `sources`, which leaves out files
	    # that could not be opened, so keep a list of names in the same order.
	    opened_paths = []
	    sources = []
	    for path in sys.argv[1:]:
	        try:
	            fobj = open(path)
	        except IOError as exc:
	            logging.error('Ignore %s: %s', path, exc)
	        else:
	            opened_paths.append(path)
	            sources.append(fobj)
	    try:
	        for idx, rec in merged(*sources):
	            print("%s:%s" % (opened_paths[idx], rec))
	    finally:
	        # Release the file handles even if merging fails part way.
	        for fobj in sources:
	            fobj.close()
No results found