Skip to content

Instantly share code, notes, and snippets.

Created December 27, 2012 01:15
Show Gist options
  • Save anonymous/4384620 to your computer and use it in GitHub Desktop.
Save anonymous/4384620 to your computer and use it in GitHub Desktop.
Try to identify conversations and keywords in irssi logs.
"""Analyze a log of a text communication, looking for distinct discussions."""
from collections import Counter, deque
import re
import string
import sys
# Matches one irssi-style message line: "HH:MM[:SS] <[prefix]nick> message".
#   timestamp - digits and colons only; converted to an integer by the parser
#   prefix    - at most one non-word character before the nick (e.g. "@", "+"
#               or a padding space).  It is matched lazily so that a nick
#               which itself starts with a special character, such as
#               "[foo]", keeps its first character.
#   nick      - word characters plus "-" and the RFC 2812 "special"
#               characters  [ ] \ ` ^ { | }
#   message   - the rest of the line
# Lines that do not have this shape (joins, parts, day changes, ...) do not
# match.
LINE_RE = re.compile(
    r"^(?P<timestamp>[\d:]+)\s"
    r"<\W??(?P<nick>[\w\-\[\]\\`^{|}]+)>\s"
    r"(?P<message>.*)$"
)
def get_loglines(logfile):
    """Parse the IRC message lines of *logfile* into a list of dicts.

    *logfile* is iterated line by line.  Every line that matches LINE_RE
    yields a dict with the keys 'timestamp', 'nick' and 'message'; lines
    that do not match are skipped.  The 'timestamp' value is replaced by
    an integer (see below); the other two values stay strings.
    """
    parsed = []
    for raw in logfile:
        match = LINE_RE.match(raw.strip())
        if match is None:
            continue
        entry = match.groupdict()
        # Fold the colon-separated fields (HH:MM or HH:MM:SS) into a single
        # base-60 integer.  Whether the unit ends up being minutes or
        # seconds does not matter as long as it is consistent in one file.
        total = 0
        for field in entry['timestamp'].split(':'):
            total = total * 60 + int(field.lstrip('0') or 0)
        entry['timestamp'] = total
        parsed.append(entry)
    return parsed
def break_lines_by_time(lines, min_separation=10, min_deltafrac=3.0, window=5):
    """Split *lines* into consecutive blocks wherever there is a long pause.

    A new block starts at a line whose gap to the previous line is both
    greater than *min_separation* and greater than *min_deltafrac* times
    the mean of the last *window* recorded gaps.

    Args:
        lines: iterable of dicts, each with an integer 'timestamp' key.
            Every dict is annotated in place with 'delta' (gap to the
            previous line) and 'max_delta' (the adaptive threshold that
            applied to it).
        min_separation: absolute gap a break must exceed.
        min_deltafrac: multiple of the recent mean gap a break must exceed.
        window: how many recent gaps are averaged; must be at least 1.

    Returns:
        A list of non-empty lists of the input dicts, in the original
        order.  Empty input gives an empty list.

    Raises:
        ValueError: if *window* is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")

    # float() keeps this a true division even on Python 2 with int arguments.
    threshold_scale = float(min_deltafrac) / window

    blocks = []
    line_buffer = []
    # Only the most recent timestamp is ever consulted, so a scalar suffices.
    # It starts at 0, which means the first line's 'delta' is its own
    # timestamp.
    prev_time = 0
    last_deltas = deque([0] * window, maxlen=window)

    for line in lines:
        time_delta = line['timestamp'] - prev_time
        max_allowed_delta = sum(last_deltas) * threshold_scale
        line['max_delta'] = max_allowed_delta
        line['delta'] = time_delta

        if time_delta > max_allowed_delta and time_delta > min_separation:
            # Long pause: close the current block (if it has any lines).
            if line_buffer:
                blocks.append(line_buffer)
            line_buffer = []
        else:
            # NOTE(review): the original indentation was lost; this assumes
            # gaps are recorded only for lines that do NOT start a new
            # block, so a break's large gap never inflates the running
            # mean - confirm against the original file.
            last_deltas.append(time_delta)

        prev_time = line['timestamp']
        line_buffer.append(line)

    if line_buffer:
        blocks.append(line_buffer)
    return blocks
def _tokenize(message):
    """Split *message* on whitespace; lowercase and trim edge punctuation."""
    return [word.lower().strip('.?"!,;:') for word in message.split()]


def _is_keyword_candidate(token):
    """Return True for tokens of 3+ chars that start with a-z and are not URLs."""
    return (len(token) > 2
            and token[0] in string.ascii_lowercase
            and not token.startswith('http'))


def find_convos(logfile, min_duration=5):
    """Identify conversations in *logfile* and print a summary of each.

    The log is parsed with get_loglines() and split into conversations
    with break_lines_by_time().  For every conversation whose duration
    (last timestamp minus first) is greater than *min_duration*, the
    start and end timestamps, up to ten keywords and up to ten speakers
    with their line counts are printed to standard output.

    A keyword's score is its count inside the conversation divided by the
    square of the number of conversations it appears in, so terms that
    are concentrated in few conversations rank highest.  Only tokens
    accepted by _is_keyword_candidate() are ranked.

    Args:
        logfile: iterable of log lines (e.g. an open file).
        min_duration: conversations must last longer than this many
            timestamp units to be reported.

    Returns:
        None.  A log without any parseable message line prints nothing.
    """
    lines = get_loglines(logfile)
    if not lines:
        # Nothing to analyze; also avoids handing an empty list downstream.
        return

    # Corpus-wide counts of the tokens that may be reported as keywords.
    allfreqs = Counter()
    for line in lines:
        allfreqs.update(
            tok for tok in _tokenize(line['message'])
            if _is_keyword_candidate(tok))

    time_convos = break_lines_by_time(lines)

    # Document frequency: in how many conversations does each token occur?
    docfreqs = Counter()
    for convo in time_convos:
        seen = set()
        for line in convo:
            seen.update(_tokenize(line['message']))
        docfreqs.update(seen)

    rule = "=" * 80
    for convo in time_convos:
        convofreqs = Counter()
        for line in convo:
            convofreqs.update(_tokenize(line['message']))
        nickfreqs = Counter(line['nick'] for line in convo)

        signifs = sorted(
            ((float(convofreqs[k]) / (docfreqs[k] ** 2), k)
             for k in convofreqs if allfreqs[k] > 0),
            reverse=True)

        start = convo[0]['timestamp']
        end = convo[-1]['timestamp']
        duration = end - start
        if duration > min_duration:
            # Single-argument print calls behave the same on Python 2 and 3.
            print(rule)
            print("%s to %s , duration: %s" % (start, end, duration))
            print("Keywords: " + ', '.join(x[1] for x in signifs[:10]))
            print("Speakers: " + ', '.join(
                "%s(%d)" % (x[0], x[1]) for x in nickfreqs.most_common(10)))
            # NOTE(review): the original indentation was lost; this assumes
            # the closing rule is printed once per reported conversation
            # rather than once after the whole loop - confirm.
            print(rule)
def main():
    """Command-line entry point: analyze the log file named by sys.argv[1].

    Exits with a usage message on standard error when no file name is
    given, instead of failing with an IndexError.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: %s LOGFILE" % sys.argv[0])
    with open(sys.argv[1]) as f:
        find_convos(f)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment