Created
December 27, 2012 01:15
-
-
Save anonymous/4384620 to your computer and use it in GitHub Desktop.
Try to identify conversations and keywords in irssi logs.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Analyze a log of a text communication, looking for distinct discussions.""" | |
from collections import Counter, deque | |
import re | |
import string | |
import sys | |
# Matches one chat message line of the form "<timestamp> <nick> message",
# e.g. "12:34 <@nick> hello there":
#   timestamp - a run of digits and colons (HH:MM or HH:MM:SS)
#   nick      - inside <...>, after an optional single non-word prefix
#               character (e.g. an "@" or "+" status mark)
#   message   - everything after the one whitespace character following ">"
# The nick class accepts word characters plus "-" and the RFC 2812 "special"
# nickname characters [ ] \ ` ^ { | } ("_" is already covered by \w).
# The square brackets are escaped so newer Python versions do not warn about
# a possible nested set.
LINE_RE = re.compile(r"^(?P<timestamp>[\d:]+)\s"
                     r"<\W?(?P<nick>[\w|^`\[\]{}\\-]+)>\s"
                     r"(?P<message>.*)$")
def get_loglines(logfile):
    """Parse the message lines of *logfile* into a list of dicts.

    *logfile* is any iterable of text lines. Lines that LINE_RE does not
    match are skipped. Each returned dict holds the pattern's named groups,
    with 'timestamp' replaced by an integer.
    """
    parsed = []
    for raw in logfile:
        match = LINE_RE.match(raw.strip())
        if match is None:
            continue
        record = match.groupdict()
        # Fold the colon-separated fields as base-60 digits, most significant
        # first: HH:MM becomes minutes and HH:MM:SS becomes seconds. The unit
        # does not matter as long as one file uses one format throughout.
        # Empty fields count as zero.
        stamp = 0
        for field in record['timestamp'].split(':'):
            stamp = stamp * 60 + (int(field) if field else 0)
        record['timestamp'] = stamp
        parsed.append(record)
    return parsed
def break_lines_by_time(lines, min_separation=10, min_deltafrac=3.0, window=5):
    """Break a sequence of lines into blocks separated by long silences.

    Each line is a dict with an integer 'timestamp'. A line starts a new
    block when the gap since the previous line exceeds both
    ``min_separation`` and ``min_deltafrac`` times the average of the last
    ``window`` within-block gaps.

    As a side effect every line dict gains two diagnostic keys: 'delta'
    (gap to the previous line, 0 for the first line) and 'max_delta' (the
    adaptive threshold that gap was compared against).

    Returns a list of non-empty lists of line dicts; an empty input yields
    an empty list. Raises ValueError if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")

    blocks = []
    current = []
    # Rolling window of the most recent within-block gaps, zero-filled so the
    # average is defined before `window` real gaps have been seen.
    recent_deltas = deque([0] * window, maxlen=window)
    # float() keeps true division on Python 2 when an int is passed.
    scale = float(min_deltafrac) / window
    prev_time = None

    for line in lines:
        stamp = line['timestamp']
        # The first line has no predecessor, so its gap is zero rather than
        # its absolute timestamp.
        delta = 0 if prev_time is None else stamp - prev_time
        threshold = sum(recent_deltas) * scale
        line['max_delta'] = threshold
        line['delta'] = delta

        if delta > threshold and delta > min_separation:
            if current:
                blocks.append(current)
            current = []
            # The silence that caused the split is kept out of the rolling
            # window so it does not inflate the threshold for later lines.
            # NOTE(review): confirm this matches the intended tuning.
        else:
            recent_deltas.append(delta)

        prev_time = stamp
        current.append(line)

    if current:
        blocks.append(current)
    return blocks
def _message_words(message):
    """Split *message* on whitespace, lowercasing and trimming edge punctuation."""
    return [word.lower().strip('.?"!,;:') for word in message.split()]


def _is_keyword(word):
    """Return True if *word* may be reported as a conversation keyword."""
    return (len(word) > 2
            and word[0] in string.ascii_lowercase
            and not word.startswith('http'))


def find_convos(logfile):
    """Identify conversations in *logfile* and print a summary of each.

    The parsed lines are split into conversations by time gaps. For every
    conversation whose first and last timestamps are more than 5 units
    apart, the time span, up to 10 keywords and up to 10 most active
    speakers are printed. Returns None.

    A keyword's score is its count within the conversation divided by the
    square of the number of conversations it appears in, so words that show
    up everywhere rank low.
    """
    lines = get_loglines(logfile)
    if not lines:
        # Nothing matched the log-line pattern, so there is nothing to report.
        return

    time_convos = break_lines_by_time(lines)

    # Count the words of each conversation once, and record in how many
    # conversations each word occurs (its "document frequency").
    convo_counts = []
    docfreqs = Counter()
    for convo in time_convos:
        words = Counter()
        for line in convo:
            words.update(_message_words(line['message']))
        convo_counts.append(words)
        docfreqs.update(set(words))

    for convo, words in zip(time_convos, convo_counts):
        duration = convo[-1]['timestamp'] - convo[0]['timestamp']
        if duration <= 5:
            continue
        nickfreqs = Counter(line['nick'] for line in convo)
        signifs = sorted(
            ((float(count) / (docfreqs[word] ** 2), word)
             for word, count in words.items() if _is_keyword(word)),
            reverse=True)
        # Each print() call gets a single pre-formatted string so the output
        # is the same whether print is a statement or a function.
        print("=" * 80)
        print("%s to %s , duration: %s"
              % (convo[0]['timestamp'], convo[-1]['timestamp'], duration))
        print("Keywords: %s" % ', '.join(x[1] for x in signifs[:10]))
        print("Speakers: %s" % ', '.join(
            "%s(%d)" % (x[0], x[1]) for x in nickfreqs.most_common(10)))
        print("=" * 80)
def main():
    """Analyze the log file named by the first command-line argument.

    Exits with a usage message on stderr when no log file is given.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: %s LOGFILE" % sys.argv[0])
    filename = sys.argv[1]
    with open(filename) as f:
        find_convos(f)
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment