Created
April 22, 2013 01:53
-
-
Save DrDub/5431945 to your computer and use it in GitHub Desktop.
Preprocessing of #tikiwiki logs for use with the chat disentangler available at http://www.ling.ohio-state.edu/~melsner/resources/chat-manual.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
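#converts a chatlog (originally gaim, here #tikiwiki IRC logs) to a more ethical anonymized version
#format of each input line is
#[datestamp timestamp] <name> comment
#or
#[datestamp timestamp] *** name action
#format of each output line is
#seconds alias : comment
#or
#seconds alias * action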
from random import shuffle | |
from sys import argv | |
import re | |
#from time import * | |
import datetime | |
def timeToSecs(timeStr, prevTime):
    """Convert a "YYYY-MM-DD HH:MM" stamp to strictly increasing seconds.

    The stamp format carries no seconds field, so consecutive log lines can
    parse to the same value.  To keep the sequence strictly increasing, a
    result that does not exceed prevTime is bumped to prevTime + 1.

    timeStr  -- timestamp text in "%Y-%m-%d %H:%M" format
    prevTime -- the value returned for the previous line (0 for the first)

    Returns whole seconds between 1970-01-01 00:00 and the parsed (naive)
    timestamp, or prevTime + 1 if that would not be greater than prevTime.
    Raises ValueError (from strptime) when timeStr does not match the format.
    """
    t = datetime.datetime.strptime(timeStr, "%Y-%m-%d %H:%M")
    # int() rather than long(): long() only exists on Python 2, and int()
    # yields the same numeric value there.
    seconds = int((t - datetime.datetime(1970, 1, 1)).total_seconds())
    # Same result as stepping one second at a time until seconds > prevTime,
    # without looping over the whole gap.
    if seconds <= prevTime:
        seconds = prevTime + 1
    return seconds
#read the names list (got from the US census and preprocessed a little bit)
nameFile = "data/names"
# 'with open' closes the handle as soon as the list is built
with open(nameFile) as nameHandle:
    names = [x.rstrip().title() for x in nameHandle]
# shuffle so the real-nick -> alias assignment differs from run to run
shuffle(names)

#load up the file
chatFile = argv[1]

# maps every real nick seen so far to its anonymized replacement
aliases = {}

# The gaim intro line (own name, server id, true start time) is not parsed
# for these logs; the channel name is fixed and the clock starts at zero.
channelName = "tiki"
# the channel maps to itself so mentions of it are left unchanged
aliases[channelName] = channelName
trueTime = 0

# Capture groups:
#   1 timestamp text inside [...]
#   2 whole speaker token: "<nick>" or "*** nick" (one to three stars)
#   3 "<nick>" including the angle brackets
#   4 the bare nick of a comment line
#   5 the bare nick of an action line
#   6 the rest of the line
basicRE = re.compile(r"\[([^\]]+)\] ((<([^>]+)>)|\*\*?\*? ([^\s]+))(.*)")
renameRE = re.compile(r"is now known as ([^\s:]+)")

with open(chatFile) as chat:
    print("Processing %s" % chatFile)

    for line in chat:
        match = basicRE.match(line)
        if match is None:
            # explicit error instead of assert: asserts vanish under -O
            raise ValueError("unrecognized log line: %r" % line)

        # Take the BARE nick (group 4) for comment lines.  Using group 3
        # ("<nick>" with brackets) would key speakers differently from the
        # bare nicks recorded for joins/renames, giving one person several
        # aliases.
        (time, full_name, ignore, name_comment, action_name, rest) = match.groups()
        name = name_comment if name_comment else action_name

        trueTime = timeToSecs(time, trueTime)

        try:
            alias = aliases[name]
        except KeyError:
            alias = names.pop()
            aliases[name] = alias

        if "is now known as" not in rest:
            #obnoxiously, people can readopt others' nicknames
            #and then you get cross-aliasing
            for known in list(aliases):
                if known in rest:
                    # only replace the nick when it is not embedded in a
                    # longer alphabetic word
                    namepatt = re.compile("(^|[^a-zA-Z]+)%s([^a-zA-Z]+|$)" %
                                          re.escape(known))
                    replacement = aliases[known]
                    # function replacement: the alias is inserted literally,
                    # so digits or backslashes in it cannot be misread as
                    # group references
                    rest = namepatt.sub(
                        lambda m: m.group(1) + replacement + m.group(2),
                        rest)

        if '*' not in full_name:
            # ordinary comment line
            print("%s %s : %s" % (trueTime, alias, rest))
        else:
            if rest.endswith("has joined #tikiwiki"):
                rest = " entered the room."
            elif "is now known as" in rest:
                renamed = renameRE.search(rest)
                if renamed is None:
                    raise ValueError("nick change without a new nick: %r" % line)
                newName = renamed.group(1)
                # the new nick keeps the alias of the old one
                aliases[newName] = alias
                rest = rest.replace(newName, alias)
            print("%s %s * %s" % (trueTime, alias, rest))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment