DrDub · April 22, 2013 01:53
diff --git a/stripChatTikiNonAnon.py b/stripChatTikiNonAnon.py
 #!/usr/bin/env python

 #converts a gaim chatlog to a more ethical anonymized version
 #format of each input line is
 #[datestamp timestamp] <name> comment
 #or
 #[datestamp timestamp] *** name action
 #format of each output line is
 #seconds alias : comment
 #or
 #seconds alias * action

 from random import shuffle
 from sys import argv
 import re
 #from time import *
 import datetime

 def timeToSecs(timeStr, prevTime):
    """Convert a log timestamp to strictly increasing epoch seconds.

    timeStr  -- timestamp text in "%Y-%m-%d %H:%M" form
    prevTime -- integer seconds returned for the previous line (0 at start)

    Returns the whole seconds between 1970-01-01 00:00 and timeStr, computed
    on the naive timestamp (no timezone conversion). If that value is not
    greater than prevTime, prevTime + 1 is returned instead, so successive
    results never repeat or go backwards.

    Raises ValueError (from strptime) when timeStr is not in that format.
    """
    t = datetime.datetime.strptime(timeStr, "%Y-%m-%d %H:%M")
    #int() promotes to long by itself on Python 2 and also exists on Python 3
    seconds = int((t - datetime.datetime(1970, 1, 1)).total_seconds())
    #stamps only have minute resolution, so lines within the same minute are
    #spread out one second apart; jump straight past prevTime rather than
    #counting up to it one second at a time
    if seconds <= prevTime:
        seconds = prevTime + 1
    return seconds

 #read the names list (got from the US census and preprocessed a little bit)
 nameFile = "data/names"
 with open(nameFile) as nameHandle:
    names = [x.rstrip().title() for x in nameHandle]
 #aliases are popped off the end of this shuffled list
 shuffle(names)

 #load up the file
 chatFile = argv[1]
 chat = open(chatFile)

 #a single pre-formatted argument prints the same text whether print is a
 #statement or a function
 print("Processing %s" % chatFile)

 #maps every nickname seen so far to the alias printed in its place
 aliases = {}

 #parsing of the gaim intro line (my name, server id, true start time) is
 #disabled, so the channel name is fixed and the clock starts at 0
 channelName = "tiki"
 #the channel name maps to itself, so it is never swapped for a census name
 aliases[channelName] = channelName
 trueTime = 0

 #groups: 1 = timestamp, 2 = whole speaker field, 3 = "<nick>" with brackets,
 #4 = bare nick of a comment line, 5 = bare nick of an action line, 6 = rest
 basicRE = re.compile(r"\[([^\]]+)\] ((<([^>]+)>)|\*\*?\*? ([^\s]+))(.*)")
 renameRE = re.compile(r"is now known as ([^\s:]+)")

 def anonymizeMentions(text, aliases):
    """Return text with every known nickname replaced by its alias.

    text    -- the part of a chat line after the speaker
    aliases -- dict mapping real nicknames to their aliases

    A nickname only counts as a mention when it is not directly preceded or
    followed by an ASCII letter. All nicknames are replaced in one pass, so
    text inserted for one nickname is never rewritten again because it
    happens to equal another nickname.
    """
    mentioned = [nick for nick in aliases if nick in text]
    if not mentioned:
        return text
    #longest first so that a nick which is a prefix of another one cannot
    #win the alternation
    mentioned.sort(key=len, reverse=True)
    #lookarounds do not consume the separators, so back-to-back mentions
    #such as "bob bob" are both replaced
    nickPatt = re.compile("(?<![a-zA-Z])(?:%s)(?![a-zA-Z])" %
                          "|".join(re.escape(nick) for nick in mentioned))
    #a function replacement keeps backslashes and digits in an alias literal
    return nickPatt.sub(lambda found: aliases[found.group(0)], text)

 with chat:
    for lineNumber, line in enumerate(chat, 1):
        match = basicRE.match(line)
        if not match:
            #explicit error: an assert disappears under python -O
            raise ValueError("%s:%d: unrecognized chat line %r" %
                             (chatFile, lineNumber, line))
        (time, full_name, ignore, comment_name, action_name, rest) = match.groups()
        isComment = comment_name is not None
        #key on the bare nick for both kinds of line, so that what somebody
        #says and what somebody does end up under the same alias
        name = comment_name if isComment else action_name

        trueTime = timeToSecs(time, trueTime)

        try:
            alias = aliases[name]
        except KeyError:
            alias = names.pop()
            aliases[name] = alias

        #rename actions are handled below, where the new nick is registered
        #before it gets replaced
        isRename = (not isComment) and "is now known as" in rest
        if not isRename:
            #obnoxiously, people can readopt others' nicknames
            #and then you get cross-aliasing
            rest = anonymizeMentions(rest, aliases)

        if isComment:
            print("%s %s : %s" % (trueTime, alias, rest))
        else:
            if rest.endswith("has joined #tikiwiki"):
                rest = " entered the room."
            elif isRename:
                renamed = renameRE.search(rest)
                if not renamed:
                    raise ValueError("%s:%d: cannot find the new nickname in %r" %
                                     (chatFile, lineNumber, line))
                newName = renamed.group(1)
                #the new nick shares the old nick's alias from here on
                aliases[newName] = alias
                #splice the alias over the matched nick only; str.replace
                #would also rewrite any other text containing the new nick
                rest = rest[:renamed.start(1)] + alias + rest[renamed.end(1):]
            print("%s %s * %s" % (trueTime, alias, rest))
	#!/usr/bin/env python

	#converts a gaim chatlog to a more ethical anonymized version
	#format of each input line is
	#[datestamp timestamp] <name> comment
	#or
	#[datestamp timestamp] *** name action
	#format of each output line is
	#seconds alias : comment
	#or
	#seconds alias * action

	from random import shuffle
	from sys import argv
	import re
	#from time import *
	import datetime

	def timeToSecs(timeStr, prevTime):
	    """Convert a log timestamp to strictly increasing epoch seconds.

	    timeStr  -- timestamp text in "%Y-%m-%d %H:%M" form
	    prevTime -- integer seconds returned for the previous line (0 at start)

	    Returns the whole seconds between 1970-01-01 00:00 and timeStr, computed
	    on the naive timestamp (no timezone conversion). If that value is not
	    greater than prevTime, prevTime + 1 is returned instead, so successive
	    results never repeat or go backwards.

	    Raises ValueError (from strptime) when timeStr is not in that format.
	    """
	    t = datetime.datetime.strptime(timeStr, "%Y-%m-%d %H:%M")
	    #int() promotes to long by itself on Python 2 and also exists on Python 3
	    seconds = int((t - datetime.datetime(1970, 1, 1)).total_seconds())
	    #stamps only have minute resolution, so lines within the same minute are
	    #spread out one second apart; jump straight past prevTime rather than
	    #counting up to it one second at a time
	    if seconds <= prevTime:
	        seconds = prevTime + 1
	    return seconds

	#read the names list (got from the US census and preprocessed a little bit)
	nameFile = "data/names"
	with open(nameFile) as nameHandle:
	    names = [x.rstrip().title() for x in nameHandle]
	#aliases are popped off the end of this shuffled list
	shuffle(names)

	#load up the file
	chatFile = argv[1]
	chat = open(chatFile)

	#a single pre-formatted argument prints the same text whether print is a
	#statement or a function
	print("Processing %s" % chatFile)

	#maps every nickname seen so far to the alias printed in its place
	aliases = {}

	#parsing of the gaim intro line (my name, server id, true start time) is
	#disabled, so the channel name is fixed and the clock starts at 0
	channelName = "tiki"
	#the channel name maps to itself, so it is never swapped for a census name
	aliases[channelName] = channelName
	trueTime = 0

	#groups: 1 = timestamp, 2 = whole speaker field, 3 = "<nick>" with brackets,
	#4 = bare nick of a comment line, 5 = bare nick of an action line, 6 = rest
	basicRE = re.compile(r"\[([^\]]+)\] ((<([^>]+)>)|\*\*?\*? ([^\s]+))(.*)")
	renameRE = re.compile(r"is now known as ([^\s:]+)")

	def anonymizeMentions(text, aliases):
	    """Return text with every known nickname replaced by its alias.

	    text    -- the part of a chat line after the speaker
	    aliases -- dict mapping real nicknames to their aliases

	    A nickname only counts as a mention when it is not directly preceded or
	    followed by an ASCII letter. All nicknames are replaced in one pass, so
	    text inserted for one nickname is never rewritten again because it
	    happens to equal another nickname.
	    """
	    mentioned = [nick for nick in aliases if nick in text]
	    if not mentioned:
	        return text
	    #longest first so that a nick which is a prefix of another one cannot
	    #win the alternation
	    mentioned.sort(key=len, reverse=True)
	    #lookarounds do not consume the separators, so back-to-back mentions
	    #such as "bob bob" are both replaced
	    nickPatt = re.compile("(?<![a-zA-Z])(?:%s)(?![a-zA-Z])" %
	                          "|".join(re.escape(nick) for nick in mentioned))
	    #a function replacement keeps backslashes and digits in an alias literal
	    return nickPatt.sub(lambda found: aliases[found.group(0)], text)

	with chat:
	    for lineNumber, line in enumerate(chat, 1):
	        match = basicRE.match(line)
	        if not match:
	            #explicit error: an assert disappears under python -O
	            raise ValueError("%s:%d: unrecognized chat line %r" %
	                             (chatFile, lineNumber, line))
	        (time, full_name, ignore, comment_name, action_name, rest) = match.groups()
	        isComment = comment_name is not None
	        #key on the bare nick for both kinds of line, so that what somebody
	        #says and what somebody does end up under the same alias
	        name = comment_name if isComment else action_name

	        trueTime = timeToSecs(time, trueTime)

	        try:
	            alias = aliases[name]
	        except KeyError:
	            alias = names.pop()
	            aliases[name] = alias

	        #rename actions are handled below, where the new nick is registered
	        #before it gets replaced
	        isRename = (not isComment) and "is now known as" in rest
	        if not isRename:
	            #obnoxiously, people can readopt others' nicknames
	            #and then you get cross-aliasing
	            rest = anonymizeMentions(rest, aliases)

	        if isComment:
	            print("%s %s : %s" % (trueTime, alias, rest))
	        else:
	            if rest.endswith("has joined #tikiwiki"):
	                rest = " entered the room."
	            elif isRename:
	                renamed = renameRE.search(rest)
	                if not renamed:
	                    raise ValueError("%s:%d: cannot find the new nickname in %r" %
	                                     (chatFile, lineNumber, line))
	                newName = renamed.group(1)
	                #the new nick shares the old nick's alias from here on
	                aliases[newName] = alias
	                #splice the alias over the matched nick only; str.replace
	                #would also rewrite any other text containing the new nick
	                rest = rest[:renamed.start(1)] + alias + rest[renamed.end(1):]
	            print("%s %s * %s" % (trueTime, alias, rest))
No results found