Skip to content

Instantly share code, notes, and snippets.

@nrrb
Created November 13, 2012 23:21
Show Gist options
  • Save nrrb/4069099 to your computer and use it in GitHub Desktop.
Save nrrb/4069099 to your computer and use it in GitHub Desktop.
Deduplicating Skype Chat Logs
import csv
import os
import glob
import re
from collections import defaultdict as dd
src_path = "C:\\Users\\Nick\\Dropbox\\MTS Relational Event Network Modeling\\MTS Data\\NU Newest Chatlogs"
# Read in all data from all the files and index it in a data structure by session,
# sender, receiver, and the message. This will aid in finding the duplicates later.
# We want to be able to use data like this:
# data['1']['PR']['PS']['hey you'] = []
data = dd(lambda: dd(lambda: dd(lambda: dd(lambda: []))))
for filename in glob.glob(os.path.join(src_path, "*.txt")):
if not re.search(r'(\d+)\.', filename):
# This is not the droid we're looking for
continue
with open(filename, 'rb') as f:
session = re.search(r'(\d+)\.', filename).groups()[0]
for row in csv.DictReader(f, delimiter='\t'):
data[session][row['sender']][row['receiver']][row['message']].append(row)
# If the sender, receiver, and message are the same, then discard all
# but the last message when ordered by timestamp
for session in data:
for sender in data[session]:
for receiver in data[session][sender]:
for msg in data[session][sender][receiver]:
messages = data[session][sender][receiver][msg]
if len(messages) > 1:
messages = sorted(messages, lambda x,y: cmp(x['time'], y['time']))
# Here we throw away all messages but the last
messages = messages[-1:]
data[session][sender][receiver][msg] = messages
# Collect all the messages remaining and write to files
for session in data:
session_data = []
for s in data[session]:
for r in data[session][s]:
for m in data[session][s][r]:
session_data += data[session][s][r][m]
session_data = sorted(session_data, lambda x,y: cmp(x['time'], y['time']))
with open(os.path.join(src_path, '%s-deduplicated.txt'%session), 'wb') as f:
dw = csv.DictWriter(f, delimiter='\t', fieldnames=['session', 'sender','receiver','time','length','message'])
dw.writeheader()
dw.writerows(session_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment