Created
November 13, 2012 23:21
-
-
Save nrrb/4069099 to your computer and use it in GitHub Desktop.
Deduplicating Skype Chat Logs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import glob
import os
import re
import sys
from collections import defaultdict as dd
src_path = "C:\\Users\\Nick\\Dropbox\\MTS Relational Event Network Modeling\\MTS Data\\NU Newest Chatlogs"

# A chat log belongs to the session whose number is the first run of digits
# directly followed by a dot in its file name (e.g. "12.txt" -> session "12").
SESSION_PATTERN = re.compile(r'(\d+)\.')

# Column order of the deduplicated output files.
OUTPUT_FIELDS = ['session', 'sender', 'receiver', 'time', 'length', 'message']


def _open_csv(path, mode):
    """Open *path* the way the csv module needs it on the running interpreter.

    Python 2's csv module works on byte streams, so the file is opened in
    binary mode there.  Python 3's csv module works on text and asks for
    ``newline=''`` so that it controls line endings itself.

    NOTE(review): on Python 3 the file is decoded with the platform default
    encoding -- confirm that matches how the chat logs were exported.
    """
    if sys.version_info[0] < 3:
        return open(path, mode + 'b')
    return open(path, mode, newline='')


def _row_time(row):
    """Sort key: the row's 'time' column, compared as the raw string.

    NOTE(review): assumes the timestamp format orders chronologically when
    compared as text -- verify against the actual log files.
    """
    return row['time']


def read_sessions(src_dir):
    """Load every tab-delimited ``*.txt`` chat log found directly in *src_dir*.

    Files whose base name contains no digits-then-dot session number are
    skipped.  Only the base name is inspected, so digits in a directory name
    can never be mistaken for a session number.

    Returns a nested mapping that groups identical messages together so the
    duplicates are easy to find later::

        data['1']['PR']['PS']['hey you'] == [row, row, ...]

    i.e. session -> sender -> receiver -> message text -> list of csv rows.
    """
    data = dd(lambda: dd(lambda: dd(lambda: dd(list))))
    for filename in glob.glob(os.path.join(src_dir, "*.txt")):
        match = SESSION_PATTERN.search(os.path.basename(filename))
        if match is None:
            # This is not the droid we're looking for
            continue
        session = match.group(1)
        with _open_csv(filename, 'r') as f:
            for row in csv.DictReader(f, delimiter='\t'):
                data[session][row['sender']][row['receiver']][row['message']].append(row)
    return data


def keep_latest_duplicates(data):
    """Collapse repeated messages in *data*, in place.

    Rows that share session, sender, receiver and message text are treated
    as duplicates; all but the last one when ordered by 'time' are dropped.
    The sort is stable, so among rows with equal timestamps the one read
    last is the one that survives.
    """
    for senders in data.values():
        for receivers in senders.values():
            for by_message in receivers.values():
                # Re-assigning existing keys does not resize the dict, so it
                # is safe to do while iterating over it.
                for msg in by_message:
                    rows = by_message[msg]
                    if len(rows) > 1:
                        by_message[msg] = sorted(rows, key=_row_time)[-1:]


def write_sessions(data, dst_dir):
    """Write one ``<session>-deduplicated.txt`` file per session to *dst_dir*.

    Each file is tab-delimited, starts with a header row of OUTPUT_FIELDS and
    lists the session's remaining rows ordered by 'time'.  csv.DictWriter
    raises ValueError if a row carries a column that is not in OUTPUT_FIELDS.
    """
    for session in data:
        session_data = []
        for s in data[session]:
            for r in data[session][s]:
                for m in data[session][s][r]:
                    session_data += data[session][s][r][m]
        session_data.sort(key=_row_time)
        out_name = os.path.join(dst_dir, '%s-deduplicated.txt' % session)
        with _open_csv(out_name, 'w') as f:
            dw = csv.DictWriter(f, delimiter='\t', fieldnames=OUTPUT_FIELDS)
            dw.writeheader()
            dw.writerows(session_data)


# Read in all data from all the files, indexed by session, sender, receiver
# and message.
data = read_sessions(src_path)
# If the sender, receiver, and message are the same, discard all but the last
# message when ordered by timestamp.
keep_latest_duplicates(data)
# Collect all the messages remaining and write them next to the source logs.
# The output names have no digits-then-dot run, so a later run of this script
# will not read them back in as input.
write_sessions(data, src_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment