Skip to content

Instantly share code, notes, and snippets.

@nrrb
Last active October 13, 2015 16:58
Show Gist options
  • Select an option

  • Save nrrb/4227708 to your computer and use it in GitHub Desktop.

Select an option

Save nrrb/4227708 to your computer and use it in GitHub Desktop.
'''
Hi Nick,
In original chat logs, for example, for session 1, we have data for PR like the below:
[12/5/2011 4:03:26 PM] *** Call to Field Specialist Phantom, duration 03:02. ***
[12/5/2011 4:08:39 PM] *** Call to Field Specialist Phantom, no answer. ***
Field Specialist Phantom is on another call. Please try to call again later. Thank you!
[12/5/2011 4:08:41 PM] *** Call to Field Specialist Phantom, no answer. ***
Field Specialist Phantom is on another call. Please try to call again later. Thank you!
[12/5/2011 4:08:43 PM] *** Call to Field Specialist Phantom, no answer. ***
Field Specialist Phantom is on another call. Please try to call again later. Thank you!
[12/5/2011 4:08:44 PM] *** Call to Field Specialist Phantom, no answer. ***
Field Specialist Phantom is on another call. Please try to call again later. Thank you!
[12/5/2011 4:08:45 PM] *** Call to Field Specialist Phantom, no answer. ***
Field Specialist Phantom is on another call. Please try to call again later. Thank you!
[12/5/2011 4:08:48 PM] *** Call to Field Specialist Phantom, no answer. ***
Field Specialist Phantom is on another call. Please try to call again later. Thank you!
[12/5/2011 4:09:14 PM] *** Call from Field Specialist Phantom, duration 00:25. ***
[12/5/2011 4:11:13 PM] *** Call from Field Specialist Phantom, duration 02:00. ***
There are "Call to..." and "Call from..." lines. Pull out the call durations from the chat log, and do the same for PS, SR, SS. Make sure the same call does not appear twice.
Then add all call durations within a phase for each session.
For phase info, you can refer to dropbox\MTS Relational Event Network Modeling\MTS Data\NU Newest Chatlogs\Deduplicated Logs By Phase.
If you have any question, please let me know.
Thanks,
Liwen
'''
import time
import sys
import csv
import re
import os
# Unused variant that would also capture the date part of a log timestamp:
#datetime_pattern = r'(\d{1,2}\/\d{1,2}\/\d{4}[ ]+\d{1,2}:\d{2}:\d{2}[ ]+\w{2})'
# Time of day followed by an AM/PM marker, e.g. '4:03:26 PM' (one capture group).
timestamp_pattern = r'(\d{1,2}:\d{2}:\d{2}[ ]+(?:AM|PM))'
# A completed-call line such as
#   [12/5/2011 4:03:26 PM] *** Call to Field Specialist Phantom, duration 03:02. ***
# Capture groups: (1) the text preceding ' ***', (2) 'to' or 'from',
# (3) the partner's name up to the comma, (4) the duration as mm:ss or h:mm:ss.
# Lines without ', duration ...' (e.g. 'no answer') do not match.
duration_pattern = r'([^*]+)[ ]+\*{3} Call (to|from) ([^,]+), duration ((?:\d{1,2}:){1,2}\d{2})\. \*{3}'
# Session number (one or two digits) and role code, as in '1-PR'.
filename_pattern = r'(\d{1,2})\-(PR|PS|SR|SS)'
def abbreviate(role_name):
    '''
    Build a short code for a role name such as 'Field Specialist Phantom'.

    The code is a team part followed by a position part. Matching is
    case-insensitive and substring based:
      position: 'special' -> 'S', 'recon' -> 'R', otherwise 'Pos'
      team:     'sting'   -> 'S', 'phant' -> 'P', otherwise 'Team'
    The first fragment that matches wins within each part.
    '''
    lowered = role_name.lower()

    position_abbreviation = 'Pos'
    for fragment, code in (('special', 'S'), ('recon', 'R')):
        if fragment in lowered:
            position_abbreviation = code
            break

    team_abbreviation = 'Team'
    for fragment, code in (('sting', 'S'), ('phant', 'P')):
        if fragment in lowered:
            team_abbreviation = code
            break

    return team_abbreviation + position_abbreviation
def timestamp_to_seconds(timestamp_string):
    '''
    Turn a 12-hour clock string like '3:45:33 PM' into the number of
    seconds elapsed since midnight of that day (56733 for the example).
    '''
    parsed = time.strptime(timestamp_string, '%I:%M:%S %p')
    minutes_into_day = 60*parsed.tm_hour + parsed.tm_min
    return 60*minutes_into_day + parsed.tm_sec
def find_durations_in_chatlog(chatlog_path):
    '''
    Collect every completed-call record found in one chat log file.

    The session number and player role are read from the path, e.g.
    '1-PR 2011-12-5.txt' gives session '1' and role 'PR'. The file name is
    tried first so that a parent directory that happens to look like
    '<digits>-<role>' cannot override it; the full path is only used as a
    fallback.

    Returns a list of dicts with the keys seconds, session_number,
    player_role, direction, partner_abbrev and duration. 'seconds' is the
    time of day of the log line in seconds since midnight; 'duration' is the
    raw mm:ss or h:mm:ss text from the log.

    Raises ValueError when neither the file name nor the path contains a
    session number and role.
    '''
    upper_path = chatlog_path.upper()
    name_match = (re.search(filename_pattern, os.path.basename(upper_path))
                  or re.search(filename_pattern, upper_path))
    if name_match is None:
        raise ValueError(
            'Cannot determine session number and player role from %r'
            % (chatlog_path,))
    session_number, player_role = name_match.groups()

    durations = []
    with open(chatlog_path, 'rb') as f:
        # Iterate the file directly instead of loading it all via readlines().
        for line in f:
            found = re.search(duration_pattern, line)
            if not found:
                continue
            prefix, direction, conversation_partner, duration = found.groups()
            # The prefix holds the bracketed date/time; keep only the time of day.
            timestamp, = re.search(timestamp_pattern, prefix).groups()
            durations.append(dict(
                seconds=timestamp_to_seconds(timestamp),
                session_number=session_number,
                player_role=player_role,
                direction=direction,
                partner_abbrev=abbreviate(conversation_partner),
                duration=duration))
    return durations
def find_durations_in_chatlogs(root_path):
    '''
    Walk root_path recursively and gather the call records of every file
    whose name ends in '.txt' (case-insensitive).

    Directories and files are visited in sorted order so that the returned
    list, and therefore the CSV built from it, is the same on every run.

    Returns the concatenated list of record dicts produced by
    find_durations_in_chatlog for each chat log.
    '''
    durations = []
    for rootdir, dirnames, filenames in os.walk(root_path):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirnames.sort()
        for filename in sorted(filenames):
            # Require a real '.txt' suffix; a file literally named 'txt'
            # has no extension and is not a chat log.
            if not filename.lower().endswith('.txt'):
                continue
            # The session number and player role come from the file name.
            durations += find_durations_in_chatlog(os.path.join(rootdir, filename))
    return durations
if __name__=="__main__":
    # Usage: extract_call_durations.py <chat log directory> <output csv>
    if len(sys.argv) < 3:
        print('Syntax:')
        print('python extract_call_durations.py /path/to/chat_files /path/to/output.csv')
        # A usage error is a failure: exit non-zero like the other scripts do.
        sys.exit(1)
    root_path = sys.argv[1]
    # Parse everything before touching the output so that a failure while
    # reading the logs never leaves a truncated CSV behind.
    rows = find_durations_in_chatlogs(root_path)
    # Binary mode is what the Python 2 csv module expects for its files.
    with open(sys.argv[2], 'wb') as f:
        dict_writer = csv.DictWriter(f, fieldnames=['seconds',
                                                    'session_number',
                                                    'player_role',
                                                    'direction',
                                                    'partner_abbrev',
                                                    'duration'])
        dict_writer.writeheader()
        dict_writer.writerows(rows)
'''
The goal is to find the total call duration between each pair of people, for
each experimental session.
This script operates on the output CSV from the extract_call_durations.py
script, which extracts call durations from the original 4-Person MTS Skype
chatlog text files. In the original files, we have lines like the following
indicating when a call has been ended:
1-PR 2011-12-5.txt:
[12/5/2011 4:03:26 PM] *** Call to Field Specialist Phantom, duration 03:02. ***
[12/5/2011 4:09:14 PM] *** Call from Field Specialist Phantom, duration 00:25. ***
The above lines would appear in the output CSV as follows:
durations.csv:
session_number,player_role,direction,partner_abbrev,duration
1,PR,to,PS,03:02
1,PR,from,PS,00:25
If the chat logs were perfect, then each call logged in A's logs between A and B
would also appear in B's logs with a matching duration.
This data is not perfect:
1. frequently the symmetric call logs show different durations for the same
call, which so far I've seen as differing by a second.
2. sometimes calls are not recorded symmetrically, i.e. one side's logging of
the call's completion is missing.
Using session 1 as an example, we can extract the following call durations:
durations.csv:
1,PR,from,PS,00:05
1,PR,from,PS,00:25
1,PR,from,PS,00:37
1,PR,from,PS,01:28
1,PR,from,PS,02:00
1,PR,from,SR,01:14
1,PR,from,SR,01:25
1,PR,from,SR,01:51
1,PR,to,PS,00:08
1,PR,to,PS,03:02
1,PS,from,PR,00:08
1,PS,from,PR,03:03
1,PS,to,PR,00:05
1,PS,to,PR,00:26
1,PS,to,PR,00:38
1,PS,to,PR,01:27
1,PS,to,PR,02:00
1,SR,from,SS,02:27
1,SR,to,PR,01:15
1,SR,to,PR,01:25
1,SR,to,PR,01:51
1,SR,to,SS,00:01
1,SS,from,SR,00:01
1,SS,to,SR,02:27
Arranging these side by side:
1 PR from PS 00:25 1 PS to PR 00:26 *
1 PR from PS 02:00 1 PS to PR 02:00
1 PR from PS 01:28 1 PS to PR 01:27 *
1 PR from PS 00:37 1 PS to PR 00:38 *
1 PR from PS 00:05 1 PS to PR 00:05
1 PR from SR 01:51 1 SR to PR 01:51
1 PR from SR 01:25 1 SR to PR 01:25
1 PR from SR 01:14 1 SR to PR 01:15 *
1 PR to PS 03:02 1 PS from PR 03:03 *
1 PR to PS 00:08 1 PS from PR 00:08
1 SR from SS 02:27 1 SS to SR 02:27
1 SR to SS 00:01 1 SS from SR 00:01
It's clear when the data is presented in this manner that the two recorded
durations for the same call may differ by a second, exemplifying data
imperfection point #1. For point #2, let's look at session 2:
durations.csv:
2,PR,from,PS,00:09
2,PR,from,PS,00:13
2,PR,from,PS,00:21
2,PR,from,PS,00:24
2,PR,from,PS,00:37
2,PR,from,SR,00:05
2,PR,to,PS,00:12
2,PR,to,PS,00:24
2,PR,to,SR,00:18
2,PR,to,SR,00:34
2,PR,to,SR,01:30
2,PR,to,SS,00:16
2,PR,to,SS,00:20
2,PS,from,PR,00:13
2,PS,from,PR,00:25
2,PS,to,PR,00:09
2,PS,to,PR,00:13
2,PS,to,PR,00:21
2,PS,to,PR,00:24
2,PS,to,PR,00:37
2,PS,to,SR,00:56
2,PS,to,SR,01:00
2,PS,to,SS,00:11
2,PS,to,SS,00:38
2,PS,to,SS,00:40
2,PS,to,SS,00:42
2,PS,to,SS,00:45
2,SR,from,PR,00:18
2,SR,from,PR,00:34
2,SR,from,PR,01:31
2,SR,from,PS,00:57
2,SR,from,PS,01:00
2,SR,to,PR,00:05
2,SR,to,SS,00:47
2,SS,from,PR,00:17
2,SS,from,PR,00:19
2,SS,from,PS,00:11
2,SS,from,PS,00:38
2,SS,from,PS,00:40
2,SS,from,PS,00:43
2,SS,from,PS,00:46
2,SS,from,SR,00:47
2,SS,to,SR,20:37
Side by side, matching calls:
2 PR from PS 00:24 2 PS to PR 00:24
2 PR from PS 00:13 2 PS to PR 00:13
2 PR from PS 00:37 2 PS to PR 00:37
2 PR from PS 00:09 2 PS to PR 00:09
2 PR from PS 00:21 2 PS to PR 00:21
2 PR from SR 00:05 2 SR to PR 00:05
2 PR to PS 00:24 2 PS from PR 00:25 *
2 PR to PS 00:12 2 PS from PR 00:13 *
2 PR to SR 01:30 2 SR from PR 01:31 *
2 PR to SR 00:34 2 SR from PR 00:34
2 PR to SR 00:18 2 SR from PR 00:18
2 PR to SS 00:20 2 SS from PR 00:19 *
2 PR to SS 00:16 2 SS from PR 00:17 *
2 PS to SR 01:00 2 SR from PS 01:00
2 PS to SR 00:56 2 SR from PS 00:57 *
2 PS to SS 00:45 2 SS from PS 00:46 *
2 PS to SS 00:40 2 SS from PS 00:40
2 PS to SS 00:38 2 SS from PS 00:38
2 PS to SS 00:42 2 SS from PS 00:43 *
2 PS to SS 00:11 2 SS from PS 00:11
2 SR to SS 00:47 2 SS from SR 00:47
2 SS to SR 20:37 ???
Here we see examples of imperfection #1, where matched calls differ in duration
sometimes by a second. We also see an example of imperfection #2, a record of a
call in SS's log from SS to SR lasting 20:37 but no matching record in SR's log
of this call.
If we arrange these durations keyed first by session, then by the person who
made the call (A), then by the person who received the call (B), and then a list
of all call durations when either a call from A appeared in B's log or when
a call to B appeared in A's log, we get the following:
'PR': { 'PS': ['00:12', '00:13', '00:24', '00:25'],
'SR': ['00:18', '00:18', '00:34', '00:34', '01:30', '01:31'],
'SS': ['00:16', '00:17', '00:19', '00:20'] },
'PS': { 'PR': ['00:09', '00:09', '00:13', '00:13', '00:21', '00:21', '00:24',
'00:24', '00:37', '00:37'],
'SR': ['00:56', '00:57', '01:00', '01:00'],
'SS': ['00:11', '00:11', '00:38', '00:38', '00:40', '00:40', '00:42',
'00:43', '00:45', '00:46'] },
'SR': { 'PR': ['00:05', '00:05'],
'SS': ['00:47', '00:47'] },
'SS': { 'SR': ['20:37'] }
To programmatically detect when there are outlier durations like from SS to SR
for 20:37, we can sort each list of durations and alternately add/subtract
successive values. If these were perfectly recorded values, there would
always be an even number of durations per list, and the total after this
alternating add/subtract would be 0. If we apply this method to the durations
for session 2, we get the following:
PR PS -2
PR SR -1
PR SS -2
PS PR 0
PS SR -1
PS SS -2
SR PR 0
SR SS 0
SS SR 1237
Further, if we normalize by the number of durations in each list (using integer
division, which rounds down), the discrepancies are as follows:
PR PS -1
PR SR -1
PR SS -1
PS PR 0
PS SR -1
PS SS -1
SR PR 0
SR SS 0
SS SR 1237
If we allow for a discrepancy of -1, 0, or 1 then we find that the durations
are largely consistent in session 2, except for SS to SR. Looking at all
sessions and listing the ones with outlier discrepancies:
2 SS SR 1237 ['20:37']
5 SR SS 1217 ['1:00:52', '29:16', '29:16']
5 SS SR 1216 ['00:05', '00:06', '1:00:51']
7 SR SS 468 ['16:50', '20:16', '26:52']
13 SS SR 3673 ['1:01:13']
26 SR SS 127 ['00:04', '01:43', '08:01']
26 SS SR -102 ['18:16', '21:40']
33 PR PS 1460 ['24:20']
33 PS PR -286 ['00:05', '24:13', '24:13', '25:05', '25:06', '28:39']
33 SR SS 404 ['00:17', '00:17', '01:14', '01:14', '15:11', '15:11', '1:00:40', '25:32', '25:32']
38 PR PS -156 ['09:30', '14:42']
38 SR SS 407 ['20:21', '20:21', '20:22']
38 SS SR 575 ['28:46', '28:46', '28:46']
46 SS PR -12 ['20:36', '20:59']
47 PR PR 1705 ['28:25']
58 SS SR 3632 ['1:00:32']
62 PR SR 4 ['00:05', '00:05', '00:14', '00:14', '00:16', '00:17', '00:44', '21:40', '21:41']
62 PS SR 44 ['00:44']
62 SR PR 10 ['00:06', '00:07', '00:52', '00:57', '00:57']
62 SR PS 52 ['00:52']
65 SR PR -30 ['00:16', '00:17', '00:31', '03:03', '04:24', '04:49']
65 SS PR -30 ['00:16', '00:17', '00:31', '03:03', '04:24', '04:49']
65 SS SS -48 ['00:46', '01:23', '03:34', '04:30', '10:01', '13:13']
68 PR SR 7 ['00:22', '00:22', '00:22']
68 PS PR 1156 ['57:49', '57:49', '57:49']
69 SS SR 332 ['15:38', '20:11', '21:11']
The sessions above will need to be looked at more closely by hand.
peculiar_sessions = [2, 5, 7, 13, 26, 33, 38, 46, 47, 58, 62, 65, 68, 69]
For the rest of the sessions, the total real duration of calls between two
people A and B is equal to half the total durations logged, since each call
is logged twice (once in A's logs and once in B's).
'''
from collections import defaultdict as dd
import csv
import sys
def is_odd(n):
    '''Return True when n leaves a remainder of 1 after division by 2.'''
    remainder = n % 2
    return remainder == 1
def duration_to_seconds(duration_string):
    '''
    Convert a call duration written as 'mm:ss' or 'h:mm:ss' into seconds.

    Examples: '03:02' -> 182, '1:00:52' -> 3652.

    Raises ValueError when the string does not have two or three
    colon-separated parts, or when a part is not an integer. An explicit
    exception is used instead of assert so that the check still runs when
    Python is started with -O.
    '''
    chunks = duration_string.split(':')
    if len(chunks) not in (2, 3):
        raise ValueError(
            'Expected a duration like mm:ss or h:mm:ss, got %r'
            % (duration_string,))
    # Fold from the most significant unit down: each step promotes the
    # running total by a factor of 60 before adding the next part.
    seconds = 0
    for chunk in chunks:
        seconds = 60*seconds + int(chunk)
    return seconds
if __name__ == "__main__":
    # Usage: 2-process_durations.py <durations csv> <total durations csv>
    if len(sys.argv) < 3:
        print('Syntax:')
        print('python 2-process_durations.py durations.csv total_durations.csv')
        sys.exit(1)
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]

    # session -> p1 -> p2 -> list of logged call durations in seconds.
    # p1/p2 are the two participants in alphabetical order, so both
    # directions of a call and both sides' logs land in the same list.
    durations_by_session = dd(lambda: dd(lambda: dd(list)))
    with open(input_filename, 'rb') as f:
        for row in csv.DictReader(f):
            p1, p2 = sorted([row['player_role'], row['partner_abbrev']])
            durations_by_session[row['session_number']][p1][p2].append(
                duration_to_seconds(row['duration']))

    # Find which sessions have peculiar durations logged that can't be
    # handled programmatically, so we can skip them when writing totals.
    fishy_sessions = set()
    for session, by_p1 in durations_by_session.items():
        for p1, by_p2 in by_p1.items():
            for p2, durations in by_p2.items():
                # Sort once per list (not once per CSV row) so that the two
                # logged copies of each call sit next to each other.
                durations.sort()
                # Alternating sum of the sorted values, normalised by the
                # list length; floor division is spelled out explicitly.
                discrepancy = abs(sum(durations[::2]) - sum(durations[1::2])) // len(durations)
                if is_odd(len(durations)) or discrepancy > 1:
                    print('Something is fishy with session %s' % session)
                    fishy_sessions.add(session)

    # Session ids are strings from the CSV; ordering by (length, text) puts
    # plain digit strings in numeric order without risking a failed int().
    clean_sessions = sorted(
        (session for session in durations_by_session
         if session not in fishy_sessions),
        key=lambda session: (len(session), session))

    with open(output_filename, 'wb') as f:
        dw = csv.DictWriter(f, fieldnames=['session', 'p1', 'p2', 'total_duration'])
        dw.writeheader()
        for session in clean_sessions:
            for p1 in sorted(durations_by_session[session]):
                for p2 in sorted(durations_by_session[session][p1]):
                    durations = durations_by_session[session][p1][p2]
                    # divide by 2 because these durations are recorded twice,
                    # once in p1's log and once in p2's log
                    total_duration = sum(durations) // 2
                    dw.writerow(dict(session=session, p1=p1, p2=p2, total_duration=total_duration))
import time
import csv
import sys
import os
import re
def datestring_to_dayseconds(datestring):
    '''
    Parse a date/time string of the form
    12/5/2011-5:47 PM and return the number of
    seconds between midnight and that time of
    day.
    '''
    parsed = time.strptime(datestring, '%m/%d/%Y-%I:%M %p')
    minutes_into_day = 60*parsed.tm_hour + parsed.tm_min
    return 60*minutes_into_day + parsed.tm_sec
# Each entry pairs an output column name with a regex for one kind of log
# line; the single capture group is the date/time text at the end of the line.
phase_markers = [
    # TRAIN|Start|12/5/2011-2:13 PM
    dict(name='phase1_start', pattern=r'TRAIN\|Start\|(.*)'),
    # TRAIN|End|12/5/2011-3:39 PM
    dict(name='phase1_end', pattern=r'TRAIN\|End\|(.*)'),
    # MISSN|1|Start:00:00-12/5/2011-4:01 PM
    dict(name='phase2_start', pattern=r'MISSN\|1\|Start\:\d{2}\:\d{2}\-(.*)'),
    # MISSN|1|End:22:50-12/5/2011-4:23 PM
    dict(name='phase2_end', pattern=r'MISSN\|1\|End\:\d{2}\:\d{2}\-(.*)'),
    # MISSN|2|Start:00:00-12/5/2011-4:54 PM
    dict(name='phase3_start', pattern=r'MISSN\|2\|Start\:\d{2}\:\d{2}\-(.*)'),
    # MISSN|2|End:47:25-12/5/2011-5:47 PM
    dict(name='phase3_end', pattern=r'MISSN\|2\|End\:\d{2}\:\d{2}\-(.*)'),
]
if __name__=="__main__":
    # Usage: 3-phase_timeranges.py <server log directory> <phase markers csv>
    if len(sys.argv) < 3:
        print('Syntax:')
        print('python 3-phase_timeranges.py /path/to/original/server/logfiles/ phase_markers.csv')
        sys.exit(1)
    path = sys.argv[1]
    output_filename = sys.argv[2]

    # session number -> {phase marker name: seconds since midnight}
    data = dict()
    for rootdir, directories, filenames in os.walk(path):
        for filename in filenames:
            if filename.lower() != 'logserver.txt':
                continue
            log_path = os.path.join(rootdir, filename)
            # The session number is the part of the containing directory's
            # name before the first '-'. normpath/basename cope with a
            # trailing separator and with non-'/' path separators.
            session_dir = os.path.basename(os.path.normpath(rootdir))
            session = int(session_dir.split('-')[0])
            data[session] = dict()
            with open(log_path, 'rb') as f:
                lines = [line.strip() for line in f]
            for phase in phase_markers:
                # Use the first line in the log that matches this marker.
                searches = (re.search(phase['pattern'], line) for line in lines)
                first_match = next((m for m in searches if m), None)
                if first_match is None:
                    # Marker absent from this log: report the file and leave
                    # the column empty.
                    print(log_path)
                    continue
                try:
                    seconds = datestring_to_dayseconds(first_match.group(1))
                except ValueError:
                    # The date text did not have the expected format.
                    print(log_path)
                    continue
                data[session][phase['name']] = seconds

    with open(output_filename, 'wb') as f:
        dw = csv.DictWriter(f, fieldnames=['session'] + [phase['name'] for phase in phase_markers])
        dw.writeheader()
        # Sorted so the output is in session order and reproducible.
        for session in sorted(data):
            data[session]['session'] = session
            dw.writerow(data[session])
import csv
import sys
def _classify_phase(seconds, markers):
    '''
    Return '1', '2', '3' or 'none' for a time of day given one session's
    phase marker row. Phase 1 runs from phase1_start up to phase2_start,
    phase 2 from phase2_start up to phase3_start, and phase 3 from
    phase3_start through phase3_end inclusive. Marker cells are converted
    lazily, so a blank later marker only matters if it is actually needed.
    '''
    if int(markers['phase1_start']) <= seconds < int(markers['phase2_start']):
        return '1'
    if int(markers['phase2_start']) <= seconds < int(markers['phase3_start']):
        return '2'
    if int(markers['phase3_start']) <= seconds <= int(markers['phase3_end']):
        return '3'
    return 'none'


if __name__=="__main__":
    # Usage: 4-durations_with_phase.py <durations csv> <phase markers csv> <output csv>
    if len(sys.argv) < 4:
        print('Syntax:')
        print('python 4-durations_with_phase.py durations.csv phase_markers.csv durations_with_phase.csv')
        sys.exit(1)
    input_durations_filename = sys.argv[1]
    input_phase_filename = sys.argv[2]
    output_filename = sys.argv[3]

    with open(input_durations_filename, 'rb') as f:
        reader = csv.DictReader(f)
        durations = list(reader)
        duration_fields = reader.fieldnames or []
    with open(input_phase_filename, 'rb') as f:
        reader = csv.DictReader(f)
        phase_markers = list(reader)
        marker_fields = reader.fieldnames or []

    # Index the marker rows by session once instead of scanning the whole
    # list for every call. setdefault keeps the first row for a session.
    markers_by_session = {}
    for marker_row in phase_markers:
        markers_by_session.setdefault(marker_row['session'], marker_row)

    warned_sessions = set()
    for duration in durations:
        session = duration['session_number']
        markers = markers_by_session.get(session)
        seconds = int(duration['seconds'])
        phase = 'none'
        usable = markers is not None
        if usable:
            try:
                phase = _classify_phase(seconds, markers)
            except (KeyError, TypeError, ValueError):
                # A needed marker is blank or missing for this session.
                usable = False
        if not usable and session not in warned_sessions:
            warned_sessions.add(session)
            sys.stderr.write('No usable phase markers for session %s\n' % session)
        duration['phase'] = phase
        if markers is not None:
            duration.update(markers)

    # Take the columns from the CSV headers rather than from the first data
    # row, so an empty input or a row without markers still writes cleanly;
    # absent values are written as empty cells.
    fieldnames = sorted(set(duration_fields) | set(marker_fields) | set(['phase']))
    with open(output_filename, 'wb') as f:
        dw = csv.DictWriter(f, fieldnames=fieldnames)
        dw.writeheader()
        dw.writerows(durations)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment