nrrb · October 13, 2015 16:58
diff --git a/1-extract_call_durations.py b/1-extract_call_durations.py
 '''
 Hi Nick,

 In original chat logs, for example, for session 1, we have data for PR like the below:

 [12/5/2011 4:03:26 PM] *** Call to Field Specialist Phantom, duration 03:02. ***
 [12/5/2011 4:08:39 PM] *** Call to Field Specialist Phantom, no answer. ***
 Field Specialist Phantom is on another call. Please try to call again later. Thank you!
 [12/5/2011 4:08:41 PM] *** Call to Field Specialist Phantom, no answer. ***
 Field Specialist Phantom is on another call. Please try to call again later. Thank you!
 [12/5/2011 4:08:43 PM] *** Call to Field Specialist Phantom, no answer. ***
 Field Specialist Phantom is on another call. Please try to call again later. Thank you!
 [12/5/2011 4:08:44 PM] *** Call to Field Specialist Phantom, no answer. ***
 Field Specialist Phantom is on another call. Please try to call again later. Thank you!
 [12/5/2011 4:08:45 PM] *** Call to Field Specialist Phantom, no answer. ***
 Field Specialist Phantom is on another call. Please try to call again later. Thank you!
 [12/5/2011 4:08:48 PM] *** Call to Field Specialist Phantom, no answer. ***
 Field Specialist Phantom is on another call. Please try to call again later. Thank you!
 [12/5/2011 4:09:14 PM] *** Call from Field Specialist Phantom, duration 00:25. ***
 [12/5/2011 4:11:13 PM] *** Call from Field Specialist Phantom, duration 02:00. ***

 There are "Call to..." and "Call from...". Pull out the call durations from the chat log, and do the same for PS, SR,SS. Make sure the same call do not appear twice.
 Then add all call durations within a phase for each session. 
 For phase info, you can refer to dropbox\MTS Relational Event Network Modeling\MTS Data\NU Newest Chatlogs\Deduplicated Logs By Phase.

 If you have any question, please let me know.

 Thanks,
 Liwen
 '''

 import time
 import sys
 import csv
 import re
 import os

 # Regular expressions used to pull call records out of the chat logs.
 #
 # Unused alternative that would capture the full date and time:
 #datetime_pattern = r'(\d{1,2}\/\d{1,2}\/\d{4}[ ]+\d{1,2}:\d{2}:\d{2}[ ]+\w{2})'
 # Captures a 12-hour clock time such as '4:03:26 PM'.
 timestamp_pattern = r'(\d{1,2}:\d{2}:\d{2}[ ]+(?:AM|PM))'
 # Matches a completed-call line such as
 #   [12/5/2011 4:03:26 PM] *** Call to Field Specialist Phantom, duration 03:02. ***
 # and captures, in order: the text before the asterisks, 'to' or 'from',
 # the partner's name, and the duration (two or three colon-separated fields).
 duration_pattern = r'([^*]+)[ ]+\*{3} Call (to|from) ([^,]+), duration ((?:\d{1,2}:){1,2}\d{2})\. \*{3}'
 # Captures the session number and the two-letter role code from text such
 # as '1-PR'.
 filename_pattern = r'(\d{1,2})\-(PR|PS|SR|SS)'

 def abbreviate(role_name):
 	'''
 	Collapse a full role name into a team letter followed by a position letter.

 	The team is 'S' when the name contains 'sting' and 'P' when it contains
 	'phant'; the position is 'S' for 'special' and 'R' for 'recon'. Matching
 	ignores case. A part that cannot be recognized falls back to the
 	placeholder 'Team' or 'Pos', so 'Field Specialist Phantom' gives 'PS'
 	and an unknown name gives 'TeamPos'.
 	'''
 	lowered = role_name.lower()

 	team = 'Team'
 	for fragment, letter in (('sting', 'S'), ('phant', 'P')):
 		if fragment in lowered:
 			team = letter
 			break

 	position = 'Pos'
 	for fragment, letter in (('special', 'S'), ('recon', 'R')):
 		if fragment in lowered:
 			position = letter
 			break

 	return team + position

 def timestamp_to_seconds(timestamp_string):
 	'''
 	Return how many seconds into the day a 12-hour clock time falls.

 	For example, '3:45:33 PM' becomes 56733.
 	'''
 	parsed = time.strptime(timestamp_string, '%I:%M:%S %p')
 	return (parsed.tm_hour*60 + parsed.tm_min)*60 + parsed.tm_sec


 def find_durations_in_chatlog(chatlog_path):
 	'''
 	Collect every completed-call record from a single chat log file.

 	The session number and the logging player's role are read from the file
 	name (e.g. '1-PR 2011-12-5.txt' gives session '1' and role 'PR'). Every
 	line matching duration_pattern yields one dict with the keys seconds,
 	session_number, player_role, direction, partner_abbrev and duration.

 	Raises ValueError when the file name carries no session/role tag.
 	'''
 	# Match against the file name only: a directory such as '12-projects'
 	# earlier in the path must not be mistaken for the session/role tag.
 	name_match = re.search(filename_pattern, os.path.basename(chatlog_path).upper())
 	if name_match is None:
 		raise ValueError('Cannot find a session number and role in file name: %r' % chatlog_path)
 	session_number, player_role = name_match.groups()

 	durations = []
 	with open(chatlog_path, 'rb') as f:
 		# Iterate lazily instead of loading the whole log with readlines()
 		for line in f:
 			search = re.search(duration_pattern, line)
 			if not search:
 				continue
 			prefix, direction, conversation_partner, duration = search.groups()
 			# The text before the asterisks is the bracketed date and time
 			timestamp, = re.search(timestamp_pattern, prefix).groups()
 			durations.append(dict(	seconds = timestamp_to_seconds(timestamp),
 									session_number = session_number,
 									player_role = player_role,
 									direction = direction,
 									partner_abbrev = abbreviate(conversation_partner),
 									duration = duration))
 	return durations

 def find_durations_in_chatlogs(root_path):
 	'''
 	Walk root_path recursively and gather the call records of every file
 	whose extension is 'txt' (compared without regard to case).

 	Returns one flat list of the dicts produced by find_durations_in_chatlog.
 	'''
 	collected = []
 	for folder, _subfolders, names in os.walk(root_path):
 		chatlog_names = [name for name in names if name.rsplit('.', 1)[-1].lower() == 'txt']
 		for name in chatlog_names:
 			# The session number and player role are encoded in the file name
 			collected.extend(find_durations_in_chatlog(os.path.join(folder, name)))
 	return collected

 if __name__=="__main__":
 	# Usage: python 1-extract_call_durations.py <chat log directory> <output csv>
 	if len(sys.argv) < 3:
 		print('Syntax:')
 		print('python 1-extract_call_durations.py /path/to/chat_files /path/to/output.csv')
 		# Missing arguments are an error, so report failure to the caller
 		# (the other scripts in this series also exit with status 1)
 		sys.exit(1)

 	root_path = sys.argv[1]
 	# Scan the logs before opening the output, so that a failure while
 	# parsing does not leave a truncated CSV behind.
 	durations = find_durations_in_chatlogs(root_path)
 	# Binary mode is what the Python 2 csv module expects
 	with open(sys.argv[2], 'wb') as f:
 		dict_writer = csv.DictWriter(f, fieldnames=['seconds',
 													'session_number',
 													'player_role',
 													'direction',
 													'partner_abbrev',
 													'duration'])
 		dict_writer.writeheader()
 		dict_writer.writerows(durations)

 	
diff --git a/2-process_durations.py b/2-process_durations.py
 '''
 The goal is to find the total call duration between each pair of people, for 
 each experimental session. 

 This script operates on the output CSV from the extract_call_durations.py 
 script, which extracts call durations from the original 4-Person MTS Skype 
 chatlog text files. In the original files, we have lines like the following 
 indicating when a call has been ended:

 1-PR 2011-12-5.txt:
 	[12/5/2011 4:03:26 PM] *** Call to Field Specialist Phantom, duration 03:02. ***
 	[12/5/2011 4:09:14 PM] *** Call from Field Specialist Phantom, duration 00:25. ***

 The above lines would appear in the output CSV as follows (the leading
 "seconds" column is omitted from the listings in this description):

 durations.csv:
 	session_number,player_role,direction,partner_abbrev,duration
 	1,PR,to,PS,03:02
 	1,PR,from,PS,00:25

 If the chat logs were perfect, then each call logged in A's logs between A and B
 would also appear in B's logs with a matching duration. 

 This data is not perfect:
 1.	frequently the symmetric call logs show different durations for the same 
 	call, which so far I've seen as differing by a second.
 2. 	sometimes calls are not recorded symmetrically, i.e. one side's logging of 
 	the call's completion is missing.

 Using session 1 as an example, we can extract the following call durations:

 durations.csv:
 	1,PR,from,PS,00:05
 	1,PR,from,PS,00:25
 	1,PR,from,PS,00:37
 	1,PR,from,PS,01:28
 	1,PR,from,PS,02:00
 	1,PR,from,SR,01:14
 	1,PR,from,SR,01:25
 	1,PR,from,SR,01:51
 	1,PR,to,PS,00:08
 	1,PR,to,PS,03:02
 	1,PS,from,PR,00:08
 	1,PS,from,PR,03:03
 	1,PS,to,PR,00:05
 	1,PS,to,PR,00:26
 	1,PS,to,PR,00:38
 	1,PS,to,PR,01:27
 	1,PS,to,PR,02:00
 	1,SR,from,SS,02:27
 	1,SR,to,PR,01:15
 	1,SR,to,PR,01:25
 	1,SR,to,PR,01:51
 	1,SR,to,SS,00:01
 	1,SS,from,SR,00:01
 	1,SS,to,SR,02:27	

 Arranging these side by side:

 	1	PR	from	PS	00:25		1	PS	to		PR	00:26	*
 	1	PR	from	PS	02:00		1	PS	to		PR	02:00
 	1	PR	from	PS	01:28		1	PS	to		PR	01:27	*
 	1	PR	from	PS	00:37		1	PS	to		PR	00:38	*
 	1	PR	from	PS	00:05		1	PS	to		PR	00:05
 	1	PR	from	SR	01:51		1	SR	to		PR	01:51
 	1	PR	from	SR	01:25		1	SR	to		PR	01:25
 	1	PR	from	SR	01:14		1	SR	to		PR	01:15 	*
 	1	PR	to		PS	03:02		1	PS	from	PR	03:03 	*
 	1	PR	to		PS	00:08		1	PS	from	PR	00:08
 	1	SR	from	SS	02:27		1	SS	to		SR	02:27
 	1	SR	to		SS	00:01		1	SS	from	SR	00:01

 It's clear when the data is presented in this manner that the two recorded 
 durations for the same call may differ by a second, exemplifying data 
 imperfection point #1. For point #2, let's look at session 2:

 durations.csv:
 	2,PR,from,PS,00:09
 	2,PR,from,PS,00:13
 	2,PR,from,PS,00:21
 	2,PR,from,PS,00:24
 	2,PR,from,PS,00:37
 	2,PR,from,SR,00:05
 	2,PR,to,PS,00:12
 	2,PR,to,PS,00:24
 	2,PR,to,SR,00:18
 	2,PR,to,SR,00:34
 	2,PR,to,SR,01:30
 	2,PR,to,SS,00:16
 	2,PR,to,SS,00:20
 	2,PS,from,PR,00:13
 	2,PS,from,PR,00:25
 	2,PS,to,PR,00:09
 	2,PS,to,PR,00:13
 	2,PS,to,PR,00:21
 	2,PS,to,PR,00:24
 	2,PS,to,PR,00:37
 	2,PS,to,SR,00:56
 	2,PS,to,SR,01:00
 	2,PS,to,SS,00:11
 	2,PS,to,SS,00:38
 	2,PS,to,SS,00:40
 	2,PS,to,SS,00:42
 	2,PS,to,SS,00:45
 	2,SR,from,PR,00:18
 	2,SR,from,PR,00:34
 	2,SR,from,PR,01:31
 	2,SR,from,PS,00:57
 	2,SR,from,PS,01:00
 	2,SR,to,PR,00:05
 	2,SR,to,SS,00:47
 	2,SS,from,PR,00:17
 	2,SS,from,PR,00:19
 	2,SS,from,PS,00:11
 	2,SS,from,PS,00:38
 	2,SS,from,PS,00:40
 	2,SS,from,PS,00:43
 	2,SS,from,PS,00:46
 	2,SS,from,SR,00:47
 	2,SS,to,SR,20:37

 Side by side, matching calls:

 	2	PR	from	PS	00:24		2	PS	to		PR	00:24
 	2	PR	from	PS	00:13		2	PS	to		PR	00:13
 	2	PR	from	PS	00:37		2	PS	to		PR	00:37
 	2	PR	from	PS	00:09		2	PS	to		PR	00:09
 	2	PR	from	PS	00:21		2	PS	to		PR	00:21
 	2	PR	from	SR	00:05		2	SR	to		PR	00:05
 	2	PR	to		PS	00:24		2	PS	from	PR	00:25 	*
 	2	PR	to		PS	00:12		2	PS	from	PR	00:13 	*
 	2	PR	to		SR	01:30		2	SR	from	PR	01:31 	*
 	2	PR	to		SR	00:34		2	SR	from	PR	00:34
 	2	PR	to		SR	00:18		2	SR	from	PR	00:18
 	2	PR	to		SS	00:20		2	SS	from	PR	00:19 	*
 	2	PR	to		SS	00:16		2	SS	from	PR	00:17 	*
 	2	PS	to		SR	01:00		2	SR	from	PS	01:00
 	2	PS	to		SR	00:56		2	SR	from	PS	00:57	*
 	2	PS	to		SS	00:45		2	SS	from	PS	00:46	*
 	2	PS	to		SS	00:40		2	SS	from	PS	00:40
 	2	PS	to		SS	00:38		2	SS	from	PS	00:38
 	2	PS	to		SS	00:42		2	SS	from	PS	00:43	*
 	2	PS	to		SS	00:11		2	SS	from	PS	00:11
 	2	SR	to		SS	00:47		2	SS	from	SR	00:47	
 	2	SS	to		SR	20:37									???

 Here we see examples of imperfection #1, where matched calls differ in duration 
 sometimes by a second. We also see an example of imperfection #2, a record of a 
 call in SS's log from SS to SR lasting 20:37 but no matching record in SR's log
 of this call. 


 If we arrange these durations keyed first by session, then by the person who
 made the call (A), then by the person who received the call (B), and then a list 
 of all call durations when either a call from A appeared in B's log or when
 a call to B appeared in A's log, we get the following:

 'PR': {	'PS': ['00:12', '00:13', '00:24', '00:25'],
 		'SR': ['00:18', '00:18', '00:34', '00:34', '01:30', '01:31'],
 		'SS': ['00:16', '00:17', '00:19', '00:20']	},
 'PS': {	'PR': ['00:09', '00:09', '00:13', '00:13', '00:21', '00:21', '00:24', 
 				'00:24', '00:37', '00:37'],
 		'SR': ['00:56', '00:57', '01:00', '01:00'],
 		'SS': ['00:11', '00:11', '00:38', '00:38', '00:40', '00:40', '00:42', 
 				'00:43', '00:45', '00:46']	},
 'SR': {	'PR': ['00:05', '00:05'],
 		'SS': ['00:47', '00:47']	},
 'SS': {	'SR': ['20:37']	}

 To programmatically detect when there are outlier durations like from SS to SR
 for 20:37, we can sort each list of durations and alternately add/subtract 
 successive values. If these were perfectly recorded values, there would 
 always be an even number of durations per list, and the total after this 
 alternating add/subtract would be 0. If we apply this method to the durations
 for session 2, we get the following:

 PR PS -2
 PR SR -1
 PR SS -2
 PS PR 0
 PS SR -1
 PS SS -2
 SR PR 0
 SR SS 0
 SS SR 1237

 Further, if we normalize for the number of pairs of durations being cancelled 
 out, the discrepancies are as follows:

 PR PS -1
 PR SR -1
 PR SS -1
 PS PR 0
 PS SR -1
 PS SS -1
 SR PR 0
 SR SS 0
 SS SR 1237

 If we allow for a discrepancy of -1, 0, or 1 then we find that the durations
 are largely consistent in session 2, except for SS to SR. Looking at all 
 sessions and listing the ones with outlier discrepancies:

 2 	SS 	SR 	1237 	['20:37']
 5 	SR 	SS 	1217 	['1:00:52', '29:16', '29:16']
 5 	SS 	SR 	1216 	['00:05', '00:06', '1:00:51']
 7 	SR 	SS 	468 	['16:50', '20:16', '26:52']
 13	SS 	SR 	3673 	['1:01:13']
 26	SR 	SS 	127 	['00:04', '01:43', '08:01']
 26	SS 	SR 	-102 	['18:16', '21:40']
 33	PR 	PS 	1460 	['24:20']
 33	PS 	PR 	-286 	['00:05', '24:13', '24:13', '25:05', '25:06', '28:39']
 33	SR 	SS 	404 	['00:17', '00:17', '01:14', '01:14', '15:11', '15:11', '1:00:40', '25:32', '25:32']
 38	PR 	PS 	-156 	['09:30', '14:42']
 38	SR 	SS 	407 	['20:21', '20:21', '20:22']
 38	SS 	SR 	575 	['28:46', '28:46', '28:46']
 46	SS 	PR 	-12 	['20:36', '20:59']
 47	PR 	PR 	1705 	['28:25']
 58	SS 	SR 	3632 	['1:00:32']
 62	PR 	SR 	4 		['00:05', '00:05', '00:14', '00:14', '00:16', '00:17', '00:44', '21:40', '21:41']
 62	PS 	SR 	44 		['00:44']
 62	SR 	PR 	10 		['00:06', '00:07', '00:52', '00:57', '00:57']
 62	SR 	PS 	52 		['00:52']
 65	SR 	PR 	-30 	['00:16', '00:17', '00:31', '03:03', '04:24', '04:49']
 65	SS 	PR 	-30 	['00:16', '00:17', '00:31', '03:03', '04:24', '04:49']
 65	SS 	SS 	-48 	['00:46', '01:23', '03:34', '04:30', '10:01', '13:13']
 68	PR 	SR 	7 		['00:22', '00:22', '00:22']
 68	PS 	PR 	1156 	['57:49', '57:49', '57:49']
 69	SS 	SR 	332 	['15:38', '20:11', '21:11']

 The sessions above will need to be looked at more closely by hand. 

 peculiar_sessions = [2, 5, 7, 13, 26, 33, 38, 46, 47, 58, 62, 65, 68, 69]

 For the rest of the sessions, the total real duration of calls between two 
 people A and B is equal to half the total durations logged, since each call
 is logged twice (once in A's logs and once in B's). 
 '''
 from collections import defaultdict as dd
 import csv
 import sys

 def is_odd(n):
 	'''Return True when n leaves a remainder of 1 on division by 2.'''
 	remainder = n % 2
 	return remainder == 1

 def duration_to_seconds(duration_string):
 	'''
 	Convert a call duration written as 'MM:SS' or 'H:MM:SS' to whole seconds.

 	For example, '03:02' gives 182 and '1:00:52' gives 3652.

 	Raises ValueError when the string does not consist of two or three
 	colon-separated integers.
 	'''
 	chunks = duration_string.split(':')
 	# Validate explicitly: an assert would be stripped under "python -O"
 	if len(chunks) not in (2, 3):
 		raise ValueError('Expected MM:SS or H:MM:SS, got %r' % (duration_string,))
 	total = 0
 	for chunk in chunks:
 		# Every field is worth sixty of the field to its right
 		total = 60*total + int(chunk)
 	return total

 if __name__ == "__main__":
 	# Usage: python 2-process_durations.py <durations.csv> <total_durations.csv>
 	if len(sys.argv) < 3:
 		print('Syntax:')
 		print('python 2-process_durations.py durations.csv total_durations.csv')
 		sys.exit(1)

 	input_filename = sys.argv[1]
 	output_filename = sys.argv[2]

 	# session -> role -> role -> every duration (in seconds) logged for that
 	# pair of players
 	durations_by_session = dd(lambda: dd(lambda: dd(list)))

 	with open(input_filename, 'rb') as f:
 		for row in csv.DictReader(f):
 			# Put the two roles in alphabetical order so that both players'
 			# records of a call, in either direction, share one list.
 			p1, p2 = sorted([row['player_role'], row['partner_abbrev']])
 			duration = duration_to_seconds(row['duration'])
 			durations_by_session[row['session_number']][p1][p2].append(duration)

 	# Find which sessions have peculiar durations logged that can't be handled
 	# programmatically, so we can skip them in the next loop
 	clean_sessions = list(durations_by_session.keys())
 	for session in durations_by_session:
 		for p1 in durations_by_session[session]:
 			for p2 in durations_by_session[session][p1]:
 				durations = durations_by_session[session][p1][p2]
 				# Sorting once here puts the two records of each call next to
 				# each other, which the alternating sum below relies on.
 				durations.sort()
 				# Alternately adding and subtracting the sorted values cancels
 				# matched records. Floor division is spelled out so the result
 				# is the same integer under any Python version.
 				# NOTE(review): the module docstring talks about normalizing
 				# by the number of pairs, while this divides by the number of
 				# records - confirm which tolerance is intended.
 				discrepancy = abs(sum(durations[::2]) - sum(durations[1::2])) // len(durations)
 				if is_odd(len(durations)) or discrepancy > 1:
 					print('Something is fishy with session %s' % session)
 					if session in clean_sessions:
 						clean_sessions.remove(session)

 	with open(output_filename, 'wb') as f:
 		dw = csv.DictWriter(f, fieldnames=['session', 'p1', 'p2', 'total_duration'])
 		dw.writeheader()
 		for session in clean_sessions:
 			for p1 in durations_by_session[session]:
 				for p2 in durations_by_session[session][p1]:
 					durations = durations_by_session[session][p1][p2]
 					# divide by 2 because these durations are recorded twice, once
 					# in p1's log and once in p2's log
 					total_duration = sum(durations) // 2
 					dw.writerow(dict(session=session, p1=p1, p2=p2, total_duration=total_duration))
diff --git a/3-phase_timeranges.py b/3-phase_timeranges.py
 import time
 import csv
 import sys
 import os
 import re

 def datestring_to_dayseconds(datestring):
   '''
   Parse a date/time string of the form 12/5/2011-5:47 PM and return
   the number of seconds that had elapsed on that day at that time.
   '''
   parsed = time.strptime(datestring, '%m/%d/%Y-%I:%M %p')
   return (parsed.tm_hour*60 + parsed.tm_min)*60 + parsed.tm_sec

 # Each entry names one phase boundary and gives the regular expression that
 # recognizes it in a server log line; group 1 captures the date/time text at
 # the end of the line. The comment above each entry shows a sample line.
 phase_markers = [
   # TRAIN|Start|12/5/2011-2:13 PM
   dict(name='phase1_start', pattern=r'TRAIN\|Start\|(.*)'),
   # TRAIN|End|12/5/2011-3:39 PM
   dict(name='phase1_end', pattern=r'TRAIN\|End\|(.*)'),
   # MISSN|1|Start:00:00-12/5/2011-4:01 PM
   dict(name='phase2_start', pattern=r'MISSN\|1\|Start\:\d{2}\:\d{2}\-(.*)'),
   # MISSN|1|End:22:50-12/5/2011-4:23 PM
   dict(name='phase2_end', pattern=r'MISSN\|1\|End\:\d{2}\:\d{2}\-(.*)'),
   # MISSN|2|Start:00:00-12/5/2011-4:54 PM
   dict(name='phase3_start', pattern=r'MISSN\|2\|Start\:\d{2}\:\d{2}\-(.*)'),
   # MISSN|2|End:47:25-12/5/2011-5:47 PM
   dict(name='phase3_end', pattern=r'MISSN\|2\|End\:\d{2}\:\d{2}\-(.*)'),
 ]

 if __name__=="__main__":
   # Usage: python 3-phase_timeranges.py <server log directory> <output csv>
   if len(sys.argv) < 3:
     print('Syntax:')
     print('python 3-phase_timeranges.py /path/to/original/server/logfiles/ phase_markers.csv')
     sys.exit(1)

   path = sys.argv[1]
   output_filename = sys.argv[2]

   # session number -> {phase marker name: seconds since midnight}
   data = dict()
   for rootdir, directories, filenames in os.walk(path):
     for filename in filenames:
       if filename.lower() != 'logserver.txt':
         continue
       log_path = os.path.join(rootdir, filename)
       # The session number is the part of the containing folder's name
       # before the first '-'. basename/normpath handle the platform's
       # path separator and a trailing separator on the command line.
       folder = os.path.basename(os.path.normpath(rootdir))
       try:
         session = int(folder.split('-')[0])
       except ValueError:
         print('Skipping %s: folder name does not start with a session number' % log_path)
         continue
       data[session] = dict()
       with open(log_path, 'rb') as f:
         lines = [line.strip() for line in f]
       for phase in phase_markers:
         # Use the first line that carries this marker
         match = None
         for line in lines:
           match = re.search(phase['pattern'], line)
           if match:
             break
         if not match:
           print('%s: no line found for %s' % (log_path, phase['name']))
           continue
         try:
           data[session][phase['name']] = datestring_to_dayseconds(match.group(1))
         except ValueError:
           # strptime rejects text that is not in the expected format
           print('%s: cannot parse the date for %s: %r' % (log_path, phase['name'], match.group(1)))

   with open(output_filename, 'wb') as f:
     dw = csv.DictWriter(f, fieldnames=['session'] + [phase['name'] for phase in phase_markers])
     dw.writeheader()
     # Sorted so that the output is the same from run to run; markers that
     # were not found are left as empty cells by DictWriter.
     for session in sorted(data):
       dw.writerow(dict(data[session], session=session))
diff --git a/4-durations_with_phase.py b/4-durations_with_phase.py
 import csv
 import sys

 def phase_for_timestamp(seconds, markers):
   '''
   Return '1', '2' or '3' for the phase whose time window contains seconds,
   or 'none' when it falls outside every window.

   markers maps phase1_start, phase2_start, phase3_start and phase3_end to
   seconds since midnight, given as ints or numeric strings. Phase 1 runs
   from phase1_start up to phase2_start, phase 2 from phase2_start up to
   phase3_start, and phase 3 from phase3_start through phase3_end
   inclusive. A window with a missing or blank boundary is skipped instead
   of raising, because the phase marker CSV has empty cells for markers
   that were not found in the server log.
   '''
   def boundary(name):
     try:
       return int(markers[name])
     except (KeyError, TypeError, ValueError):
       return None

   p1_start = boundary('phase1_start')
   p2_start = boundary('phase2_start')
   p3_start = boundary('phase3_start')
   p3_end = boundary('phase3_end')

   if None not in (p1_start, p2_start) and p1_start <= seconds < p2_start:
     return '1'
   if None not in (p2_start, p3_start) and p2_start <= seconds < p3_start:
     return '2'
   if None not in (p3_start, p3_end) and p3_start <= seconds <= p3_end:
     return '3'
   return 'none'


 if __name__=="__main__":
   # Usage: python 4-durations_with_phase.py <durations.csv> <phase_markers.csv> <output csv>
   if len(sys.argv) < 4:
     print('Syntax:')
     print('python 4-durations_with_phase.py durations.csv phase_markers.csv durations_with_phase.csv')
     sys.exit(1)

   input_durations_filename = sys.argv[1]
   input_phase_filename = sys.argv[2]
   output_filename = sys.argv[3]

   with open(input_durations_filename, 'rb') as f:
     reader = csv.DictReader(f)
     durations = list(reader)
     duration_fields = reader.fieldnames or []

   with open(input_phase_filename, 'rb') as f:
     reader = csv.DictReader(f)
     marker_rows = list(reader)
     marker_fields = reader.fieldnames or []

   # Index the markers once by session instead of scanning the list for every
   # call; if a session is listed more than once the first row is used.
   markers_by_session = dict()
   for row in marker_rows:
     markers_by_session.setdefault(row['session'], row)

   sessions_without_markers = set()
   for duration in durations:
     markers = markers_by_session.get(duration['session_number'])
     if markers is None:
       # No marker row for this session: keep the call, without a phase
       sessions_without_markers.add(duration['session_number'])
       duration['phase'] = 'none'
       continue
     duration['phase'] = phase_for_timestamp(int(duration['seconds']), markers)
     duration.update(markers)

   for session in sorted(sessions_without_markers):
     print('No phase markers found for session %s' % session)

   # Take the columns from the two input headers rather than from the first
   # row, so that empty input and rows without markers are both handled.
   fieldnames = sorted(set(duration_fields) | set(marker_fields) | set(['phase']))
   with open(output_filename, 'wb') as f:
     dw = csv.DictWriter(f, fieldnames=fieldnames)
     dw.writeheader()
     dw.writerows(durations)
	'''
	Hi Nick,

	In original chat logs, for example, for session 1, we have data for PR like the below:

	[12/5/2011 4:03:26 PM] *** Call to Field Specialist Phantom, duration 03:02. ***
	[12/5/2011 4:08:39 PM] *** Call to Field Specialist Phantom, no answer. ***
	Field Specialist Phantom is on another call. Please try to call again later. Thank you!
	[12/5/2011 4:08:41 PM] *** Call to Field Specialist Phantom, no answer. ***
	Field Specialist Phantom is on another call. Please try to call again later. Thank you!
	[12/5/2011 4:08:43 PM] *** Call to Field Specialist Phantom, no answer. ***
	Field Specialist Phantom is on another call. Please try to call again later. Thank you!
	[12/5/2011 4:08:44 PM] *** Call to Field Specialist Phantom, no answer. ***
	Field Specialist Phantom is on another call. Please try to call again later. Thank you!
	[12/5/2011 4:08:45 PM] *** Call to Field Specialist Phantom, no answer. ***
	Field Specialist Phantom is on another call. Please try to call again later. Thank you!
	[12/5/2011 4:08:48 PM] *** Call to Field Specialist Phantom, no answer. ***
	Field Specialist Phantom is on another call. Please try to call again later. Thank you!
	[12/5/2011 4:09:14 PM] *** Call from Field Specialist Phantom, duration 00:25. ***
	[12/5/2011 4:11:13 PM] *** Call from Field Specialist Phantom, duration 02:00. ***

	There are "Call to..." and "Call from...". Pull out the call durations from the chat log, and do the same for PS, SR,SS. Make sure the same call do not appear twice.
	Then add all call durations within a phase for each session.
	For phase info, you can refer to dropbox\MTS Relational Event Network Modeling\MTS Data\NU Newest Chatlogs\Deduplicated Logs By Phase.

	If you have any question, please let me know.

	Thanks,
	Liwen
	'''

	import time
	import sys
	import csv
	import re
	import os

	# Regular expressions used to pull call records out of the chat logs.
	#
	# Unused alternative that would capture the full date and time:
	#datetime_pattern = r'(\d{1,2}\/\d{1,2}\/\d{4}[ ]+\d{1,2}:\d{2}:\d{2}[ ]+\w{2})'
	# Captures a 12-hour clock time such as '4:03:26 PM'.
	timestamp_pattern = r'(\d{1,2}:\d{2}:\d{2}[ ]+(?:AM|PM))'
	# Matches a completed-call line such as
	#   [12/5/2011 4:03:26 PM] *** Call to Field Specialist Phantom, duration 03:02. ***
	# and captures, in order: the text before the asterisks, 'to' or 'from',
	# the partner's name, and the duration (two or three colon-separated fields).
	duration_pattern = r'([^*]+)[ ]+\*{3} Call (to|from) ([^,]+), duration ((?:\d{1,2}:){1,2}\d{2})\. \*{3}'
	# Captures the session number and the two-letter role code from text such
	# as '1-PR'.
	filename_pattern = r'(\d{1,2})\-(PR|PS|SR|SS)'

	def abbreviate(role_name):
		'''
		Collapse a full role name into a team letter followed by a position letter.

		The team is 'S' when the name contains 'sting' and 'P' when it contains
		'phant'; the position is 'S' for 'special' and 'R' for 'recon'. Matching
		ignores case. A part that cannot be recognized falls back to the
		placeholder 'Team' or 'Pos', so 'Field Specialist Phantom' gives 'PS'
		and an unknown name gives 'TeamPos'.
		'''
		lowered = role_name.lower()

		if 'special' in lowered:
			position_abbreviation = 'S'
		elif 'recon' in lowered:
			position_abbreviation = 'R'
		else:
			position_abbreviation = 'Pos'

		if 'sting' in lowered:
			team_abbreviation = 'S'
		elif 'phant' in lowered:
			team_abbreviation = 'P'
		else:
			team_abbreviation = 'Team'

		return team_abbreviation + position_abbreviation

	def timestamp_to_seconds(timestamp_string):
		'''
		Return how many seconds into the day a 12-hour clock time falls.

		For example, '3:45:33 PM' becomes 56733.
		'''
		st = time.strptime(timestamp_string, '%I:%M:%S %p')
		seconds = st.tm_sec + 60*st.tm_min + 60*60*st.tm_hour
		return seconds


	def find_durations_in_chatlog(chatlog_path):
		'''
		Collect every completed-call record from a single chat log file.

		The session number and the logging player's role are read from the file
		name (e.g. '1-PR 2011-12-5.txt' gives session '1' and role 'PR'). Every
		line matching duration_pattern yields one dict with the keys seconds,
		session_number, player_role, direction, partner_abbrev and duration.

		Raises ValueError when the file name carries no session/role tag.
		'''
		# Match against the file name only: a directory such as '12-projects'
		# earlier in the path must not be mistaken for the session/role tag.
		name_match = re.search(filename_pattern, os.path.basename(chatlog_path).upper())
		if name_match is None:
			raise ValueError('Cannot find a session number and role in file name: %r' % chatlog_path)
		session_number, player_role = name_match.groups()

		durations = []
		with open(chatlog_path, 'rb') as f:
			# Iterate lazily instead of loading the whole log with readlines()
			for line in f:
				search = re.search(duration_pattern, line)
				if not search:
					continue
				prefix, direction, conversation_partner, duration = search.groups()
				# The text before the asterisks is the bracketed date and time
				timestamp, = re.search(timestamp_pattern, prefix).groups()
				durations.append(dict(	seconds = timestamp_to_seconds(timestamp),
										session_number = session_number,
										player_role = player_role,
										direction = direction,
										partner_abbrev = abbreviate(conversation_partner),
										duration = duration))
		return durations

	def find_durations_in_chatlogs(root_path):
		'''
		Walk root_path recursively and gather the call records of every file
		whose extension is 'txt' (compared without regard to case).

		Returns one flat list of the dicts produced by find_durations_in_chatlog.
		'''
		durations = []
		for rootdir, dirnames, filenames in os.walk(root_path):
			for filename in filenames:
				if filename.split('.')[-1].lower() == 'txt':
					# We can get the session number and role of the player from the filename
					durations += find_durations_in_chatlog(os.path.join(rootdir, filename))
		return durations

	if __name__=="__main__":
		# Usage: python 1-extract_call_durations.py <chat log directory> <output csv>
		if len(sys.argv) < 3:
			print('Syntax:')
			print('python 1-extract_call_durations.py /path/to/chat_files /path/to/output.csv')
			# Missing arguments are an error, so report failure to the caller
			sys.exit(1)

		root_path = sys.argv[1]
		# Scan the logs before opening the output, so that a failure while
		# parsing does not leave a truncated CSV behind.
		durations = find_durations_in_chatlogs(root_path)
		# Binary mode is what the Python 2 csv module expects
		with open(sys.argv[2], 'wb') as f:
			dict_writer = csv.DictWriter(f, fieldnames=['seconds',
														'session_number',
														'player_role',
														'direction',
														'partner_abbrev',
														'duration'])
			dict_writer.writeheader()
			dict_writer.writerows(durations)
	'''
	The goal is to find the total call duration between each pair of people, for
	each experimental session.

	This script operates on the output CSV from the extract_call_durations.py
	script, which extracts call durations from the original 4-Person MTS Skype
	chatlog text files. In the original files, we have lines like the following
	indicating when a call has been ended:

	1-PR 2011-12-5.txt:
	[12/5/2011 4:03:26 PM] *** Call to Field Specialist Phantom, duration 03:02. ***
	[12/5/2011 4:09:14 PM] *** Call from Field Specialist Phantom, duration 00:25. ***

	The above lines would appear in the output CSV as follows (the leading
	"seconds" column is omitted from the listings in this description):

	durations.csv:
	session_number,player_role,direction,partner_abbrev,duration
	1,PR,to,PS,03:02
	1,PR,from,PS,00:25

	If the chat logs were perfect, then each call logged in A's logs between A and B
	would also appear in B's logs with a matching duration.

	This data is not perfect:
	1. frequently the symmetric call logs show different durations for the same
	call, which so far I've seen as differing by a second.
	2. sometimes calls are not recorded symmetrically, i.e. one side's logging of
	the call's completion is missing.

	Using session 1 as an example, we can extract the following call durations:

	durations.csv:
	1,PR,from,PS,00:05
	1,PR,from,PS,00:25
	1,PR,from,PS,00:37
	1,PR,from,PS,01:28
	1,PR,from,PS,02:00
	1,PR,from,SR,01:14
	1,PR,from,SR,01:25
	1,PR,from,SR,01:51
	1,PR,to,PS,00:08
	1,PR,to,PS,03:02
	1,PS,from,PR,00:08
	1,PS,from,PR,03:03
	1,PS,to,PR,00:05
	1,PS,to,PR,00:26
	1,PS,to,PR,00:38
	1,PS,to,PR,01:27
	1,PS,to,PR,02:00
	1,SR,from,SS,02:27
	1,SR,to,PR,01:15
	1,SR,to,PR,01:25
	1,SR,to,PR,01:51
	1,SR,to,SS,00:01
	1,SS,from,SR,00:01
	1,SS,to,SR,02:27

	Arranging these side by side:

	1 PR from PS 00:25 1 PS to PR 00:26 *
	1 PR from PS 02:00 1 PS to PR 02:00
	1 PR from PS 01:28 1 PS to PR 01:27 *
	1 PR from PS 00:37 1 PS to PR 00:38 *
	1 PR from PS 00:05 1 PS to PR 00:05
	1 PR from SR 01:51 1 SR to PR 01:51
	1 PR from SR 01:25 1 SR to PR 01:25
	1 PR from SR 01:14 1 SR to PR 01:15 *
	1 PR to PS 03:02 1 PS from PR 03:03 *
	1 PR to PS 00:08 1 PS from PR 00:08
	1 SR from SS 02:27 1 SS to SR 02:27
	1 SR to SS 00:01 1 SS from SR 00:01

	It's clear when the data is presented in this manner that the two recorded
	durations for the same call may differ by a second, exemplifying data
	imperfection point #1. For point #2, let's look at session 2:

	durations.csv:
	2,PR,from,PS,00:09
	2,PR,from,PS,00:13
	2,PR,from,PS,00:21
	2,PR,from,PS,00:24
	2,PR,from,PS,00:37
	2,PR,from,SR,00:05
	2,PR,to,PS,00:12
	2,PR,to,PS,00:24
	2,PR,to,SR,00:18
	2,PR,to,SR,00:34
	2,PR,to,SR,01:30
	2,PR,to,SS,00:16
	2,PR,to,SS,00:20
	2,PS,from,PR,00:13
	2,PS,from,PR,00:25
	2,PS,to,PR,00:09
	2,PS,to,PR,00:13
	2,PS,to,PR,00:21
	2,PS,to,PR,00:24
	2,PS,to,PR,00:37
	2,PS,to,SR,00:56
	2,PS,to,SR,01:00
	2,PS,to,SS,00:11
	2,PS,to,SS,00:38
	2,PS,to,SS,00:40
	2,PS,to,SS,00:42
	2,PS,to,SS,00:45
	2,SR,from,PR,00:18
	2,SR,from,PR,00:34
	2,SR,from,PR,01:31
	2,SR,from,PS,00:57
	2,SR,from,PS,01:00
	2,SR,to,PR,00:05
	2,SR,to,SS,00:47
	2,SS,from,PR,00:17
	2,SS,from,PR,00:19
	2,SS,from,PS,00:11
	2,SS,from,PS,00:38
	2,SS,from,PS,00:40
	2,SS,from,PS,00:43
	2,SS,from,PS,00:46
	2,SS,from,SR,00:47
	2,SS,to,SR,20:37

	Side by side, matching calls:

	2 PR from PS 00:24 2 PS to PR 00:24
	2 PR from PS 00:13 2 PS to PR 00:13
	2 PR from PS 00:37 2 PS to PR 00:37
	2 PR from PS 00:09 2 PS to PR 00:09
	2 PR from PS 00:21 2 PS to PR 00:21
	2 PR from SR 00:05 2 SR to PR 00:05
	2 PR to PS 00:24 2 PS from PR 00:25 *
	2 PR to PS 00:12 2 PS from PR 00:13 *
	2 PR to SR 01:30 2 SR from PR 01:31 *
	2 PR to SR 00:34 2 SR from PR 00:34
	2 PR to SR 00:18 2 SR from PR 00:18
	2 PR to SS 00:20 2 SS from PR 00:19 *
	2 PR to SS 00:16 2 SS from PR 00:17 *
	2 PS to SR 01:00 2 SR from PS 01:00
	2 PS to SR 00:56 2 SR from PS 00:57 *
	2 PS to SS 00:45 2 SS from PS 00:46 *
	2 PS to SS 00:40 2 SS from PS 00:40
	2 PS to SS 00:38 2 SS from PS 00:38
	2 PS to SS 00:42 2 SS from PS 00:43 *
	2 PS to SS 00:11 2 SS from PS 00:11
	2 SR to SS 00:47 2 SS from SR 00:47
	2 SS to SR 20:37 ???

	Here we see examples of imperfection #1, where matched calls differ in duration
	sometimes by a second. We also see an example of imperfection #2, a record of a
	call in SS's log from SS to SR lasting 20:37 but no matching record in SR's log
	of this call.


	If we arrange these durations keyed first by session, then by the person who
	made the call (A), then by the person who received the call (B), and then a list
	of all call durations when either a call from A appeared in B's log or when
	a call to B appeared in A's log, we get the following:

	'PR': { 'PS': ['00:12', '00:13', '00:24', '00:25'],
	'SR': ['00:18', '00:18', '00:34', '00:34', '01:30', '01:31'],
	'SS': ['00:16', '00:17', '00:19', '00:20'] },
	'PS': { 'PR': ['00:09', '00:09', '00:13', '00:13', '00:21', '00:21', '00:24',
	'00:24', '00:37', '00:37'],
	'SR': ['00:56', '00:57', '01:00', '01:00'],
	'SS': ['00:11', '00:11', '00:38', '00:38', '00:40', '00:40', '00:42',
	'00:43', '00:45', '00:46'] },
	'SR': { 'PR': ['00:05', '00:05'],
	'SS': ['00:47', '00:47'] },
	'SS': { 'SR': ['20:37'] }

	To programmatically detect when there are outlier durations like from SS to SR
	for 20:37, we can sort each list of durations and alternately add/subtract
	successive values. If these were perfectly recorded values, there would
	always be an even number of durations per list, and the total after this
	alternating add/subtract would be 0. If we apply this method to the durations
	for session 2, we get the following:

	PR PS -2
	PR SR -1
	PR SS -2
	PS PR 0
	PS SR -1
	PS SS -2
	SR PR 0
	SR SS 0
	SS SR 1237

	Further, if we normalize for the number of pairs of durations being cancelled
	out, the discrepancies are as follows:

	PR PS -1
	PR SR -1
	PR SS -1
	PS PR 0
	PS SR -1
	PS SS -1
	SR PR 0
	SR SS 0
	SS SR 1237

	If we allow for a discrepancy of -1, 0, or 1 then we find that the durations
	are largely consistent in session 2, except for SS to SR. Looking at all
	sessions and listing the ones with outlier discrepancies:

	2 SS SR 1237 ['20:37']
	5 SR SS 1217 ['1:00:52', '29:16', '29:16']
	5 SS SR 1216 ['00:05', '00:06', '1:00:51']
	7 SR SS 468 ['16:50', '20:16', '26:52']
	13 SS SR 3673 ['1:01:13']
	26 SR SS 127 ['00:04', '01:43', '08:01']
	26 SS SR -102 ['18:16', '21:40']
	33 PR PS 1460 ['24:20']
	33 PS PR -286 ['00:05', '24:13', '24:13', '25:05', '25:06', '28:39']
	33 SR SS 404 ['00:17', '00:17', '01:14', '01:14', '15:11', '15:11', '1:00:40', '25:32', '25:32']
	38 PR PS -156 ['09:30', '14:42']
	38 SR SS 407 ['20:21', '20:21', '20:22']
	38 SS SR 575 ['28:46', '28:46', '28:46']
	46 SS PR -12 ['20:36', '20:59']
	47 PR PR 1705 ['28:25']
	58 SS SR 3632 ['1:00:32']
	62 PR SR 4 ['00:05', '00:05', '00:14', '00:14', '00:16', '00:17', '00:44', '21:40', '21:41']
	62 PS SR 44 ['00:44']
	62 SR PR 10 ['00:06', '00:07', '00:52', '00:57', '00:57']
	62 SR PS 52 ['00:52']
	65 SR PR -30 ['00:16', '00:17', '00:31', '03:03', '04:24', '04:49']
	65 SS PR -30 ['00:16', '00:17', '00:31', '03:03', '04:24', '04:49']
	65 SS SS -48 ['00:46', '01:23', '03:34', '04:30', '10:01', '13:13']
	68 PR SR 7 ['00:22', '00:22', '00:22']
	68 PS PR 1156 ['57:49', '57:49', '57:49']
	69 SS SR 332 ['15:38', '20:11', '21:11']

	The sessions above will need to be looked at more closely by hand.

	peculiar_sessions = [2, 5, 7, 13, 26, 33, 38, 46, 47, 58, 62, 65, 68, 69]

	For the rest of the sessions, the total real duration of calls between two
	people A and B is equal to half the total durations logged, since each call
	is logged twice (once in A's logs and once in B's).
	'''
	from collections import defaultdict as dd
	import csv
	import sys

	def is_odd(n):
		'''Return True when n leaves a remainder of 1 on division by 2.'''
		return n % 2 == 1

	def duration_to_seconds(duration_string):
		'''
		Convert a call duration written as 'MM:SS' or 'H:MM:SS' to whole seconds.

		For example, '03:02' gives 182 and '1:00:52' gives 3652.

		Raises ValueError when the string does not consist of two or three
		colon-separated integers.
		'''
		chunks = duration_string.split(':')
		# Validate explicitly: an assert would be stripped under "python -O"
		if len(chunks) not in (2, 3):
			raise ValueError('Expected MM:SS or H:MM:SS, got %r' % (duration_string,))
		total = 0
		for chunk in chunks:
			# Every field is worth sixty of the field to its right
			total = 60*total + int(chunk)
		return total

	if __name__ == "__main__":
		# Usage: python 2-process_durations.py <durations.csv> <total_durations.csv>
		if len(sys.argv) < 3:
			print('Syntax:')
			print('python 2-process_durations.py durations.csv total_durations.csv')
			sys.exit(1)

		input_filename = sys.argv[1]
		output_filename = sys.argv[2]

		# session -> role -> role -> every duration (in seconds) logged for that
		# pair of players
		durations_by_session = dd(lambda: dd(lambda: dd(list)))

		with open(input_filename, 'rb') as f:
			for row in csv.DictReader(f):
				# Put the two roles in alphabetical order so that both players'
				# records of a call, in either direction, share one list.
				p1, p2 = sorted([row['player_role'], row['partner_abbrev']])
				duration = duration_to_seconds(row['duration'])
				durations_by_session[row['session_number']][p1][p2].append(duration)

		# Find which sessions have peculiar durations logged that can't be handled
		# programmatically, so we can skip them in the next loop
		clean_sessions = list(durations_by_session.keys())
		for session in durations_by_session:
			for p1 in durations_by_session[session]:
				for p2 in durations_by_session[session][p1]:
					durations = durations_by_session[session][p1][p2]
					# Sorting once here puts the two records of each call next to
					# each other, which the alternating sum below relies on.
					durations.sort()
					# Alternately adding and subtracting the sorted values cancels
					# matched records. Floor division is spelled out so the result
					# is the same integer under any Python version.
					# NOTE(review): the module docstring talks about normalizing
					# by the number of pairs, while this divides by the number of
					# records - confirm which tolerance is intended.
					discrepancy = abs(sum(durations[::2]) - sum(durations[1::2])) // len(durations)
					if is_odd(len(durations)) or discrepancy > 1:
						print('Something is fishy with session %s' % session)
						if session in clean_sessions:
							clean_sessions.remove(session)

		with open(output_filename, 'wb') as f:
			dw = csv.DictWriter(f, fieldnames=['session', 'p1', 'p2', 'total_duration'])
			dw.writeheader()
			for session in clean_sessions:
				for p1 in durations_by_session[session]:
					for p2 in durations_by_session[session][p1]:
						durations = durations_by_session[session][p1][p2]
						# divide by 2 because these durations are recorded twice, once
						# in p1's log and once in p2's log
						total_duration = sum(durations) // 2
						dw.writerow(dict(session=session, p1=p1, p2=p2, total_duration=total_duration))