Skip to content

Instantly share code, notes, and snippets.

@tbelaire
Created February 27, 2014 23:42
Show Gist options
  • Save tbelaire/9262124 to your computer and use it in GitHub Desktop.
Save tbelaire/9262124 to your computer and use it in GitHub Desktop.
import re
import collections
import itertools
def munge_timestamp(timestamp):
    """Return ``timestamp`` exactly as it was given.

    Currently an identity function: whatever value is passed in (including
    ``None``) comes straight back out.
    NOTE(review): presumably a hook for converting SRT stamps into another
    representation later -- confirm intent with the author.
    """
    return timestamp
def parse_srt(srt_string):
    """Parse SRT subtitle text into a list of caption dicts.

    Each entry is read as an index line, a timing line containing
    ``start --> end``, and then text lines up to the next blank line or the
    end of the input.  The index line is ignored.  Blank lines before an
    entry (leading blank lines, or several blank lines between entries) are
    skipped, and the final entry does not need a trailing blank line.

    Args:
        srt_string: The full contents of an SRT file as a single string.

    Returns:
        A list with one dict per entry, holding the keys 'start_time' and
        'end_time' (each passed through munge_timestamp) and 'text' (the
        caption lines joined with newlines).  'start_time' may be None,
        because the pattern treats the start stamp as optional.

    Raises:
        ValueError: If an entry ends before its timing line, or the timing
            line does not contain ' --> HH:MM:SS,mmm'.
    """
    timestamp_regex = re.compile(
        r'((?P<start_time>\d\d:\d\d:\d\d,\d\d\d)?'
        r' --> (?P<end_time>\d\d:\d\d:\d\d,\d\d\d))')
    caption_json = []
    # A deque gives O(1) removal from the front while consuming lines.
    line_queue = collections.deque(srt_string.splitlines())
    while line_queue:
        # Srt files number each entry, but we ignore that.
        num_line = line_queue.popleft()
        if not num_line:
            # Stray blank line before an entry: skip it instead of
            # mistaking it for the index line and losing sync.
            continue
        if not line_queue:
            raise ValueError(
                'SRT entry %r has no timestamp line' % (num_line,))
        time_line = line_queue.popleft()
        timestamp_match = timestamp_regex.search(time_line)
        if timestamp_match is None:
            raise ValueError(
                'Malformed SRT timestamp line: %r' % (time_line,))
        start_time = timestamp_match.group('start_time')
        end_time = timestamp_match.group('end_time')
        body = []
        # Stop at the blank separator *or* at end of input, so a file whose
        # last caption lacks a trailing blank line still parses.
        while line_queue:
            body_line = line_queue.popleft()
            if not body_line:
                break
            body.append(body_line)
        caption_json.append({
            'start_time': munge_timestamp(start_time),
            'end_time': munge_timestamp(end_time),
            'text': '\n'.join(body),
        })
    return caption_json
def parse_srt_f(srt_string):
    """Parse SRT subtitle text into caption dicts using itertools.groupby.

    The input is split into lines and grouped into runs of blank and
    non-blank lines; every non-blank run is treated as one entry made of an
    index line, a timing line and any number of text lines.

    Returns a list of dicts with the keys 'start_time', 'end_time', 'text'
    and 'ka_is_valid' (always True).
    """
    stamp_pattern = re.compile(
        r'((?P<start_time>\d\d:\d\d:\d\d,\d\d\d)?'
        r' --> (?P<end_time>\d\d:\d\d:\d\d,\d\d\d))')

    def entry_to_caption(_index, timing, *text_lines):
        """Build one caption dict from the lines of a single SRT entry.

        The leading index line is accepted but ignored.  For example,
        called with the four lines

            1
            00:00:00,000 --> 00:00:04,090
            I know what you're thinking
            Sal, addition doesn't seem so basic to me.

        it returns a dict whose 'text' is the last two lines joined by a
        newline, whose 'start_time' / 'end_time' come from the timing line
        (via munge_timestamp), and whose 'ka_is_valid' is True.
        """
        found = stamp_pattern.search(timing)
        begins = found.group('start_time')
        ends = found.group('end_time')
        caption = {
            'start_time': munge_timestamp(begins),
            'end_time': munge_timestamp(ends),
            'text': '\n'.join(text_lines),
            'ka_is_valid': True,
        }
        return caption

    # bool('') is False and bool(s) is True for any other string, so keying
    # groupby on bool alternates between runs of blank lines and runs of
    # entry lines; only the latter become captions.
    captions = []
    all_lines = srt_string.splitlines()
    for has_content, run in itertools.groupby(all_lines, bool):
        if has_content:
            captions.append(entry_to_caption(*run))
    return captions
def _test_srt_file(filename):
    """Read an SRT file, echo its contents, and return the parsed captions.

    Args:
        filename: Path of the SRT file to open in text mode.

    Returns:
        The list produced by parse_srt_f for the file's contents.
    """
    with open(filename, 'r') as f:
        text = f.read()
    # Parenthesised single-argument form: prints the same thing under the
    # Python 2 print statement and is also valid as a Python 3 call.
    print(text)
    return parse_srt_f(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment