Created
February 27, 2014 23:42
-
-
Save tbelaire/9262124 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import collections | |
import itertools | |
def munge_timestamp(timestamp):
    """Return the timestamp value to store with a caption.

    This is currently a pass-through hook: whatever is given (including
    ``None``) is handed back unchanged.
    """
    munged = timestamp
    return munged
def parse_srt(srt_string):
    """Return json version for storing in datastore.

    Parses the text of an SRT subtitle file into a list of caption dicts,
    one per entry, each with the keys ``'start_time'``, ``'end_time'`` and
    ``'text'`` (body lines joined with newlines).

    An entry is a number line (ignored), a timestamp line of the form
    ``HH:MM:SS,mmm --> HH:MM:SS,mmm``, and zero or more body lines, ended
    by a blank line or by the end of the input. The start time is optional
    in the pattern, so ``'start_time'`` may be ``None``.

    Raises:
        ValueError: if an entry has no timestamp line, or its timestamp
            line does not match the expected format.
    """
    timestamp_regex = re.compile(
        r'((?P<start_time>\d\d:\d\d:\d\d,\d\d\d)?'
        r' --> (?P<end_time>\d\d:\d\d:\d\d,\d\d\d))')

    caption_json = []
    # A deque gives O(1) pops from the front; only the queue part is needed.
    line_queue = collections.deque(srt_string.splitlines())

    while line_queue:
        # Skip blank separator lines (leading, repeated or trailing) so
        # they are never mistaken for an entry's number line.
        if not line_queue[0]:
            line_queue.popleft()
            continue

        # Srt files number each entry, but we ignore that.
        line_queue.popleft()

        if not line_queue:
            raise ValueError('SRT entry is missing its timestamp line')
        time_line = line_queue.popleft()
        timestamp_match = timestamp_regex.search(time_line)
        if timestamp_match is None:
            raise ValueError(
                'Malformed SRT timestamp line: %r' % (time_line,))
        start_time = timestamp_match.group('start_time')
        end_time = timestamp_match.group('end_time')

        # Collect body lines up to the next blank line. Stopping at the end
        # of the queue as well lets the final entry omit its trailing blank
        # line instead of raising IndexError.
        body = []
        while line_queue:
            body_line = line_queue.popleft()
            if not body_line:
                break
            body.append(body_line)

        # NOTE: unlike parse_srt_f, no 'ka_is_valid' key is emitted here.
        caption_json.append({
            'start_time': munge_timestamp(start_time),
            'end_time': munge_timestamp(end_time),
            'text': '\n'.join(body),
        })

    return caption_json
def parse_srt_f(srt_string):
    """Return json version for storing in datastore.

    Splits the SRT text into blocks of consecutive non-blank lines and
    converts each block into a caption dict with the keys ``'start_time'``,
    ``'end_time'``, ``'text'`` and ``'ka_is_valid'`` (always ``True``).

    Raises:
        ValueError: if a block has fewer than two lines (no timestamp
            line), or its timestamp line does not match the expected
            ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` format.
    """
    timestamp_regex = re.compile(
        r'((?P<start_time>\d\d:\d\d:\d\d,\d\d\d)?'
        r' --> (?P<end_time>\d\d:\d\d:\d\d,\d\d\d))')

    def line_to_json(_number, timestamp, *body):
        """Convert a block of lines into one caption entry.

        Srt files number each entry, but we ignore that.

        Ex.
            line_to_json(*['1',
                           '00:00:00,000 --> 00:00:04,090',
                           "I know what you're thinking",
                           "Sal, addition doesn't seem so basic to me."])
            =>
            {
                'start_time': ...,
                'end_time': ...,
                'text': "I know what you're thinking\\n"
                        "Sal, addition doesn't seem so basic to me.",
                'ka_is_valid': True
            }
        """
        timestamp_match = timestamp_regex.search(timestamp)
        if timestamp_match is None:
            # Fail with a message naming the bad line rather than an
            # AttributeError on None.
            raise ValueError(
                'Malformed SRT timestamp line: %r' % (timestamp,))
        start_time = timestamp_match.group('start_time')
        end_time = timestamp_match.group('end_time')
        return {
            'start_time': munge_timestamp(start_time),
            'end_time': munge_timestamp(end_time),
            'text': '\n'.join(body),
            'ka_is_valid': True
        }

    # groupby groups blocks of lines separated by blank lines
    # since bool("") = False, and bool(s) is true for other strings
    blocks = itertools.groupby(srt_string.splitlines(), bool)

    captions = []
    for non_blank, block in blocks:
        if not non_blank:
            continue
        entry_lines = list(block)
        # line_to_json needs at least the number and timestamp lines;
        # check here so a short block gives a clear error, not a TypeError.
        if len(entry_lines) < 2:
            raise ValueError(
                'SRT entry is missing its timestamp line: %r'
                % (entry_lines,))
        captions.append(line_to_json(*entry_lines))
    return captions
def _test_srt_file(filename):
    """Read an SRT file, echo its text to stdout, and return its captions.

    Manual-testing helper: opens ``filename`` in text mode, prints the raw
    contents, then returns the result of ``parse_srt_f`` on that text.
    """
    with open(filename, 'r') as f:
        text = f.read()
    # The parenthesized form prints the same output on Python 2 and also
    # runs on Python 3, where the bare `print text` statement is a
    # syntax error.
    print(text)
    return parse_srt_f(text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment