tbelaire · February 27, 2014 23:42
diff --git a/parse_srt.py b/parse_srt.py
 import re
 import collections
 import itertools


 def munge_timestamp(timestamp):
     """Return ``timestamp`` exactly as it was given.

     The SRT parsers in this file pass every start and end time through
     this function before storing it; at present no conversion is applied.
     """
     return timestamp

 def parse_srt(srt_string):
     """Parse the text of an SRT subtitle file into a list of caption dicts.

     Each entry is read as a number line (ignored), a timestamp line of the
     form ``HH:MM:SS,mmm --> HH:MM:SS,mmm``, and then body lines up to the
     next empty line or the end of the input.  Empty lines before an entry
     (including surplus ones at the end of the file) are skipped.

     Args:
         srt_string: Full contents of the SRT file as one string.

     Returns:
         A list of dicts, one per entry in input order, with the keys
         'start_time' and 'end_time' (the matched timestamp text passed
         through munge_timestamp; the start group is optional in the
         pattern, so None is passed when it is absent) and 'text' (the
         body lines joined with newlines).

     Raises:
         AttributeError: If an entry's timestamp line does not contain
             `` --> HH:MM:SS,mmm``.
         IndexError: If the input ends right after an entry's number line.
     """
     timestamp_regex = re.compile(
         r'((?P<start_time>\d\d:\d\d:\d\d,\d\d\d)?'
         r' --> (?P<end_time>\d\d:\d\d:\d\d,\d\d\d))')
     caption_json = []

     # Only the queue behaviour of the deque is needed: lines are consumed
     # strictly from the left.
     line_queue = collections.deque(srt_string.splitlines())
     while line_queue:
         # Entries may be separated by more than one empty line, and files
         # often end with several; none of them starts a new entry.
         if not line_queue[0]:
             line_queue.popleft()
             continue

         # Srt files number each entry, but we ignore that.
         line_queue.popleft()

         time_line = line_queue.popleft()

         timestamp_match = timestamp_regex.search(time_line)
         start_time = timestamp_match.group('start_time')
         end_time = timestamp_match.group('end_time')

         body = []
         # Stop at the separating empty line *or* at end of input: the last
         # entry of a file frequently has no trailing empty line.
         while line_queue:
             body_line = line_queue.popleft()
             if not body_line:
                 break
             body.append(body_line)

         caption_json.append({
             'start_time': munge_timestamp(start_time),
             'end_time': munge_timestamp(end_time),
             'text': '\n'.join(body),
         })
     return caption_json

 def parse_srt_f(srt_string):
     """Parse SRT subtitle text into a list of caption dicts using groupby.

     The input is split into lines and cut into blocks at empty lines; each
     block of non-empty lines becomes one caption.

     Args:
         srt_string: Contents of an SRT file as a single string.

     Returns:
         A list with one dict per block, in input order.  Each dict has
         the keys 'start_time', 'end_time', 'text' and 'ka_is_valid'
         (always True).

     Raises:
         TypeError: If a block has fewer than two lines.
         AttributeError: If a block's second line does not contain
             ' --> HH:MM:SS,mmm'.
     """
     timestamp_regex = re.compile(
         r'((?P<start_time>\d\d:\d\d:\d\d,\d\d\d)?'
         r' --> (?P<end_time>\d\d:\d\d:\d\d,\d\d\d))')

     def line_to_json(_number, timestamp, *body):
         """Build one caption dict from the lines of a single block.

         The first line is the entry number, which SRT files carry but
         which is not used here.  For example, the block

             1
             00:00:00,000 --> 00:00:04,090
             I know what you're thinking
             Sal, addition doesn't seem so basic to me.

         yields a dict whose 'text' is the last two lines joined by a
         newline, whose 'ka_is_valid' is True, and whose 'start_time' and
         'end_time' are the two timestamps passed through munge_timestamp.
         """
         times = timestamp_regex.search(timestamp)
         start_time, end_time = times.group('start_time', 'end_time')
         caption = {
             'start_time': munge_timestamp(start_time),
             'end_time': munge_timestamp(end_time),
             'text': '\n'.join(body),
             'ka_is_valid': True
         }
         return caption

     captions = []
     # bool('') is False and bool(s) is True for every other string, so
     # groupby alternates between runs of empty lines and runs of content.
     grouped = itertools.groupby(srt_string.splitlines(), bool)
     for has_content, block in grouped:
         if has_content:
             captions.append(line_to_json(*block))
     return captions

 def _test_srt_file(filename):
     """Read an SRT file, echo it to stdout, and return its parsed captions.

     Args:
         filename: Path of the SRT file to read (opened in text mode).

     Returns:
         Whatever parse_srt_f returns for the file's contents.
     """
     with open(filename, 'r') as f:
         text = f.read()
     # Parenthesised single-argument form: prints the same text under the
     # Python 2 print statement and is valid as the Python 3 print function.
     print(text)
     return parse_srt_f(text)
	import re
	import collections
	import itertools


	def munge_timestamp(timestamp):
	    """Return ``timestamp`` exactly as it was given.

	    The SRT parsers in this file pass every start and end time through
	    this function before storing it; at present no conversion is applied.
	    """
	    return timestamp

	def parse_srt(srt_string):
	    """Parse the text of an SRT subtitle file into a list of caption dicts.

	    Each entry is read as a number line (ignored), a timestamp line of the
	    form ``HH:MM:SS,mmm --> HH:MM:SS,mmm``, and then body lines up to the
	    next empty line or the end of the input.  Empty lines before an entry
	    (including surplus ones at the end of the file) are skipped.

	    Args:
	        srt_string: Full contents of the SRT file as one string.

	    Returns:
	        A list of dicts, one per entry in input order, with the keys
	        'start_time' and 'end_time' (the matched timestamp text passed
	        through munge_timestamp; the start group is optional in the
	        pattern, so None is passed when it is absent) and 'text' (the
	        body lines joined with newlines).

	    Raises:
	        AttributeError: If an entry's timestamp line does not contain
	            `` --> HH:MM:SS,mmm``.
	        IndexError: If the input ends right after an entry's number line.
	    """
	    timestamp_regex = re.compile(
	        r'((?P<start_time>\d\d:\d\d:\d\d,\d\d\d)?'
	        r' --> (?P<end_time>\d\d:\d\d:\d\d,\d\d\d))')
	    caption_json = []

	    # Only the queue behaviour of the deque is needed: lines are consumed
	    # strictly from the left.
	    line_queue = collections.deque(srt_string.splitlines())
	    while line_queue:
	        # Entries may be separated by more than one empty line, and files
	        # often end with several; none of them starts a new entry.
	        if not line_queue[0]:
	            line_queue.popleft()
	            continue

	        # Srt files number each entry, but we ignore that.
	        line_queue.popleft()

	        time_line = line_queue.popleft()

	        timestamp_match = timestamp_regex.search(time_line)
	        start_time = timestamp_match.group('start_time')
	        end_time = timestamp_match.group('end_time')

	        body = []
	        # Stop at the separating empty line *or* at end of input: the last
	        # entry of a file frequently has no trailing empty line.
	        while line_queue:
	            body_line = line_queue.popleft()
	            if not body_line:
	                break
	            body.append(body_line)

	        caption_json.append({
	            'start_time': munge_timestamp(start_time),
	            'end_time': munge_timestamp(end_time),
	            'text': '\n'.join(body),
	        })
	    return caption_json

	def parse_srt_f(srt_string):
	    """Parse SRT subtitle text into a list of caption dicts using groupby.

	    The input is split into lines and cut into blocks at empty lines; each
	    block of non-empty lines becomes one caption.

	    Args:
	        srt_string: Contents of an SRT file as a single string.

	    Returns:
	        A list with one dict per block, in input order.  Each dict has
	        the keys 'start_time', 'end_time', 'text' and 'ka_is_valid'
	        (always True).

	    Raises:
	        TypeError: If a block has fewer than two lines.
	        AttributeError: If a block's second line does not contain
	            ' --> HH:MM:SS,mmm'.
	    """
	    timestamp_regex = re.compile(
	        r'((?P<start_time>\d\d:\d\d:\d\d,\d\d\d)?'
	        r' --> (?P<end_time>\d\d:\d\d:\d\d,\d\d\d))')

	    def line_to_json(_number, timestamp, *body):
	        """Convert a block of lines into one caption entry.

	        Srt files number each entry, but we ignore that.  For example,
	        the block

	            1
	            00:00:00,000 --> 00:00:04,090
	            I know what you're thinking
	            Sal, addition doesn't seem so basic to me.

	        yields a dict whose 'text' is the last two lines joined by a
	        newline, whose 'ka_is_valid' is True, and whose 'start_time' and
	        'end_time' are the two timestamps passed through munge_timestamp.
	        """
	        timestamp_match = timestamp_regex.search(timestamp)
	        start_time = timestamp_match.group('start_time')
	        end_time = timestamp_match.group('end_time')
	        return {
	            'start_time': munge_timestamp(start_time),
	            'end_time': munge_timestamp(end_time),
	            'text': '\n'.join(body),
	            'ka_is_valid': True
	        }

	    # groupby groups blocks of lines separated by blank lines,
	    # since bool("") is False and bool(s) is True for other strings.
	    blocks = itertools.groupby(srt_string.splitlines(), bool)
	    return [line_to_json(*block)
	            for non_blank, block in blocks if non_blank]

	def _test_srt_file(filename):
	    """Read an SRT file, echo it to stdout, and return its parsed captions.

	    Args:
	        filename: Path of the SRT file to read (opened in text mode).

	    Returns:
	        Whatever parse_srt_f returns for the file's contents.
	    """
	    with open(filename, 'r') as f:
	        text = f.read()
	    # Parenthesised single-argument form: prints the same text under the
	    # Python 2 print statement and is valid as the Python 3 print function.
	    print(text)
	    return parse_srt_f(text)