bmschmidt · September 30, 2016 01:52
diff --git a/pySRT.py b/pySRT.py
 import re
 import gzip
 import sys

 def stripBadText(string):
    """
    Clean one piece of subtitle text so that only spoken words remain.

    Returns "" when `string` is None. Otherwise returns `string` with
    <...> tags and non-empty [bracketed] / (parenthesised) spans removed,
    and with every "..." and every 0xE2 character replaced by a space.
    """
    if string is None:
        return ""
    # No html tags
    string = re.sub(r"<[^>]+>", "", string)
    # People don't talk in [brackets] or (inside parentheses), so strip them.
    # Both patterns need at least one character inside, so "[]" and "()"
    # are left alone.
    string = re.sub(r"\[([^\]]+)\]", "", string)
    string = re.sub(r"\(([^\)]+)\)", "", string)
    # Ellipses can be used to separate one set of tokens from another.
    string = string.replace("...", " ")
    # Music-theme markers are used irregularly, so blank them out.
    # NOTE(review): "\xe2" is presumably the lead byte of a UTF-8 encoded
    # music-note glyph in Python 2 byte strings -- confirm against real data.
    string = string.replace("\xe2", " ")
    return string
    
 class srtBlock(str):
    """
    A string of text, initialized from an srt block,
    which parses the metadata out into separate attributes and methods.

    `blockArray` is the list of lines of one block: element 0 is stored as
    `blocknum`, element 1 as `timestamp`, and every remaining element is
    passed through stripBadText() and joined with single spaces. That
    cleaned text is both the value of the string itself and the `text`
    attribute. `blockArray` is not modified. An IndexError is raised when
    it has fewer than two elements.
    """

    def __new__(cls, blockArray):
        # str is immutable: its value is fixed by __new__, so the cleaned
        # text has to be computed here rather than in __init__ (where the
        # value would already have been set from the raw list).
        blocknum = blockArray[0]
        timestamp = blockArray[1]
        text = " ".join([stripBadText(line) for line in blockArray[2:]])
        self = super(srtBlock, cls).__new__(cls, text)
        self.blocknum = blocknum
        self.timestamp = timestamp
        self.text = text
        return self

    def time(self, which="start"):
        """
        Return the block's start or end time in whole seconds.

        `which` is "start" (an HH:MM:SS at the very beginning of the
        timestamp line) or "end" (the first HH:MM:SS after a "-->" arrow).
        Anything following the seconds field is ignored. When the requested
        time cannot be found, a message is written to stderr and -1 is
        returned. Any other value of `which` raises ValueError.
        """
        clock = r"(\d\d):(\d\d):(\d\d)"
        if which == "start":
            hit = re.match(clock, self.timestamp)
        elif which == "end":
            hit = re.search(r"-->\s*" + clock, self.timestamp)
        else:
            raise ValueError("which must be 'start' or 'end'")
        if hit is None:
            sys.stderr.write("Couldn't parse " + self.timestamp + "\n")
            return -1
        hours, minutes, seconds = hit.groups()
        return int(hours) * 60 * 60 + int(minutes) * 60 + int(seconds)

    def textContent(self):
        """Return the cleaned text of the block (the `text` attribute)."""
        return self.text

 class srtGroup(object):

    """
    An object that is initialized with an array of srt filenames:
    It builds them as a group into a set of dictionaries that includes
    minute and "nth of the way through" information as well as the text.

    `targets` keeps the filenames as given; `files` holds one srtFile per
    filename, in the same order. The files are treated as one continuous
    work: each file's times are offset by the lengths of the files before it.
    """

    def __init__(self, files):
        self.targets = files
        self.files = [srtFile(file) for file in files]

    def strings(self):
        """
        Return one dict per distinct second of the whole group.

        Each dict has 'second' (the block's start time plus the total
        length of all earlier files) and 'text' (the text of every block
        starting at that second, joined with form feeds). The list is
        sorted by 'second'. Also sets self.totalLength to the summed
        length of all files.
        """
        # A file without any usable block reports totalLength == -1; count
        # it as zero so it neither shortens the total nor shifts later files.
        lengths = [max(srt.totalLength, 0) for srt in self.files]
        self.totalLength = sum(lengths)
        seenSoFar = 0
        chunks = dict()
        for srt, length in zip(self.files, lengths):
            for block in srt:
                try:
                    key = seenSoFar + block.time()
                except TypeError:
                    # Dump enough context to identify the offending block.
                    print(self.targets)
                    print(seenSoFar)
                    print(block.time())
                    raise
                if key in chunks:
                    chunks[key]['text'] = chunks[key]['text'] + '\f' + block.text
                else:
                    chunks[key] = {'second': key, 'text': block.text}
            seenSoFar = seenSoFar + length
        return [chunks[key] for key in sorted(chunks)]

    def documents(self, minChunk=120):
        """
        Merge the per-second strings into chunks of roughly equal duration.

        Every string is labelled with its 1-based position at successively
        finer divisions of the group (3rds, 6ths, 12ths, ...), stopping at
        the first division whose pieces are shorter than 2 * minChunk
        seconds. Strings that share an index at that finest division are
        merged: their texts are joined with form feeds, and every other key
        ('second', the division labels, 'maxChunk', 'minute') is taken from
        the last string merged in. The list is sorted by chunk index.

        Raises ValueError when minChunk is not positive, and re-raises
        ZeroDivisionError (after printing the text) when the group's total
        length is zero.
        """
        if minChunk <= 0:
            # With a non-positive threshold the refinement loop below could
            # never reach its stopping condition.
            raise ValueError("minChunk must be positive")
        chunks = dict()
        for string in self.strings():
            i = 3
            while True:
                # This loop keeps assigning increasingly small chunks (6ths,
                # 12ths, etc) to something, until the number falls below
                # the threshold.
                key = nameChunk(i)
                try:
                    # The 1.000001 factor keeps the very last second from
                    # landing in division i + 1.
                    string[key] = int((string['second'] * i) / (self.totalLength * 1.000001)) + 1
                except ZeroDivisionError:
                    print(string['text'])
                    raise
                # Times two, because we're letting one slip through the cracks.
                if (self.totalLength // i) < (minChunk * 2):
                    string['maxChunk'] = key
                    string['minute'] = string['second'] // 60
                    break
                i = i * 2

            # Index of this string at the finest division reached above.
            bucket = string[key]
            if bucket in chunks:
                chunks[bucket]['text'] = chunks[bucket]['text'] + "\f" + string['text']
            else:
                chunks[bucket] = {'text': string['text']}
            for field in string:
                if field != "text":
                    chunks[bucket][field] = string[field]

        return [chunks[key] for key in sorted(chunks)]
                
 def nameChunk(i):
    """
    Name a division into `i` parts: "3rd" for 3, otherwise str(i) + "th"
    (so 6 gives "6th", 12 gives "12th").
    """
    if i != 3:
        return str(i) + "th"
    return "3rd"

 class srtFile(object):
    """
    The parsed blocks of one gzip-compressed srt file.

    On construction the file is opened, read completely by readBlocks()
    and closed again. `totalLength` is the start time (in seconds) of the
    last usable block of the file, or -1 when there is none. Iterating
    yields the srtBlock objects in file order and consumes them: each
    block is removed from `array` as it is returned.
    """

    def __init__(self, filename):
        self.filename = filename
        self.source = gzip.open(filename, "r")
        try:
            self.readBlocks()
        finally:
            # readBlocks() reads everything at once, so the handle is not
            # needed afterwards; close it instead of leaking it.
            self.source.close()

    def readBlocks(self):
        """
        Read self.source and fill self.array and self.totalLength.

        Blocks are separated by blank lines. Blocks with fewer than two
        lines, or whose start time cannot be parsed, are dropped.
        """
        blocks = self.source.read()
        if isinstance(blocks, bytes) and not isinstance(blocks, str):
            # Python 3 hands back bytes from gzip; Python 2 byte strings
            # are already `str` and are left untouched.
            # NOTE(review): UTF-8 is an assumption about the subtitle
            # files -- confirm against the real corpus.
            blocks = blocks.decode("utf-8", "replace")

        blocks = blocks.replace("\r\n", "\n")
        self.blocks = [block.split("\n") for block in blocks.split("\n\n")]

        # Blocks are taken from the end, so self.array is in reverse file
        # order; next() pops from the end again and restores file order.
        self.array = []
        while self.blocks:
            thisBlock = self.blocks.pop()
            if len(thisBlock) > 1:
                theBlock = srtBlock(thisBlock)
                if theBlock.time() > -1:
                    self.array.append(theBlock)
        if self.array:
            self.totalLength = self.array[0].time()
        else:
            self.totalLength = -1

    def __iter__(self):
        return self

    def next(self):
        """Remove and return the next block; StopIteration when exhausted."""
        try:
            return self.array.pop()
        except IndexError:
            raise StopIteration

    # Python 3 spelling of the iterator protocol.
    __next__ = next
	import re
	import gzip
	import sys

	def stripBadText(string):
	    """
	    Clean one piece of subtitle text so that only spoken words remain.

	    Returns "" when `string` is None. Otherwise returns `string` with
	    <...> tags and non-empty [bracketed] / (parenthesised) spans removed,
	    and with every "..." and every 0xE2 character replaced by a space.
	    """
	    if string is None:
	        return ""
	    # No html tags
	    string = re.sub(r"<[^>]+>", "", string)
	    # People don't talk in [brackets] or (inside parentheses), so strip them.
	    # Both patterns need at least one character inside, so "[]" and "()"
	    # are left alone.
	    string = re.sub(r"\[([^\]]+)\]", "", string)
	    string = re.sub(r"\(([^\)]+)\)", "", string)
	    # Ellipses can be used to separate one set of tokens from another.
	    string = string.replace("...", " ")
	    # Music-theme markers are used irregularly, so blank them out.
	    # NOTE(review): "\xe2" is presumably the lead byte of a UTF-8 encoded
	    # music-note glyph in Python 2 byte strings -- confirm against real data.
	    string = string.replace("\xe2", " ")
	    return string

	class srtBlock(str):
	    """
	    A string of text, initialized from an srt block,
	    which parses the metadata out into separate attributes and methods.

	    `blockArray` is the list of lines of one block: element 0 is stored as
	    `blocknum`, element 1 as `timestamp`, and every remaining element is
	    passed through stripBadText() and joined with single spaces. That
	    cleaned text is both the value of the string itself and the `text`
	    attribute. `blockArray` is not modified. An IndexError is raised when
	    it has fewer than two elements.
	    """

	    def __new__(cls, blockArray):
	        # str is immutable: its value is fixed by __new__, so the cleaned
	        # text has to be computed here rather than in __init__ (where the
	        # value would already have been set from the raw list).
	        blocknum = blockArray[0]
	        timestamp = blockArray[1]
	        text = " ".join([stripBadText(line) for line in blockArray[2:]])
	        self = super(srtBlock, cls).__new__(cls, text)
	        self.blocknum = blocknum
	        self.timestamp = timestamp
	        self.text = text
	        return self

	    def time(self, which="start"):
	        """
	        Return the block's start or end time in whole seconds.

	        `which` is "start" (an HH:MM:SS at the very beginning of the
	        timestamp line) or "end" (the first HH:MM:SS after a "-->" arrow).
	        Anything following the seconds field is ignored. When the requested
	        time cannot be found, a message is written to stderr and -1 is
	        returned. Any other value of `which` raises ValueError.
	        """
	        clock = r"(\d\d):(\d\d):(\d\d)"
	        if which == "start":
	            hit = re.match(clock, self.timestamp)
	        elif which == "end":
	            hit = re.search(r"-->\s*" + clock, self.timestamp)
	        else:
	            raise ValueError("which must be 'start' or 'end'")
	        if hit is None:
	            sys.stderr.write("Couldn't parse " + self.timestamp + "\n")
	            return -1
	        hours, minutes, seconds = hit.groups()
	        return int(hours) * 60 * 60 + int(minutes) * 60 + int(seconds)

	    def textContent(self):
	        """Return the cleaned text of the block (the `text` attribute)."""
	        return self.text

	class srtGroup(object):

	    """
	    An object that is initialized with an array of srt filenames:
	    It builds them as a group into a set of dictionaries that includes
	    minute and "nth of the way through" information as well as the text.

	    `targets` keeps the filenames as given; `files` holds one srtFile per
	    filename, in the same order. The files are treated as one continuous
	    work: each file's times are offset by the lengths of the files before it.
	    """

	    def __init__(self, files):
	        self.targets = files
	        self.files = [srtFile(file) for file in files]

	    def strings(self):
	        """
	        Return one dict per distinct second of the whole group.

	        Each dict has 'second' (the block's start time plus the total
	        length of all earlier files) and 'text' (the text of every block
	        starting at that second, joined with form feeds). The list is
	        sorted by 'second'. Also sets self.totalLength to the summed
	        length of all files.
	        """
	        # A file without any usable block reports totalLength == -1; count
	        # it as zero so it neither shortens the total nor shifts later files.
	        lengths = [max(srt.totalLength, 0) for srt in self.files]
	        self.totalLength = sum(lengths)
	        seenSoFar = 0
	        chunks = dict()
	        for srt, length in zip(self.files, lengths):
	            for block in srt:
	                try:
	                    key = seenSoFar + block.time()
	                except TypeError:
	                    # Dump enough context to identify the offending block.
	                    print(self.targets)
	                    print(seenSoFar)
	                    print(block.time())
	                    raise
	                if key in chunks:
	                    chunks[key]['text'] = chunks[key]['text'] + '\f' + block.text
	                else:
	                    chunks[key] = {'second': key, 'text': block.text}
	            seenSoFar = seenSoFar + length
	        return [chunks[key] for key in sorted(chunks)]

	    def documents(self, minChunk=120):
	        """
	        Merge the per-second strings into chunks of roughly equal duration.

	        Every string is labelled with its 1-based position at successively
	        finer divisions of the group (3rds, 6ths, 12ths, ...), stopping at
	        the first division whose pieces are shorter than 2 * minChunk
	        seconds. Strings that share an index at that finest division are
	        merged: their texts are joined with form feeds, and every other key
	        ('second', the division labels, 'maxChunk', 'minute') is taken from
	        the last string merged in. The list is sorted by chunk index.

	        Raises ValueError when minChunk is not positive, and re-raises
	        ZeroDivisionError (after printing the text) when the group's total
	        length is zero.
	        """
	        if minChunk <= 0:
	            # With a non-positive threshold the refinement loop below could
	            # never reach its stopping condition.
	            raise ValueError("minChunk must be positive")
	        chunks = dict()
	        for string in self.strings():
	            i = 3
	            while True:
	                # This loop keeps assigning increasingly small chunks (6ths,
	                # 12ths, etc) to something, until the number falls below
	                # the threshold.
	                key = nameChunk(i)
	                try:
	                    # The 1.000001 factor keeps the very last second from
	                    # landing in division i + 1.
	                    string[key] = int((string['second'] * i) / (self.totalLength * 1.000001)) + 1
	                except ZeroDivisionError:
	                    print(string['text'])
	                    raise
	                # Times two, because we're letting one slip through the cracks.
	                if (self.totalLength // i) < (minChunk * 2):
	                    string['maxChunk'] = key
	                    string['minute'] = string['second'] // 60
	                    break
	                i = i * 2

	            # Index of this string at the finest division reached above.
	            bucket = string[key]
	            if bucket in chunks:
	                chunks[bucket]['text'] = chunks[bucket]['text'] + "\f" + string['text']
	            else:
	                chunks[bucket] = {'text': string['text']}
	            for field in string:
	                if field != "text":
	                    chunks[bucket][field] = string[field]

	        return [chunks[key] for key in sorted(chunks)]

	def nameChunk(i):
	    """
	    Name a division into `i` parts: "3rd" for 3, otherwise str(i) + "th"
	    (so 6 gives "6th", 12 gives "12th").
	    """
	    if i == 3:
	        return "3rd"
	    return str(i) + "th"

	class srtFile(object):
	    """
	    The parsed blocks of one gzip-compressed srt file.

	    On construction the file is opened, read completely by readBlocks()
	    and closed again. `totalLength` is the start time (in seconds) of the
	    last usable block of the file, or -1 when there is none. Iterating
	    yields the srtBlock objects in file order and consumes them: each
	    block is removed from `array` as it is returned.
	    """

	    def __init__(self, filename):
	        self.filename = filename
	        self.source = gzip.open(filename, "r")
	        try:
	            self.readBlocks()
	        finally:
	            # readBlocks() reads everything at once, so the handle is not
	            # needed afterwards; close it instead of leaking it.
	            self.source.close()

	    def readBlocks(self):
	        """
	        Read self.source and fill self.array and self.totalLength.

	        Blocks are separated by blank lines. Blocks with fewer than two
	        lines, or whose start time cannot be parsed, are dropped.
	        """
	        blocks = self.source.read()
	        if isinstance(blocks, bytes) and not isinstance(blocks, str):
	            # Python 3 hands back bytes from gzip; Python 2 byte strings
	            # are already `str` and are left untouched.
	            # NOTE(review): UTF-8 is an assumption about the subtitle
	            # files -- confirm against the real corpus.
	            blocks = blocks.decode("utf-8", "replace")

	        blocks = blocks.replace("\r\n", "\n")
	        self.blocks = [block.split("\n") for block in blocks.split("\n\n")]

	        # Blocks are taken from the end, so self.array is in reverse file
	        # order; next() pops from the end again and restores file order.
	        self.array = []
	        while self.blocks:
	            thisBlock = self.blocks.pop()
	            if len(thisBlock) > 1:
	                theBlock = srtBlock(thisBlock)
	                if theBlock.time() > -1:
	                    self.array.append(theBlock)
	        if self.array:
	            self.totalLength = self.array[0].time()
	        else:
	            self.totalLength = -1

	    def __iter__(self):
	        return self

	    def next(self):
	        """Remove and return the next block; StopIteration when exhausted."""
	        try:
	            return self.array.pop()
	        except IndexError:
	            raise StopIteration

	    # Python 3 spelling of the iterator protocol.
	    __next__ = next