Skip to content

Instantly share code, notes, and snippets.

@bmschmidt
Created September 30, 2016 01:52
Show Gist options
  • Save bmschmidt/07dfa3dba5dff5a71727fcd3c0618d24 to your computer and use it in GitHub Desktop.
Save bmschmidt/07dfa3dba5dff5a71727fcd3c0618d24 to your computer and use it in GitHub Desktop.
import re
import gzip
import sys
def stripBadText(string):
    """Remove non-dialogue markup from one line of subtitle text.

    Args:
        string: a line of subtitle text, or None.

    Returns:
        The cleaned text; "" when *string* is None.  HTML-style tags,
        [bracketed] spans and (parenthesised) spans are deleted outright,
        while "..." and the "\\xe2" character are each replaced by a
        single space.
    """
    if string is None:
        return ""
    # No html tags
    string = re.sub(r"<[^>]+>", "", string)
    # People don't talk in [brackets] or (inside parentheses), so strip them.
    # Both patterns need at least one character between the delimiters, so
    # an empty "[]" or "()" is left alone.
    string = re.sub(r"\[([^\]]+)\]", "", string)
    string = re.sub(r"\(([^\)]+)\)", "", string)
    # Ellipses can be used to separate one set of tokens from another.
    string = string.replace("...", " ")
    # And the use of dancy music theme things is irregular.
    # NOTE(review): presumably "\xe2" targets the lead byte of a UTF-8
    # encoded music-note glyph in Python 2 byte strings; any following
    # bytes of that sequence are not removed here -- confirm intent.
    string = string.replace("\xe2", " ")
    return string
class srtBlock(str):
    """
    A string of text, initialized from an srt block,
    which parses the metadata out into separate attributes.

    The block is given as a list of lines: the block number, the
    timestamp line, then any number of text lines.  The text lines are
    cleaned with stripBadText and joined with single spaces; that joined
    text is both the value of the string itself and the ``text``
    attribute.

    Attributes:
        blocknum: first line of the block, kept as a string.
        timestamp: second line of the block, kept as a string.
        text: the cleaned, space-joined text lines.
    """

    def __new__(cls, blockArray):
        """Build the block from its list of lines.

        Args:
            blockArray: [block number, timestamp, text line, ...].  The
                list is read but not modified.

        Raises:
            IndexError: if blockArray has fewer than two items.
        """
        # str is immutable, so its value must be fixed in __new__; doing
        # the work in __init__ would leave the string value equal to the
        # repr of the list that was passed in.
        text = " ".join([stripBadText(line) for line in blockArray[2:]])
        block = super(srtBlock, cls).__new__(cls, text)
        block.blocknum = blockArray[0]
        block.timestamp = blockArray[1]
        block.text = text
        return block

    def time(self, which="start"):
        """Return one end of the block's time range in whole seconds.

        Args:
            which: "start" for the time the timestamp line begins with,
                "end" for the time that follows the "-->" separator.

        Returns:
            hours*3600 + minutes*60 + seconds as an int (any fractional
            part after the seconds is ignored), or -1 when the requested
            time cannot be parsed; in that case a message is also written
            to stderr.

        Raises:
            ValueError: if which is neither "start" nor "end".
        """
        if which == "start":
            field = self.timestamp
        elif which == "end":
            # Everything after the arrow; empty when there is no arrow,
            # which then falls through to the "couldn't parse" path.
            field = self.timestamp.partition("-->")[2].lstrip()
        else:
            raise ValueError("which must be 'start' or 'end', got %r" % (which,))
        hit = re.match(r"(\d\d):(\d\d):(\d\d)", field)
        if hit is None:
            sys.stderr.write("Couldn't parse " + self.timestamp + "\n")
            return -1
        hours, minutes, seconds = hit.groups()
        return int(hours) * 60 * 60 + int(minutes) * 60 + int(seconds)

    def textContent(self):
        """Return the cleaned text of the block (same as ``self.text``)."""
        return self.text
class srtGroup(object):
    """
    An object that is initialized with an array of srt filenames:
    it treats them as one continuous sequence and builds dictionaries
    that carry the text together with its position (second, minute, and
    which third / sixth / twelfth ... of the whole it falls in).

    Attributes:
        targets: the list of filenames as given.
        files: one srtFile per filename, in the same order.
        totalLength: sum of the files' totalLength values; only set once
            strings() (or documents(), which calls it) has run.
    """

    def __init__(self, files):
        self.targets = files
        self.files = [srtFile(name) for name in files]

    def strings(self):
        """Collect the text of every block, keyed by its start second.

        Each file's block times are offset by the combined totalLength of
        the files before it.  Blocks that land on the same second are
        joined with a form feed ("\\f").

        Returns:
            A list of {'second': int, 'text': str} dicts in ascending
            order of 'second'.
        """
        self.totalLength = sum([srt.totalLength for srt in self.files])
        offset = 0
        chunks = dict()
        for srt in self.files:
            for block in srt:
                try:
                    second = offset + block.time()
                except TypeError:
                    # Dump some context before re-raising.  Single-argument
                    # print(...) behaves the same on Python 2 and 3.
                    print(self.targets)
                    print(offset)
                    print(block.time())
                    raise
                if second in chunks:
                    chunks[second]['text'] = chunks[second]['text'] + '\f' + block.text
                else:
                    chunks[second] = {'second': second, 'text': block.text}
            offset = offset + srt.totalLength
        # Sorted so the result (and the order in which documents() joins
        # text) is chronological rather than dependent on dict ordering.
        return [chunks[second] for second in sorted(chunks)]

    def documents(self, minChunk=120):
        """Merge the per-second strings into chunk-sized documents.

        Every entry from strings() is numbered at successively finer
        divisions of the total length (3rds, 6ths, 12ths, ...), stored
        under the key given by nameChunk.  Subdivision stops at the first
        division for which totalLength // i is below 2 * minChunk; the
        entry's number at that division picks the document it joins.

        Args:
            minChunk: target minimum chunk length in seconds; must be
                positive.

        Returns:
            A list of dicts in ascending order of chunk number.  'text'
            holds the member texts joined by "\\f"; every other key
            ('second', 'minute', 'maxChunk', '3rd', '6th', ...) is copied
            from the first entry that fell into the chunk.

        Raises:
            ValueError: if minChunk is not positive (the subdivision loop
                would otherwise never reach its stopping condition).
            ZeroDivisionError: if totalLength is 0; the offending text is
                printed first.
        """
        if minChunk <= 0:
            raise ValueError("minChunk must be positive")
        chunks = dict()
        for string in self.strings():
            i = 3
            going = True
            while going:
                # This loop keeps assigning increasingly small chunks
                # (6ths, 12ths, etc) to the entry, until the chunk size
                # falls below the threshold.
                key = nameChunk(i)
                try:
                    # The 1.000001 factor keeps an entry at exactly
                    # totalLength inside chunk i instead of i + 1.
                    string[key] = int((string['second'] * i) / (self.totalLength * 1.000001)) + 1
                except ZeroDivisionError:
                    print(string['text'])
                    raise
                # Times two, because we're letting one slip through the
                # cracks.  "//" keeps integer division on every Python.
                if (self.totalLength // i) < (minChunk * 2):
                    string['maxChunk'] = key
                    string['minute'] = string['second'] // 60
                    going = False
                i = i * 2
            # key still names the finest division reached above.
            chunkNumber = string[key]
            if chunkNumber in chunks:
                chunks[chunkNumber]['text'] = chunks[chunkNumber]['text'] + "\f" + string['text']
            else:
                chunks[chunkNumber] = dict(string)
        return [chunks[number] for number in sorted(chunks)]
def nameChunk(i):
    """Return the dictionary key used for a division into *i* parts.

    A value equal to 3 gives "3rd"; anything else gives its str() form
    followed by "th" (for example 6 -> "6th", 12 -> "12th").
    """
    return "3rd" if i == 3 else str(i) + "th"
class srtFile(object):
    """One gzip-compressed srt file, parsed into srtBlock objects.

    Attributes:
        filename: the path that was opened.
        source: the gzip file object; closed again once __init__ returns.
        array: the blocks whose start time parsed, stored in REVERSE file
            order (last block of the file at index 0).
        totalLength: start time in seconds of the last parseable block,
            or -1 when the file produced no parseable block.

    Iterating over the object yields the blocks in file order and can be
    repeated.  Calling next() directly removes and returns the earliest
    block still held in ``array``.
    """

    def __init__(self, filename):
        self.filename = filename
        self.source = gzip.open(filename, "r")
        try:
            self.readBlocks()
        finally:
            # Everything is read up front, so don't leave the handle open.
            self.source.close()

    def readBlocks(self):
        """Read self.source and fill self.array and self.totalLength.

        Blocks are separated by blank lines.  A block is kept only when it
        has more than one line and its start time parses (time() > -1).
        """
        # NOTE(review): the payload is handled as a native str, which
        # holds on Python 2; on Python 3 gzip.open(..., "r") yields bytes
        # and this would need a decode step -- confirm target interpreter.
        raw = "".join(self.source.readlines())
        raw = raw.replace("\r\n", "\n")
        self.blocks = [chunk.split("\n") for chunk in raw.split("\n\n")]
        self.array = []
        # Popping from the end walks the file backwards, which is what
        # leaves self.array in reverse order with the last block first.
        while self.blocks:
            lines = self.blocks.pop()
            if len(lines) > 1:
                block = srtBlock(lines)
                if block.time() > -1:
                    self.array.append(block)
        if self.array:
            self.totalLength = self.array[0].time()
        else:
            self.totalLength = -1

    def __iter__(self):
        """Return a fresh iterator over the blocks in file order.

        The stored blocks are not consumed, so the file can be iterated
        more than once.
        """
        return iter(reversed(self.array))

    def next(self):
        """Remove and return the earliest remaining block.

        Raises:
            StopIteration: when no blocks are left.
        """
        try:
            return self.array.pop()
        except IndexError:
            raise StopIteration

    # Python 3 spells the iterator-protocol method __next__.
    __next__ = next
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment