Created
September 30, 2016 01:52
-
-
Save bmschmidt/07dfa3dba5dff5a71727fcd3c0618d24 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import gzip | |
import sys | |
def stripBadText(string):
    """
    Remove markup and non-spoken material from one line of subtitle text.

    Returns "" when `string` is None. Otherwise removes, in this order,
    HTML-style tags, [bracketed] spans and (parenthesized) spans, then
    replaces every "..." and every character 0xE2 with a single space.
    Surrounding whitespace is left in place, so removed spans can leave
    runs of spaces behind.
    """
    if string is None:
        return ""
    # No html tags
    string = re.sub(r"<[^>]+>", "", string)
    # People don't talk in [brackets] or (inside parentheses), so strip them.
    string = re.sub(r"\[([^\]]+)\]", "", string)
    string = re.sub(r"\(([^\)]+)\)", "", string)
    # Ellipses can be used to separate one set of tokens from another.
    string = string.replace("...", " ")
    # Decorative music-theme markers are used irregularly, so blank them out.
    # NOTE(review): presumably 0xE2 is meant as the lead byte of a UTF-8
    # encoded music symbol in undecoded byte strings; on decoded text it is
    # the letter a-circumflex instead -- confirm against the real input.
    string = string.replace("\xe2", " ")
    return string
class srtBlock(str):
    """
    A string of text, initialized from an srt block,
    which parses the metadata out into separate attributes and methods.

    `blockArray` is the list of lines of one block: the block number, the
    timestamp line, then zero or more lines of subtitle text. The instance's
    string value is the cleaned text (each text line passed through
    stripBadText, joined with single spaces).

    Attributes:
        blocknum:  first line of the block, as given.
        timestamp: second line of the block, as given.
        text:      the cleaned text (same as the string value).

    Raises IndexError if `blockArray` has fewer than two lines.
    """

    def __new__(cls, blockArray):
        # str is immutable, so its value has to be chosen in __new__; an
        # __init__-only subclass would end up holding repr(blockArray).
        # Slicing (rather than pop) leaves the caller's list untouched.
        text = " ".join([stripBadText(line) for line in blockArray[2:]])
        self = super(srtBlock, cls).__new__(cls, text)
        self.blocknum = blockArray[0]
        self.timestamp = blockArray[1]
        self.text = text
        return self

    def time(self, which="start"):
        """
        Return the block's start or end time in whole seconds.

        `which` is "start" (the HH:MM:SS at the beginning of the timestamp
        line) or "end" (the HH:MM:SS following "-->"). Fractions of a second
        are ignored. If the timestamp cannot be parsed, a message is written
        to stderr and -1 is returned. Any other `which` raises ValueError.
        """
        if which == "start":
            pattern = r"^(\d\d):(\d\d):(\d\d)"
        elif which == "end":
            # The start pattern is anchored at the beginning of the line, so
            # the end time needs its own pattern keyed on the arrow.
            pattern = r"-->\s*(\d\d):(\d\d):(\d\d)"
        else:
            raise ValueError("which must be 'start' or 'end', got %r" % (which,))

        hit = re.search(pattern, self.timestamp)
        if hit is None:
            sys.stderr.write("Couldn't parse " + self.timestamp + "\n")
            return -1
        hours, minutes, seconds = hit.groups()
        return int(hours) * 60 * 60 + int(minutes) * 60 + int(seconds)

    def textContent(self):
        """Return the cleaned subtitle text of this block."""
        return self.text
class srtGroup(object):
    """
    An object that is initialized with an array of srt filenames:
    It builds them as a group into a set of dictionaries that includes
    minute, percentage, twelfth of the way through information as well as
    the text.

    Attributes:
        targets:     the filenames as passed in.
        files:       one srtFile per filename, in the same order.
        totalLength: sum of the files' totalLength; set by strings().
    """

    def __init__(self, files):
        self.targets = files
        self.files = [srtFile(file) for file in files]

    def strings(self):
        """
        Return one dict per distinct start second across all files.

        Each dict has 'second' (the block's start time offset by the
        totalLength of all preceding files) and 'text' (the texts of every
        block starting at that second, joined with form feeds). Also sets
        self.totalLength. The files are iterated directly, so whatever
        iterating an srtFile does to it happens here.
        """
        self.totalLength = sum(file.totalLength for file in self.files)
        seenSoFar = 0
        chunks = dict()
        for file in self.files:
            for block in file:
                try:
                    key = seenSoFar + block.time()
                except TypeError:
                    # Dump what we were adding before re-raising, to help
                    # track down a non-numeric length or time.
                    print(self.targets)
                    print(seenSoFar)
                    print(block.time())
                    raise
                if key in chunks:
                    chunks[key]['text'] = chunks[key]['text'] + '\f' + block.text
                else:
                    chunks[key] = {'second': key, 'text': block.text}
            seenSoFar = seenSoFar + file.totalLength
        return list(chunks.values())

    def documents(self, minChunk=120):
        """
        Merge the per-second entries from strings() into larger chunks.

        Every entry is tagged with its 1-based position among i equal parts
        of the total length, for i = 3, 6, 12, ..., each stored under the key
        nameChunk(i). Doubling stops at the first i for which
        totalLength // i is below 2 * minChunk; that level's key name is
        stored as 'maxChunk' and 'minute' is set to second // 60. Entries
        that share the same position at that finest level are merged into
        one dict, their texts joined with form feeds.

        `minChunk` should be positive: with a positive total length and
        minChunk <= 0 the stopping condition is never met.

        Raises ZeroDivisionError (after printing the entry's text) when the
        total length is 0.
        """
        chunks = dict()
        for string in self.strings():
            i = 3
            while True:
                # This loop keeps assigning increasingly small chunks
                # (3rds, 6ths, 12ths, etc.) to the entry until the chunk
                # length falls below the threshold.
                key = nameChunk(i)
                try:
                    # The 1.000001 factor keeps the ratio strictly below i,
                    # so the very last second still lands in part i.
                    string[key] = int((string['second'] * i) / (self.totalLength * 1.000001)) + 1
                except ZeroDivisionError:
                    print(string['text'])
                    raise
                # Times two, because we're letting one slip through the cracks.
                if (self.totalLength // i) < (minChunk * 2):
                    string['maxChunk'] = key
                    string['minute'] = string['second'] // 60
                    break
                i = i * 2
            # Group by the entry's position at the finest level reached.
            key = string[key]
            if key in chunks:
                chunks[key]['text'] = chunks[key]['text'] + "\f" + string['text']
            else:
                # NOTE(review): the source's indentation was lost; this
                # assumes the metadata copy belongs to the "first entry of a
                # chunk" branch, so a chunk keeps the first entry's
                # second/minute values -- confirm.
                chunks[key] = {'text': string['text']}
                for key2 in string:
                    if key2 != "text":
                        chunks[key][key2] = string[key2]
        return list(chunks.values())
def nameChunk(i):
    """
    Return the label used for a split into `i` parts.

    A value equal to 3 gives "3rd"; anything else gives str(i) followed by
    "th" (so 6 -> "6th", 12 -> "12th").
    """
    return "3rd" if i == 3 else str(i) + "th"
class srtFile(object):
    """
    The subtitle blocks of one gzip-compressed srt file.

    The file is read and parsed on construction. Iterating the object hands
    the blocks out in file order and removes each one as it is returned, so
    a file can be iterated only once.

    Attributes:
        filename:    the path as given.
        source:      the gzip file object (closed once parsing is done).
        array:       parsed srtBlock objects, stored last-block-first.
        totalLength: start time in seconds of the last parsed block, or -1
                     when no block could be parsed.
    """

    def __init__(self, filename):
        self.filename = filename
        self.source = gzip.open(filename, "rb")
        try:
            self.readBlocks()
        finally:
            # Don't leave the handle open for the lifetime of the object.
            self.source.close()

    def readBlocks(self):
        """
        Read self.source and fill self.array and self.totalLength.

        Blocks are separated by blank lines (CRLF is normalized to LF
        first). Blocks with fewer than two lines, or whose start time cannot
        be parsed, are dropped.
        """
        blocks = self.source.read()
        if not isinstance(blocks, str):
            # Python 3 hands back bytes. latin-1 maps every byte to exactly
            # one character and cannot fail, so the text is seen
            # byte-for-byte as it would be in an undecoded byte string.
            # NOTE(review): non-ASCII text is therefore not decoded as
            # UTF-8; switch encodings here if the downstream cleanup is
            # made Unicode-aware.
            blocks = blocks.decode("latin-1")
        blocks = blocks.replace("\r\n", "\n")
        self.blocks = [block.split("\n") for block in blocks.split("\n\n")]
        self.array = []
        # Popping from the end stores the blocks last-first; next() pops
        # from the end again, which restores file order.
        while self.blocks:
            thisBlock = self.blocks.pop()
            if len(thisBlock) > 1:
                theBlock = srtBlock(thisBlock)
                if theBlock.time() > -1:
                    self.array.append(theBlock)
        if self.array:
            self.totalLength = self.array[0].time()
        else:
            self.totalLength = -1

    def __iter__(self):
        return self

    def next(self):
        """Remove and return the next block in file order."""
        try:
            return self.array.pop()
        except IndexError:
            raise StopIteration

    # Python 3 spells the iterator method __next__.
    __next__ = next
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment