Last active
January 8, 2017 20:01
-
-
Save dov/30a4bb762c522c404336ec855bc0f12b to your computer and use it in GitHub Desktop.
Realign srt files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This file reads two subtitle srt files. The first one is assumed | |
# to be wrongly aligned, and the second one is correctly aligned. | |
# In addition, the user needs to create a few alignment points. These | |
# will be used for rescaling the timescale of the first srt file | |
# so that it fits the second file. | |
# | |
# Dov Grobgeld | |
# [email protected] | |
import pandas as pd | |
import srt | |
def readsrt(filename):
    """Read an srt file into a DataFrame.

    Each record produced by ``srt.subreader`` becomes one row with the
    columns ``tstart``, ``tend`` and ``text``.

    :param filename: path of the srt file to read.
    :return: ``pandas.DataFrame`` with one row per subtitle record.
    """
    # subreader is a generator, so it has to be drained while the file is
    # still open; the with-block then closes the handle deterministically.
    with open(filename) as handle:
        records = list(srt.subreader(handle))
    return pd.DataFrame({
        'tstart': [tstart for (tstart, _), _ in records],
        'tend': [tend for (_, tend), _ in records],
        'text': [text for _, text in records],
    })
def tosrt(df, filename, offset=1):
    """Write the subtitle rows of ``df`` to ``filename`` in srt format.

    :param df: DataFrame with the columns ``tstart``, ``tend`` and ``text``.
    :param filename: path of the srt file to create (overwritten if present).
    :param offset: index number given to the first written record.
    """
    # The with-block guarantees the output is flushed and closed even if
    # writing a record fails half way through.
    with open(filename, 'w') as handle:
        subout = srt.subwriter(handle, offset=offset)
        for _, row in df.iterrows():
            subout.write_record(((row.tstart, row.tend), row.text))
def realign_srt(subject, reference, align_pairs):
    '''Realign the subject subtitles to the reference on the given alignment pairs.

    The time axis of ``subject`` is rescaled piecewise linearly: between two
    consecutive alignment pairs the ``tstart``/``tend`` values are mapped so
    that the subject's start times at both pairs coincide with the
    reference's start times. The last segment is extended to the end of the
    subject. Rows before the first alignment pair are left unchanged.

    :param subject: DataFrame with ``tstart`` and ``tend`` columns (the
        wrongly timed subtitles). It is not modified.
    :param reference: DataFrame with a ``tstart`` column (the correctly
        timed subtitles).
    :param align_pairs: sequence of ``(subject_number, reference_number)``
        pairs; the numbers are 1-based and are converted to 0-based row
        labels here.
    :return: a realigned copy of ``subject``.
    :raises ValueError: if two consecutive alignment pairs refer to the same
        subject start time, which would make the scale factor undefined.
    '''
    df_realigned = subject.copy()
    n = len(align_pairs)
    for i in range(n-1):
        # Segment start: row labels and start times in both files.
        # (.loc replaces the removed .ix indexer; both are label based on an
        # integer index.)
        ts1idx, ts2idx = [v-1 for v in align_pairs[i]]
        tss1, tss2 = subject.tstart.loc[ts1idx], reference.tstart.loc[ts2idx]

        # Segment end: row labels and start times in both files.
        te1idx, te2idx = [v-1 for v in align_pairs[i+1]]
        tse1, tse2 = subject.tstart.loc[te1idx], reference.tstart.loc[te2idx]

        if tse1 == tss1:
            raise ValueError(
                'alignment pairs %r and %r have the same subject start time'
                % (align_pairs[i], align_pairs[i+1]))
        slope = (1.0*(tse2-tss2)/(tse1-tss1))

        print('print tse1,tse2, slope, tse1*=',
              srt.ms2time(tse1),
              srt.ms2time(tse2),
              srt.ms2time((tse1-tss1)*slope+tss2),
              slope)

        # Extrapolate to end of the file for the last point
        if i == n-2:
            te1idx = len(df_realigned)-1

        # Interpolate the times. .loc slices are label based and inclusive.
        for clm in ['tstart', 'tend']:
            df_realigned.loc[ts1idx:te1idx+1, clm] = (
                (subject.loc[ts1idx:te1idx+1, clm] - tss1) * slope + tss2).astype(int)

    return df_realigned
def realign_srt_files(subject_filename, reference_filename, align_pairs,
                      new_filename,
                      offset=1):
    """Realign one srt file against another and save the result.

    Reads ``subject_filename`` and ``reference_filename``, realigns the
    subject on ``align_pairs`` and writes the outcome to ``new_filename``,
    numbering the written records starting at ``offset``.
    """
    subject_df = readsrt(subject_filename)
    reference_df = readsrt(reference_filename)
    realigned_df = realign_srt(subject_df, reference_df, align_pairs)
    tosrt(realigned_df, new_filename, offset=offset)
if __name__ == '__main__':
    # One entry per file to fix:
    # (subject file, reference file, alignment pairs, output file, first index)
    jobs = [
        ('/tmp/heb1.srt',      # 'bad.he.srt',
         '/tmp/en-fixed.srt',  # 'good.en.srt',
         [(3, 4), (456, 441)],
         '/tmp/fixed1.srt',
         1),
        ('/tmp/heb2.srt',      # 'bad.he.srt',
         '/tmp/en-fixed.srt',  # 'good.en.srt',
         [(1, 442), (380, 821)],
         '/tmp/fixed2.srt',
         457),
    ]
    for subject_path, reference_path, pairs, output_path, first_index in jobs:
        realign_srt_files(
            subject_path,
            reference_path,
            align_pairs=pairs,
            new_filename=output_path,
            offset=first_index)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# Copyleft 2011 wistful <wst public mail at gmail com> | |
# | |
# This is a free software; you can redistribute it and/or | |
# modify it under the terms of the GNU Lesser General Public | |
# License as published by the Free Software Foundation; either | |
# version 2.1 of the License, or (at your option) any later version. | |
# | |
# This library is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
# Lesser General Public License for more details. | |
# | |
# You should have received a copy of the GNU Lesser General Public | |
# License along with this library; if not, write to the Free Software | |
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
# | |
# Minor modifications to be more pythonesque by Dov Grobgeld | |
__author__ = 'wistful' | |
import re | |
class SrtFormatError(Exception):
    """Error raised when a piece of srt data does not have the expected format."""

    def __init__(self, message):
        # Hand the message to Exception as well, so that ``args``, ``repr()``
        # and pickling carry it instead of an empty tuple.
        super(SrtFormatError, self).__init__(message)
        self.message = message

    def __str__(self):
        return repr(self.message)
def parse_time(str_time):
    """
    convert string format of start-finish to integer(ms) format

    Leading and trailing whitespace is ignored.

    :param str_time: timing line of the form ``hh:mm:ss,msc --> hh:mm:ss,msc``
    :return: tuple ``(start_ms, finish_ms)`` of integers
    :raises SrtFormatError: if ``str_time`` does not have that format

    >>> parse_time("00:14:33,460 --> 00:14:35,419")
    (873460, 875419)
    """
    pattern_time = r"(?P<h1>\d+):(?P<m1>\d+):(?P<s1>\d+),(?P<ms1>\d+)\W*-->\W*(?P<h2>\d+):(?P<m2>\d+):(?P<s2>\d+),(?P<ms2>\d+)$"
    # Only a non-string argument is turned into "no match" here; a bare
    # except would also swallow KeyboardInterrupt and SystemExit.
    try:
        match = re.match(pattern_time, str_time.strip())
    except (AttributeError, TypeError):
        match = None
    if match is None:
        message = u"Invalid string format '%s' , expect hh:mm:ss,msc --> hh:mm:ss,msc" % str_time
        raise SrtFormatError(message)
    d = match.groupdict()

    def get_ms(h, m, s, ms):
        # Fold hours, minutes and seconds into seconds, then scale to ms.
        return (int(s) + int(m) * 60 + int(h) * 60 * 60) * 1000 + int(ms)

    return get_ms(d['h1'], d['m1'], d['s1'], d['ms1']), get_ms(d['h2'], d['m2'], d['s2'], d['ms2'])
def ms2time(ms):
    """
    convert msc to string format ``hh:mm:ss,msc``

    A fractional part of ``ms`` is discarded. Hours are not wrapped, so
    values of 60 hours and more keep their full hour count. A negative value
    is written as a minus sign followed by the formatted magnitude.

    :param ms: time in milliseconds (int or float)
    :return: formatted time string

    >>> ms2time(233243)
    '00:03:53,243'
    >>> ms2time(442)
    '00:00:00,442'
    >>> ms2time(219600000)
    '61:00:00,000'
    """
    total_ms = int(ms)
    sign = '-' if total_ms < 0 else ''
    # Pure integer arithmetic: gives the same result on Python 2 and 3 and
    # avoids float rounding on the way.
    total_seconds, millis = divmod(abs(total_ms), 1000)
    total_minutes, ss = divmod(total_seconds, 60)
    hh, mm = divmod(total_minutes, 60)
    return sign + "%02d:%02d:%02d,%03d" % (hh, mm, ss, millis)
def parse_ms(start, finish):
    """
    build the srt timing line for a start/finish pair given in msc

    >>> parse_ms(442, 233243)
    '00:00:00,442 --> 00:03:53,243'
    """
    start_text = ms2time(start)
    finish_text = ms2time(finish)
    return "%s --> %s" % (start_text, finish_text)
def subreader(handle):
    """
    generate ((time_start, time_finish), subtitle_text) for every record

    handle: iterable of lines, e.g. an open srt file

    A line consisting only of digits is taken as a record index and ends the
    previous record; a line containing '-->' is parsed as the timing line;
    every other non-empty line is collected as subtitle text. The text lines
    of a record are joined with newlines and get a trailing newline.
    """
    pattern_index = re.compile(r"^\d+$")
    times, text = None, list()
    for line in handle:
        # Get rid of bom markers! The UTF-8 BOM shows up as three raw bytes
        # when the data was not decoded, and as U+FEFF when it was decoded
        # as UTF-8; left in place it hides the first index line.
        if isinstance(line, bytes):
            line = line.replace(b'\xef\xbb\xbf', b'')
        else:
            line = line.replace(u'\ufeff', u'').replace(u'\xef\xbb\xbf', u'')
        line = line.strip()
        if pattern_index.match(line):
            # A new index line closes the record collected so far.
            if times:
                yield (times, '\n'.join(text) + '\n')
                times, text = None, list()
        elif '-->' in line:
            times = parse_time(line)
        elif line:
            text.append(line)
    # The last record is not followed by another index line.
    if times:
        yield (times, '\n'.join(text) + '\n')
class subwriter:
    """Writes numbered srt records to a file-like object."""

    def __init__(self, handle, offset=1):
        # handle: object with a write() method that receives the records
        # offset: index number used for the first record
        self.index = offset
        self.handle = handle

    def write_record(self, record):
        """Write one ((start, finish), text) record and advance the index."""
        times, text = record
        start, finish = times
        timing_line = parse_ms(start, finish)
        entry = "%s\n%s\n%s\n" % (str(self.index), timing_line, text)
        self.handle.write(entry)
        self.index = self.index + 1
if __name__ == '__main__':
    # Run the doctests embedded in this module's docstrings.
    import doctest
    # Function-call form of print: valid on both Python 2 and Python 3.
    print(doctest.testmod())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment