Skip to content

Instantly share code, notes, and snippets.

@dov
Last active January 8, 2017 20:01
Show Gist options
  • Save dov/30a4bb762c522c404336ec855bc0f12b to your computer and use it in GitHub Desktop.
Save dov/30a4bb762c522c404336ec855bc0f12b to your computer and use it in GitHub Desktop.
Realign srt files
# This file reads two subtitle srt files. The first one is assumed
# to be wrongly aligned, and the second one is correctly aligned.
# In addition, the user needs to create a few alignment points. These
# will be used for rescaling the timescale of the first srt file
# so that it fits the second file.
#
# Dov Grobgeld
# [email protected]
import pandas as pd
import srt
def readsrt(filename):
    """Load an srt subtitle file into a pandas DataFrame.

    filename: path of the srt file to read.

    Returns a DataFrame with one row per record yielded by
    ``srt.subreader``, in file order, with the columns ``tstart``,
    ``tend`` and ``text``.
    """
    tstart_ary, tend_ary, text_ary = [], [], []
    # subreader is a generator, so it must be fully consumed while the
    # file is still open; the ``with`` block then closes the file.
    with open(filename) as handle:
        for (tstart, tend), text in srt.subreader(handle):
            tstart_ary.append(tstart)
            tend_ary.append(tend)
            text_ary.append(text)
    return pd.DataFrame({'tstart': tstart_ary, 'tend': tend_ary, 'text': text_ary})
def tosrt(df, filename, offset=1):
    """Write the subtitle rows of a DataFrame to an srt file.

    df: DataFrame with ``tstart``, ``tend`` and ``text`` columns.
    filename: path of the srt file to create (overwritten if it exists).
    offset: passed on to ``srt.subwriter`` as its ``offset`` argument.
    """
    # The ``with`` block guarantees the output is flushed and closed,
    # instead of relying on the file object being garbage collected.
    with open(filename, 'w') as handle:
        subout = srt.subwriter(handle, offset=offset)
        for _, row in df.iterrows():
            subout.write_record(((row.tstart, row.tend), row.text))
def realign_srt(subject, reference, align_pairs):
    """Realign the subject subtitles to the reference on the given alignment pairs.

    subject: DataFrame with ``tstart`` and ``tend`` columns whose times are
        to be corrected.
    reference: DataFrame with a ``tstart`` column holding the correct times.
    align_pairs: sequence of ``(subject_no, reference_no)`` pairs. Each
        number is decremented by one and used as a row label, so with the
        default index the numbers are 1-based row numbers.

    Between each two consecutive pairs the subject times are mapped linearly
    so that the start times of the paired rows coincide. The mapping of the
    last segment is extended to the end of the subject. Rows before the
    first pair are left unchanged, and with fewer than two pairs nothing is
    changed at all.

    Returns a realigned copy; ``subject`` itself is not modified.
    Raises ValueError if two consecutive pairs have the same subject start
    time, because no time scale can be derived from them.
    """
    df_realigned = subject.copy()
    n = len(align_pairs)
    for i in range(n - 1):
        ts1idx, ts2idx = [v - 1 for v in align_pairs[i]]
        te1idx, te2idx = [v - 1 for v in align_pairs[i + 1]]
        # Label-based lookup; ``.ix`` did the same on an integer index but
        # no longer exists in current pandas.
        tss1, tss2 = subject.tstart.loc[ts1idx], reference.tstart.loc[ts2idx]
        tse1, tse2 = subject.tstart.loc[te1idx], reference.tstart.loc[te2idx]
        if tse1 == tss1:
            raise ValueError(
                'alignment pairs %r and %r have the same subject start time'
                % (align_pairs[i], align_pairs[i + 1]))
        slope = 1.0 * (tse2 - tss2) / (tse1 - tss1)
        print('print tse1,tse2, slope, tse1*=',
              srt.ms2time(tse1),
              srt.ms2time(tse2),
              srt.ms2time((tse1 - tss1) * slope + tss2),
              slope)

        # Extrapolate to end of the file for the last point
        if i == n - 2:
            te1idx = len(df_realigned) - 1

        # Interpolate the times. ``.loc`` slices include both end labels;
        # for every segment but the last, the rows from te1idx onward are
        # rewritten by the next segment.
        for clm in ['tstart', 'tend']:
            df_realigned.loc[ts1idx:te1idx + 1, clm] = (
                (subject.loc[ts1idx:te1idx + 1, clm] - tss1) * slope + tss2).astype(int)
    return df_realigned
def realign_srt_files(subject_filename, reference_filename, align_pairs,
                      new_filename,
                      offset=1):
    """Realign one srt file against another and save the result.

    subject_filename: srt file whose timing is to be corrected.
    reference_filename: srt file with the correct timing.
    align_pairs: alignment pairs, handed unchanged to realign_srt.
    new_filename: path the realigned subtitles are written to.
    offset: handed unchanged to tosrt.
    """
    # Same evaluation order as a nested call: read subject, read
    # reference, realign, write.
    subject_df = readsrt(subject_filename)
    reference_df = readsrt(reference_filename)
    realigned_df = realign_srt(subject_df, reference_df, align_pairs)
    tosrt(realigned_df, new_filename, offset=offset)
if __name__ == '__main__':
    # Both subject files ('bad.he.srt' style inputs) are realigned against
    # the same reference ('good.en.srt' style input).
    reference_file = '/tmp/en-fixed.srt'

    # (subject file, alignment pairs, output file, offset)
    jobs = [
        ('/tmp/heb1.srt', [(3, 4), (456, 441)], '/tmp/fixed1.srt', 1),
        ('/tmp/heb2.srt', [(1, 442), (380, 821)], '/tmp/fixed2.srt', 457),
    ]
    for subject_file, pairs, output_file, first_number in jobs:
        realign_srt_files(
            subject_file,
            reference_file,
            align_pairs=pairs,
            new_filename=output_file,
            offset=first_number)
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Copyleft 2011 wistful <wst public mail at gmail com>
#
# This is a free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# Slightly modified to be more Pythonic by Dov Grobgeld
__author__ = 'wistful'
import re
class SrtFormatError(Exception):
    """Error carrying a message about malformed srt content."""

    def __init__(self, message):
        # Keep the text on the instance so __str__ can render it.
        self.message = message

    def __str__(self):
        # The repr form of the message, i.e. quoted when it is a string.
        return "%r" % (self.message,)
def parse_time(str_time):
    """
    convert string format of start-finish to integer(ms) format

    str_time: timing line of the form "hh:mm:ss,msc --> hh:mm:ss,msc";
        leading and trailing whitespace is ignored.
    Returns a (start_ms, finish_ms) tuple of integers.
    Raises SrtFormatError if str_time does not have that form.

    >>> parse_time("00:14:33,460 --> 00:14:35,419")
    (873460, 875419)
    """
    pattern_time = r"(?P<h1>\d+):(?P<m1>\d+):(?P<s1>\d+),(?P<ms1>\d+)\W*-->\W*(?P<h2>\d+):(?P<m2>\d+):(?P<s2>\d+),(?P<ms2>\d+)$"
    try:
        d = re.match(pattern_time, str_time.strip()).groupdict()
    except (AttributeError, TypeError):
        # AttributeError: re.match found nothing (returned None) or
        # str_time has no strip(); TypeError: str_time is not text.
        # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        message = u"Invalid string format '%s' , expect hh:mm:ss,msc --> hh:mm:ss,msc" % str_time
        raise SrtFormatError(message)

    def get_ms(h, m, s, ms):
        # Fold the hour/minute/second/millisecond fields into milliseconds.
        return (int(s) + int(m) * 60 + int(h) * 60 * 60) * 1000 + int(ms)

    return get_ms(d['h1'], d['m1'], d['s1'], d['ms1']), get_ms(d['h2'], d['m2'], d['s2'], d['ms2'])
def ms2time(ms):
    """
    convert msc to string format

    ms: time in milliseconds; a fractional part is truncated.
    Returns the time as "hh:mm:ss,msc". Hours are not wrapped, so times of
    60 hours and more keep their full hour count. A negative time is
    rendered as the absolute value with a leading "-".

    >>> ms2time(233243)
    '00:03:53,243'
    >>> ms2time(442)
    '00:00:00,442'
    >>> ms2time(216000000)
    '60:00:00,000'
    """
    total_ms = int(ms)
    sign = '-' if total_ms < 0 else ''
    # divmod keeps everything in integers on both Python 2 and Python 3.
    seconds, millis = divmod(abs(total_ms), 1000)
    minutes, ss = divmod(seconds, 60)
    hh, mm = divmod(minutes, 60)
    return "%s%02d:%02d:%02d,%03d" % (sign, hh, mm, ss, millis)
def parse_ms(start, finish):
    """
    Format a start/finish pair of millisecond times as an srt timing line.

    >>> parse_ms(442, 233243)
    '00:00:00,442 --> 00:03:53,243'
    """
    start_text = ms2time(start)
    finish_text = ms2time(finish)
    return "%s --> %s" % (start_text, finish_text)
def subreader(handle):
    """
    Iterate over the records of an srt file.

    handle: iterable of lines, e.g. an open srt file.

    Yields ((time_start, time_finish), subtitle_text) for each record, where
    the times come from parse_time and subtitle_text is the record's
    non-blank lines, stripped and joined with newlines, plus a final newline.
    Raises SrtFormatError (via parse_time) for a malformed timing line.

    NOTE: any line made up only of digits is taken as a record number, so a
    subtitle text line such as "2012" ends the current record early.
    """
    index_line = re.compile(r"^\d+$")
    times, text = None, []
    for line in handle:
        # Get rid of bom markers! First the raw UTF-8 byte sequence ...
        line = line.replace('\xef\xbb\xbf', '')
        # ... then the decoded form U+FEFF, which is what a text-mode
        # reader delivers; skipped for byte strings to avoid mixing types.
        if not isinstance(line, bytes):
            line = line.replace(u'\ufeff', u'')
        line = line.strip()

        if index_line.match(line):
            # A new record number closes the record collected so far.
            if times:
                yield (times, '\n'.join(text) + '\n')
                times, text = None, []
        elif '-->' in line:
            times = parse_time(line)
        elif line:
            text.append(line)
    # The last record has no following record number to flush it.
    if times:
        yield (times, '\n'.join(text) + '\n')
class subwriter:
    """Writes srt records to a handle, numbering them consecutively."""

    def __init__(self, handle, offset=1):
        # offset is the number written for the first record.
        self.index = offset
        self.handle = handle

    def write_record(self, record):
        """Write one ((start, finish), text) record and advance the number."""
        times, text = record
        start, finish = times
        number = str(self.index)
        self.handle.write("%s\n%s\n%s\n" % (number, parse_ms(start, finish), text))
        self.index += 1
if __name__ == '__main__':
    # Run the doctests embedded in this module's docstrings and show the
    # result summary. The call form of print is valid on both Python 2 and
    # Python 3; the statement form is a syntax error on Python 3.
    import doctest
    print(doctest.testmod())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment