Created
April 3, 2012 17:42
-
-
Save baxang/2294028 to your computer and use it in GitHub Desktop.
convert script in SMI to SRT format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: UTF-8 -*- | |
''' | |
@package smi2srt | |
@brief this module is for convert .smi subtitle file into .srt subtitle | |
(Request by Alfred Chae) | |
Started : 2011/08/08 | |
license: GPL | |
@version: 1.0.0 | |
@author: Moonchang Chae <[email protected]> | |
SMI have this format! | |
=================================================================================================== | |
SRT have this format! | |
=================================================================================================== | |
1 | |
00:00:12,000 --> 00:00:15,123 | |
This is the first subtitle | |
2 | |
00:00:16,000 --> 00:00:18,000 | |
Another subtitle demonstrating tags: | |
<b>bold</b>, <i>italic</i>, <u>underlined</u> | |
<font color="#ff0000">red text</font> | |
3 | |
00:00:20,000 --> 00:00:22,000 X1:40 X2:600 Y1:20 Y2:50 | |
Another subtitle demonstrating position. | |
''' | |
__author__ = "MoonChang Chae <[email protected]>" | |
__date__ = "2011/08/08" | |
__version__ = "1.0.0" | |
__version_info__ = (1, 0, 0) | |
__license__ = "GCQVista's NDA" | |
################################################################################################### | |
import os | |
import sys | |
import re | |
import chardet #@UnresolvedImport | |
################################################################################################### | |
def usage(msg=None, exit_code=1): | |
print_msg = """ | |
usage %s smifile.smi [...] | |
convert smi into srt subtitle file with same filename. | |
By MoonChang Chae <[email protected]> | |
""" % os.path.basename(sys.argv[0]) | |
if msg: | |
print_msg += '%s\n' % msg | |
print print_msg | |
sys.exit(exit_code) | |
################################################################################################### | |
class smiItem(object): | |
def __init__(self): | |
self.start_ms = 0L | |
self.start_ts = '00:00:00,000' | |
self.end_ms = 0L | |
self.end_ts = '00:00:00,000' | |
self.contents = None | |
self.linecount = 0 | |
@staticmethod | |
def ms2ts(ms): | |
hours = ms / 3600000L | |
ms -= hours * 3600000L | |
minutes = ms / 60000L | |
ms -= minutes * 60000L | |
seconds = ms / 1000L | |
ms -= seconds * 1000L | |
s = '%02d:%02d:%02d,%03d' % (hours, minutes, seconds, ms) | |
return s | |
def convertSrt(self): | |
if self.linecount == 4: | |
i=1 #@UnusedVariable | |
# 1) convert timestamp | |
self.start_ts = smiItem.ms2ts(self.start_ms) | |
self.end_ts = smiItem.ms2ts(self.end_ms-10) | |
# 2) remove new-line | |
self.contents = re.sub(r'\s+', ' ', self.contents) | |
# 3) remove web string like " "; | |
self.contents = re.sub(r'&[a-z]{2,5};', '', self.contents) | |
# 4) replace "<br>" with '\n'; | |
# self.contents = re.sub(r'(<br>)+', '\n', self.contents, flags=re.IGNORECASE) | |
self.contents = re.sub(r'(<br>)+', '\n', self.contents) | |
# 5) find all tags | |
fndx = self.contents.find('<') | |
if fndx >= 0: | |
contents = self.contents | |
sb = self.contents[0:fndx] | |
contents = contents[fndx:] | |
while True: | |
m = re.match(r'</?([a-z]+)[^>]*>([^<>]*)', contents, flags=re.IGNORECASE) | |
if m == None: break | |
contents = contents[m.end(2):] | |
#if m.group(1).lower() in ['font', 'b', 'i', 'u']: | |
if m.group(1).lower() in ['b', 'i', 'u']: | |
sb += m.string[0:m.start(2)] | |
sb += m.group(2) | |
self.contents = sb | |
self.contents = self.contents.strip() | |
self.contents = self.contents.strip('\n') | |
def __repr__(self): | |
s = '%d:%d:<%s>:%d' % (self.start_ms, self.end_ms, self.contents, self.linecount) | |
return s | |
################################################################################################### | |
def convertSMI(smi_file): | |
if not os.path.exists(smi_file): | |
sys.stderr.write('Cannot find smi file <%s>\n' % smi_file) | |
return False | |
rndx = smi_file.rfind('.') | |
srt_file = '%s.srt' % smi_file[0:rndx] | |
ifp = open(smi_file) | |
smi_sgml = ifp.read()#.upper() | |
ifp.close() | |
chdt = chardet.detect(smi_sgml) | |
if chdt['encoding'] != 'UTF-8': | |
smi_sgml = unicode(smi_sgml, chdt['encoding'].lower()).encode('utf-8') | |
# skip to first starting tag (skip first 0xff 0xfe ...) | |
try: | |
fndx = smi_sgml.find('<SYNC') | |
except Exception, e: | |
print chdt | |
raise e | |
if fndx < 0: | |
return False | |
smi_sgml = smi_sgml[fndx:] | |
lines = smi_sgml.split('\n') | |
srt_list = [] | |
sync_cont = '' | |
si = None | |
last_si = None | |
linecnt = 0 | |
for line in lines: | |
linecnt += 1 | |
sndx = line.upper().find('<SYNC') | |
if sndx >= 0: | |
m = re.search(r'<sync\s+start\s*=\s*(\d+)>(.*)$', line, flags=re.IGNORECASE) | |
if not m: | |
raise Exception('Invalid format tag of <Sync start=nnnn> with "%s"' % line) | |
sync_cont += line[0:sndx] | |
last_si = si | |
if last_si != None: | |
last_si.end_ms = long(m.group(1)) | |
last_si.contents = sync_cont | |
srt_list.append(last_si) | |
last_si.linecount = linecnt | |
#print '[%06d] %s' % (linecnt, last_si) | |
sync_cont = m.group(2) | |
si = smiItem() | |
si.start_ms = long(m.group(1)) | |
else: | |
sync_cont += line | |
ofp = open(srt_file, 'w') | |
ndx = 1 | |
for si in srt_list: | |
si.convertSrt() | |
if si.contents == None or len(si.contents) <= 0: | |
continue | |
#print si | |
sistr = '%d\n%s --> %s\n%s\n\n' % (ndx, si.start_ts, si.end_ts, si.contents) | |
#sistr = unicode(sistr, 'utf-8').encode('euc-kr') | |
ofp.write(sistr) | |
ndx += 1 | |
ofp.close() | |
return True | |
################################################################################################### | |
def doConvert(): | |
if len(sys.argv) <= 1: | |
usage() | |
for smi_file in sys.argv[1:]: | |
if convertSMI(smi_file): | |
print "Converting <%s> OK!" % smi_file | |
else: | |
print "Converting <%s> Failture!" % smi_file | |
################################################################################################### | |
if __name__ == '__main__': | |
doConvert() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment