Last active
November 20, 2016 16:35
-
-
Save taylor224/4c80ad3d047af48aa4a0cc64baee22aa to your computer and use it in GitHub Desktop.
Automatic SMI to SRT convert script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Recursively find SMI subtitle files and convert each one to SRT.
# Script by Taylor Starfield
#
# Set FindRoot to the top directory to scan. For every non-hidden "*.smi"
# file below it that has no sibling "*.srt" yet, convert.py is invoked.
FindRoot=/top/directory/to/convert

# "[!.]" is the POSIX spelling of a negated bracket expression; it skips
# dot-files. "IFS= read -r" keeps leading/trailing blanks and backslashes in
# file names intact, and quoting "$FindRoot" allows spaces in the root path.
find "$FindRoot" -type f -name '[!.]*.smi' | while IFS= read -r line; do
    srt="${line%.smi}.srt"

    # Already converted on a previous run.
    if [ -f "$srt" ]; then
        continue
    fi

    python convert.py "$line"

    # Only touch permissions when the converter actually produced the file.
    # NOTE(review): +x on a subtitle file is unusual - confirm this is intended.
    if [ -f "$srt" ]; then
        chmod +x "$srt"
    fi
done
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: UTF-8 -*- | |
''' | |
@package smi2srt | |
@brief This module converts a .smi subtitle file into a .srt subtitle file
(Request by Alfred Chae) | |
Started : 2011/08/08 | |
license: GPL | |
@version: 1.0.0 | |
@author: Moonchang Chae <[email protected]> | |
SMI has this format:
=================================================================================================== | |
SRT has this format:
=================================================================================================== | |
1 | |
00:00:12,000 --> 00:00:15,123 | |
This is the first subtitle | |
2 | |
00:00:16,000 --> 00:00:18,000 | |
Another subtitle demonstrating tags: | |
<b>bold</b>, <i>italic</i>, <u>underlined</u> | |
<font color="#ff0000">red text</font> | |
3 | |
00:00:20,000 --> 00:00:22,000 X1:40 X2:600 Y1:20 Y2:50 | |
Another subtitle demonstrating position. | |
''' | |
__author__ = "MoonChang Chae <[email protected]>" | |
__date__ = "2011/08/08" | |
__version__ = "1.0.0" | |
__version_info__ = (1, 0, 0) | |
__license__ = "GCQVista's NDA" | |
################################################################################################### | |
import os | |
import sys | |
import re | |
import chardet #@UnresolvedImport | |
################################################################################################### | |
def usage(msg=None, exit_code=1):
    """Print the command-line help text and terminate the process.

    :param msg: optional extra message appended after the help text.
    :param exit_code: status passed to ``sys.exit`` (defaults to 1).
    :raises SystemExit: always.
    """
    print_msg = """
usage %s smifile.smi [...]
convert smi into srt subtitle file with same filename.
By MoonChang Chae <[email protected]>
""" % os.path.basename(sys.argv[0])
    if msg:
        print_msg += '%s\n' % msg
    # Single-argument call form behaves the same on Python 2 and Python 3.
    print(print_msg)
    sys.exit(exit_code)
################################################################################################### | |
class smiItem(object):
    """One subtitle cue taken from an SMI file and convertible to SRT form.

    The caller fills in ``start_ms``, ``end_ms``, ``contents`` and
    ``linecount``; :meth:`convertSrt` then derives ``start_ts``/``end_ts``
    and rewrites ``contents`` in place.
    """

    # Inline tags copied to the output verbatim; every other tag is removed.
    _KEPT_TAGS = ('b', 'i', 'u')

    def __init__(self):
        self.start_ms = 0                # cue start, milliseconds
        self.start_ts = '00:00:00,000'   # cue start, SRT timestamp text
        self.end_ms = 0                  # cue end, milliseconds
        self.end_ts = '00:00:00,000'     # cue end, SRT timestamp text
        self.contents = None             # raw SMI markup, later SRT text
        self.linecount = 0               # source line number (debug aid)

    @staticmethod
    def ms2ts(ms):
        """Format a millisecond count as an SRT timestamp ``HH:MM:SS,mmm``.

        Negative inputs are clamped to zero so the result is always a
        well-formed timestamp.
        """
        ms = max(int(ms), 0)
        # divmod keeps everything in integers on both Python 2 and 3.
        seconds, millis = divmod(ms, 1000)
        minutes, seconds = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        return '%02d:%02d:%02d,%03d' % (hours, minutes, seconds, millis)

    @staticmethod
    def _filter_tag(match):
        """Return the matched tag unchanged if it is kept, else ``''``."""
        if match.group(1).lower() in smiItem._KEPT_TAGS:
            return match.group(0)
        return ''

    def convertSrt(self):
        """Fill in the SRT timestamps and turn ``contents`` into SRT text.

        ``contents`` left as ``None`` is tolerated: only the timestamps are
        computed in that case.
        """
        # 1) convert timestamps; the cue ends 10 ms early (presumably so it
        #    does not touch the following cue) but never before its start.
        self.start_ts = smiItem.ms2ts(self.start_ms)
        self.end_ts = smiItem.ms2ts(max(self.end_ms - 10, self.start_ms))

        if self.contents is None:
            return

        # 2) collapse all whitespace (including new-lines) to single spaces
        text = re.sub(r'\s+', ' ', self.contents)
        # 3) drop SGML comments so they cannot leak into the subtitle text
        text = re.sub(r'<!--.*?-->', '', text)
        # 4) remove character entities such as "&nbsp;"
        text = re.sub(r'&[a-z]{2,5};', '', text)
        # 5) replace runs of <br>, <br/> or <br /> with a single '\n'
        text = re.sub(r'(<br\s*/?>)+', '\n', text, flags=re.IGNORECASE)
        # 6) strip every tag except the kept ones. Working tag-by-tag (rather
        #    than consuming the string from the front) means text that
        #    follows a stray '<' or '>' is preserved instead of being lost.
        text = re.sub(r'</?([a-z]+)[^>]*>', smiItem._filter_tag, text,
                      flags=re.IGNORECASE)

        self.contents = text.strip()

    def __repr__(self):
        return '%d:%d:<%s>:%d' % (self.start_ms, self.end_ms,
                                  self.contents, self.linecount)
################################################################################################### | |
def convertSMI(smi_file): | |
if not os.path.exists(smi_file): | |
sys.stderr.write('Cannot find smi file <%s>\n' % smi_file) | |
return False | |
rndx = smi_file.rfind('.') | |
srt_file = '%s.srt' % smi_file[0:rndx] | |
ifp = open(smi_file) | |
smi_sgml = ifp.read()#.upper() | |
ifp.close() | |
chdt = chardet.detect(smi_sgml) | |
if not chdt.get('encoding'): | |
pass | |
elif chdt['encoding'] != 'UTF-8': | |
smi_sgml = unicode(smi_sgml, chdt['encoding'].lower()).encode('utf-8') | |
# skip to first starting tag (skip first 0xff 0xfe ...) | |
try: | |
fndx = smi_sgml.lower().find('<sync') | |
except Exception, e: | |
print chdt | |
raise e | |
if fndx < 0: | |
return False | |
smi_sgml = smi_sgml[fndx:] | |
lines = smi_sgml.split('\n') | |
srt_list = [] | |
sync_cont = '' | |
si = None | |
last_si = None | |
linecnt = 0 | |
for line in lines: | |
linecnt += 1 | |
sndx = line.upper().find('<SYNC') | |
if sndx >= 0: | |
m = re.search(r'<sync\s+start\s*=\s*(\d+)>(.*)$', line, flags=re.IGNORECASE) | |
if not m: | |
raise Exception('Invalid format tag of <Sync start=nnnn> with "%s"' % line) | |
sync_cont += line[0:sndx] | |
last_si = si | |
if last_si != None: | |
last_si.end_ms = long(m.group(1)) | |
last_si.contents = sync_cont | |
srt_list.append(last_si) | |
last_si.linecount = linecnt | |
#print '[%06d] %s' % (linecnt, last_si) | |
sync_cont = m.group(2) | |
si = smiItem() | |
si.start_ms = long(m.group(1)) | |
else: | |
sync_cont += line | |
ofp = open(srt_file, 'w') | |
ndx = 1 | |
for si in srt_list: | |
si.convertSrt() | |
if si.contents == None or len(si.contents) <= 0: | |
continue | |
#print si | |
sistr = '%d\n%s --> %s\n%s\n\n' % (ndx, si.start_ts, si.end_ts, si.contents) | |
#sistr = unicode(sistr, 'utf-8').encode('euc-kr') | |
ofp.write(sistr) | |
ndx += 1 | |
ofp.close() | |
return True | |
################################################################################################### | |
def doConvert():
    """Convert every SMI file named on the command line.

    Shows the usage text (and exits) when no file argument is given. Each
    file is converted independently: an error raised while converting one
    file is reported on stderr and counted as a failure, and the remaining
    files are still processed.
    """
    if len(sys.argv) <= 1:
        usage()
    for smi_file in sys.argv[1:]:
        try:
            converted = convertSMI(smi_file)
        except Exception as exc:
            # Top-level boundary: report and carry on with the next file.
            sys.stderr.write('Error while converting <%s>: %s\n'
                             % (smi_file, exc))
            converted = False
        # Single-argument print calls behave the same on Python 2 and 3.
        if converted:
            print("Converting <%s> OK!" % smi_file)
        else:
            print("Converting <%s> Failture!" % smi_file)


###################################################################################################
if __name__ == '__main__':
    doConvert()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Automatic SMI-to-SRT conversion script
Just set "FindRoot" to the top-level directory you want to convert and register auto_convert.sh in crontab.
The script recursively converts every SMI file under the "FindRoot" directory to SRT, writing each SRT file into the same directory as its original SMI file.
The original SMI file is kept after conversion.
Special thanks to Moonchang Chae. I changed some of his code to fix a bug.