baxang · April 3, 2012 17:42
diff --git a/smi2srt.py b/smi2srt.py
 #!/usr/bin/env python
 # -*- coding: UTF-8 -*-
 '''
 @package smi2srt
 @brief This module converts a .smi subtitle file into a .srt subtitle file
 	(Requested by Alfred Chae)

 Started : 2011/08/08
 license: GPL

 @version: 1.0.0
 @author: Moonchang Chae <[email protected]>


 SMI has this format:
 ===================================================================================================

 SRT has this format:
 ===================================================================================================
 1
 00:00:12,000 --> 00:00:15,123
 This is the first subtitle

 2
 00:00:16,000 --> 00:00:18,000
 Another subtitle demonstrating tags:
 <b>bold</b>, <i>italic</i>, <u>underlined</u>
 <font color="#ff0000">red text</font>

 3
 00:00:20,000 --> 00:00:22,000  X1:40 X2:600 Y1:20 Y2:50
 Another subtitle demonstrating position.
 '''
 # Module metadata.
 # NOTE(review): the module docstring above says "license: GPL" while
 # __license__ below names an NDA -- confirm which one actually applies.
 __author__ = "MoonChang Chae <[email protected]>"
 __date__ = "2011/08/08"
 __version__ = "1.0.0"
 __version_info__ = (1, 0, 0)
 __license__ = "GCQVista's NDA"

 ###################################################################################################
 import os
 import sys
 import re
 import chardet #@UnresolvedImport

 ###################################################################################################
 def usage(msg=None, exit_code=1):
 	'''
 	Print the command-line usage text to stdout and terminate the process.
 	@param msg: optional extra message appended after the usage text
 	@param exit_code: status handed to sys.exit() (default 1)
 	@raise SystemExit: always, through sys.exit(exit_code)
 	'''
 	print_msg = """
 usage %s smifile.smi [...]
 	convert smi into srt subtitle file with same filename.
 	By MoonChang Chae <[email protected]>
 """ % os.path.basename(sys.argv[0])
 	if msg:
 		print_msg += '%s\n' % msg
 	# sys.stdout.write() is valid on both Python 2 and Python 3; the extra
 	# '\n' reproduces the newline the former print statement appended.
 	sys.stdout.write(print_msg + '\n')
 	sys.exit(exit_code)

 ###################################################################################################
 class smiItem(object):
 	'''
 	One subtitle cue taken from an SMI file.
 	start_ms / end_ms are the cue boundaries in milliseconds, start_ts /
 	end_ts their SRT timestamps ("HH:MM:SS,mmm", filled in by convertSrt),
 	contents the cue text and linecount the line number recorded by the parser.
 	'''
 	# <br>, <BR>, <br/> and <br /> are all treated as a line break.
 	_BR_RE = re.compile(r'(<br\s*/?>)+', re.IGNORECASE)
 	# A tag followed by the plain text that runs up to the next '<' or '>'.
 	_TAG_RE = re.compile(r'</?([a-z]+)[^>]*>([^<>]*)', re.IGNORECASE)
 	# Tags copied to the SRT output; every other tag is dropped.
 	_KEEP_TAGS = ('b', 'i', 'u')

 	def __init__(self):
 		self.start_ms = 0
 		self.start_ts = '00:00:00,000'
 		self.end_ms = 0
 		self.end_ts = '00:00:00,000'
 		self.contents = None
 		self.linecount = 0

 	@staticmethod
 	def ms2ts(ms):
 		'''
 		Format a millisecond count as an SRT timestamp "HH:MM:SS,mmm".
 		Negative values are clamped to zero.
 		'''
 		ms = max(0, int(ms))
 		seconds, ms = divmod(ms, 1000)
 		minutes, seconds = divmod(seconds, 60)
 		hours, minutes = divmod(minutes, 60)
 		return '%02d:%02d:%02d,%03d' % (hours, minutes, seconds, ms)

 	def convertSrt(self):
 		'''
 		Fill start_ts / end_ts and rewrite contents from SMI markup to SRT text.
 		When contents is None only the timestamps are updated.
 		'''
 		# 1) convert timestamp; the cue ends 10 ms before end_ms
 		#    (presumably so adjacent cues never share a boundary -- confirm)
 		self.start_ts = smiItem.ms2ts(self.start_ms)
 		self.end_ts = smiItem.ms2ts(self.end_ms - 10)
 		if self.contents is None:
 			return
 		# 2) collapse every whitespace run (including new-lines) to one space
 		text = re.sub(r'\s+', ' ', self.contents)
 		# 3) remove web strings like "&nbsp;"
 		text = re.sub(r'&[a-z]{2,5};', '', text)
 		# 4) replace each run of <br> tags with '\n'
 		text = smiItem._BR_RE.sub('\n', text)
 		# 5) walk the tags: keep b/i/u, drop all others, keep the text between
 		fndx = text.find('<')
 		if fndx >= 0:
 			kept = [text[:fndx]]
 			rest = text[fndx:]
 			while True:
 				m = smiItem._TAG_RE.match(rest)
 				if m is None:
 					# NOTE: whatever follows markup that does not parse as
 					# a tag is discarded here.
 					break
 				if m.group(1).lower() in smiItem._KEEP_TAGS:
 					kept.append(rest[:m.start(2)])
 				kept.append(m.group(2))
 				rest = rest[m.end(2):]
 			text = ''.join(kept)
 		self.contents = text.strip()

 	def __repr__(self):
 		return '%d:%d:<%s>:%d' % (self.start_ms, self.end_ms, self.contents, self.linecount)

 ###################################################################################################
 def convertSMI(smi_file):
 	'''
 	Convert one SMI subtitle file into an SRT file written next to it.
 	The output path is the input path with its extension replaced by ".srt".
 	The input is read as bytes and decoded with the encoding guessed by
 	chardet (UTF-8 when chardet has no guess); the SRT is written as UTF-8.
 	@param smi_file: path of the .smi file
 	@return: True when an SRT file was written, False when the input file
 		does not exist or contains no <SYNC> tag
 	@raise ValueError: when a line contains "<SYNC" that does not match
 		<Sync start=nnnn>
 	NOTE: the cue opened by the last <SYNC> tag is never written, because a
 	cue's end time is taken from the tag that follows it.
 	'''
 	import io  # local import: io.open gives the same text API on Python 2 and 3

 	if not os.path.exists(smi_file):
 		sys.stderr.write('Cannot find smi file <%s>\n' % smi_file)
 		return False
 	# splitext leaves names without an extension intact
 	srt_file = os.path.splitext(smi_file)[0] + '.srt'

 	with open(smi_file, 'rb') as ifp:
 		raw = ifp.read()
 	chdt = chardet.detect(raw)
 	# chardet reports None when it cannot guess an encoding
 	encoding = chdt.get('encoding') or 'utf-8'
 	try:
 		smi_sgml = raw.decode(encoding, 'replace')
 	except LookupError:
 		# chardet named a codec this Python does not know
 		smi_sgml = raw.decode('utf-8', 'replace')

 	# skip to the first sync tag (drops the header and any BOM residue);
 	# case-insensitive, like the per-line parsing below
 	sync_tag = re.compile(r'<sync', re.IGNORECASE)
 	first = sync_tag.search(smi_sgml)
 	if first is None:
 		return False
 	lines = smi_sgml[first.start():].split('\n')

 	sync_re = re.compile(r'<sync\s+start\s*=\s*(\d+)>(.*)$', re.IGNORECASE)
 	srt_list = []
 	parts = []	# text pieces of the cue currently being collected
 	si = None
 	for linecnt, line in enumerate(lines, 1):
 		hit = sync_tag.search(line)
 		if hit is None:
 			parts.append(line)
 			continue
 		m = sync_re.search(line)
 		if not m:
 			raise ValueError('Invalid format tag of <Sync start=nnnn> with "%s"' % line)
 		# text in front of the tag still belongs to the previous cue
 		parts.append(line[:hit.start()])
 		if si is not None:
 			si.end_ms = int(m.group(1))
 			# join with '\n' so words on consecutive lines are not fused;
 			# convertSrt() later collapses the whitespace to a single space
 			si.contents = '\n'.join(parts)
 			si.linecount = linecnt
 			srt_list.append(si)
 		parts = [m.group(2)]
 		si = smiItem()
 		si.start_ms = int(m.group(1))

 	with io.open(srt_file, 'w', encoding='utf-8') as ofp:
 		ndx = 1
 		for item in srt_list:
 			item.convertSrt()
 			if not item.contents:
 				# empty cues (e.g. "&nbsp;" clear markers) are skipped
 				continue
 			ofp.write('%d\n%s --> %s\n%s\n\n' % (ndx, item.start_ts, item.end_ts, item.contents))
 			ndx += 1
 	return True

 ###################################################################################################
 def doConvert():
 	'''
 	Command-line entry point: convert every file named in sys.argv[1:].
 	Calls usage() (which exits) when no file is given. One status line per
 	file is printed to stdout; an exception raised for one file is reported
 	on stderr and counted as a failure so the remaining files still run.
 	'''
 	if len(sys.argv) <= 1:
 		usage()
 	for smi_file in sys.argv[1:]:
 		try:
 			ok = convertSMI(smi_file)
 		except Exception as e:
 			sys.stderr.write('Error while converting <%s>: %s\n' % (smi_file, e))
 			ok = False
 		# print(...) with one argument behaves the same on Python 2 and 3
 		if ok:
 			print("Converting <%s> OK!" % smi_file)
 		else:
 			print("Converting <%s> Failure!" % smi_file)
 			
 	
 ###################################################################################################
 # Run the converter only when this file is executed as a script.
 if __name__ == '__main__':
 	doConvert()
	#!/usr/bin/env python
	# -- coding: UTF-8 --
	'''
	@package smi2srt
	@brief This module converts a .smi subtitle file into a .srt subtitle file
	(Requested by Alfred Chae)

	Started : 2011/08/08
	license: GPL

	@version: 1.0.0
	@author: Moonchang Chae <[email protected]>


	SMI has this format:
	===================================================================================================

	SRT has this format:
	===================================================================================================
	1
	00:00:12,000 --> 00:00:15,123
	This is the first subtitle

	2
	00:00:16,000 --> 00:00:18,000
	Another subtitle demonstrating tags:
	<b>bold</b>, <i>italic</i>, <u>underlined</u>
	<font color="#ff0000">red text</font>

	3
	00:00:20,000 --> 00:00:22,000 X1:40 X2:600 Y1:20 Y2:50
	Another subtitle demonstrating position.
	'''
	# Module metadata.
	# NOTE(review): the module docstring above says "license: GPL" while
	# __license__ below names an NDA -- confirm which one actually applies.
	__author__ = "MoonChang Chae <[email protected]>"
	__date__ = "2011/08/08"
	__version__ = "1.0.0"
	__version_info__ = (1, 0, 0)
	__license__ = "GCQVista's NDA"

	###################################################################################################
	import os
	import sys
	import re
	import chardet #@UnresolvedImport

	###################################################################################################
	def usage(msg=None, exit_code=1):
		'''
		Print the command-line usage text to stdout and terminate the process.
		@param msg: optional extra message appended after the usage text
		@param exit_code: status handed to sys.exit() (default 1)
		@raise SystemExit: always, through sys.exit(exit_code)
		'''
		print_msg = """
	usage %s smifile.smi [...]
		convert smi into srt subtitle file with same filename.
		By MoonChang Chae <[email protected]>
	""" % os.path.basename(sys.argv[0])
		if msg:
			print_msg += '%s\n' % msg
		# sys.stdout.write() is valid on both Python 2 and Python 3; the extra
		# '\n' reproduces the newline the former print statement appended.
		sys.stdout.write(print_msg + '\n')
		sys.exit(exit_code)

	###################################################################################################
	class smiItem(object):
		'''
		One subtitle cue taken from an SMI file.
		start_ms / end_ms are the cue boundaries in milliseconds, start_ts /
		end_ts their SRT timestamps ("HH:MM:SS,mmm", filled in by convertSrt),
		contents the cue text and linecount the line number recorded by the parser.
		'''
		# <br>, <BR>, <br/> and <br /> are all treated as a line break.
		_BR_RE = re.compile(r'(<br\s*/?>)+', re.IGNORECASE)
		# A tag followed by the plain text that runs up to the next '<' or '>'.
		_TAG_RE = re.compile(r'</?([a-z]+)[^>]*>([^<>]*)', re.IGNORECASE)
		# Tags copied to the SRT output; every other tag is dropped.
		_KEEP_TAGS = ('b', 'i', 'u')

		def __init__(self):
			self.start_ms = 0
			self.start_ts = '00:00:00,000'
			self.end_ms = 0
			self.end_ts = '00:00:00,000'
			self.contents = None
			self.linecount = 0

		@staticmethod
		def ms2ts(ms):
			'''
			Format a millisecond count as an SRT timestamp "HH:MM:SS,mmm".
			Negative values are clamped to zero.
			'''
			ms = max(0, int(ms))
			seconds, ms = divmod(ms, 1000)
			minutes, seconds = divmod(seconds, 60)
			hours, minutes = divmod(minutes, 60)
			return '%02d:%02d:%02d,%03d' % (hours, minutes, seconds, ms)

		def convertSrt(self):
			'''
			Fill start_ts / end_ts and rewrite contents from SMI markup to SRT text.
			When contents is None only the timestamps are updated.
			'''
			# 1) convert timestamp; the cue ends 10 ms before end_ms
			#    (presumably so adjacent cues never share a boundary -- confirm)
			self.start_ts = smiItem.ms2ts(self.start_ms)
			self.end_ts = smiItem.ms2ts(self.end_ms - 10)
			if self.contents is None:
				return
			# 2) collapse every whitespace run (including new-lines) to one space
			text = re.sub(r'\s+', ' ', self.contents)
			# 3) remove web strings like "&nbsp;"
			text = re.sub(r'&[a-z]{2,5};', '', text)
			# 4) replace each run of <br> tags with '\n'
			text = smiItem._BR_RE.sub('\n', text)
			# 5) walk the tags: keep b/i/u, drop all others, keep the text between
			fndx = text.find('<')
			if fndx >= 0:
				kept = [text[:fndx]]
				rest = text[fndx:]
				while True:
					m = smiItem._TAG_RE.match(rest)
					if m is None:
						# NOTE: whatever follows markup that does not parse as
						# a tag is discarded here.
						break
					if m.group(1).lower() in smiItem._KEEP_TAGS:
						kept.append(rest[:m.start(2)])
					kept.append(m.group(2))
					rest = rest[m.end(2):]
				text = ''.join(kept)
			self.contents = text.strip()

		def __repr__(self):
			return '%d:%d:<%s>:%d' % (self.start_ms, self.end_ms, self.contents, self.linecount)

	###################################################################################################
	def convertSMI(smi_file):
		'''
		Convert one SMI subtitle file into an SRT file written next to it.
		The output path is the input path with its extension replaced by ".srt".
		The input is read as bytes and decoded with the encoding guessed by
		chardet (UTF-8 when chardet has no guess); the SRT is written as UTF-8.
		@param smi_file: path of the .smi file
		@return: True when an SRT file was written, False when the input file
			does not exist or contains no <SYNC> tag
		@raise ValueError: when a line contains "<SYNC" that does not match
			<Sync start=nnnn>
		NOTE: the cue opened by the last <SYNC> tag is never written, because a
		cue's end time is taken from the tag that follows it.
		'''
		import io  # local import: io.open gives the same text API on Python 2 and 3

		if not os.path.exists(smi_file):
			sys.stderr.write('Cannot find smi file <%s>\n' % smi_file)
			return False
		# splitext leaves names without an extension intact
		srt_file = os.path.splitext(smi_file)[0] + '.srt'

		with open(smi_file, 'rb') as ifp:
			raw = ifp.read()
		chdt = chardet.detect(raw)
		# chardet reports None when it cannot guess an encoding
		encoding = chdt.get('encoding') or 'utf-8'
		try:
			smi_sgml = raw.decode(encoding, 'replace')
		except LookupError:
			# chardet named a codec this Python does not know
			smi_sgml = raw.decode('utf-8', 'replace')

		# skip to the first sync tag (drops the header and any BOM residue);
		# case-insensitive, like the per-line parsing below
		sync_tag = re.compile(r'<sync', re.IGNORECASE)
		first = sync_tag.search(smi_sgml)
		if first is None:
			return False
		lines = smi_sgml[first.start():].split('\n')

		sync_re = re.compile(r'<sync\s+start\s*=\s*(\d+)>(.*)$', re.IGNORECASE)
		srt_list = []
		parts = []	# text pieces of the cue currently being collected
		si = None
		for linecnt, line in enumerate(lines, 1):
			hit = sync_tag.search(line)
			if hit is None:
				parts.append(line)
				continue
			m = sync_re.search(line)
			if not m:
				raise ValueError('Invalid format tag of <Sync start=nnnn> with "%s"' % line)
			# text in front of the tag still belongs to the previous cue
			parts.append(line[:hit.start()])
			if si is not None:
				si.end_ms = int(m.group(1))
				# join with '\n' so words on consecutive lines are not fused;
				# convertSrt() later collapses the whitespace to a single space
				si.contents = '\n'.join(parts)
				si.linecount = linecnt
				srt_list.append(si)
			parts = [m.group(2)]
			si = smiItem()
			si.start_ms = int(m.group(1))

		with io.open(srt_file, 'w', encoding='utf-8') as ofp:
			ndx = 1
			for item in srt_list:
				item.convertSrt()
				if not item.contents:
					# empty cues (e.g. "&nbsp;" clear markers) are skipped
					continue
				ofp.write('%d\n%s --> %s\n%s\n\n' % (ndx, item.start_ts, item.end_ts, item.contents))
				ndx += 1
		return True

	###################################################################################################
	def doConvert():
		'''
		Command-line entry point: convert every file named in sys.argv[1:].
		Calls usage() (which exits) when no file is given. One status line per
		file is printed to stdout; an exception raised for one file is reported
		on stderr and counted as a failure so the remaining files still run.
		'''
		if len(sys.argv) <= 1:
			usage()
		for smi_file in sys.argv[1:]:
			try:
				ok = convertSMI(smi_file)
			except Exception as e:
				sys.stderr.write('Error while converting <%s>: %s\n' % (smi_file, e))
				ok = False
			# print(...) with one argument behaves the same on Python 2 and 3
			if ok:
				print("Converting <%s> OK!" % smi_file)
			else:
				print("Converting <%s> Failure!" % smi_file)


	###################################################################################################
	# Run the converter only when this file is executed as a script.
	if __name__ == '__main__':
		doConvert()