Last active
January 24, 2023 03:25
-
-
Save trustin/e06fadcbca90985b1df4dafebd9f5d25 to your computer and use it in GitHub Desktop.
smi2ass.py - converts a SAMI (.smi) subtitle file to the SubStationAlpha (.ass) format.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
# -*- coding: UTF-8 -*- | |
# Note: A newer version of this script is located at https://github.com/trustin/smi2ass | |
# | |
# Copyright (C) 2018 Trustin Heuiseung Lee and other contributors | |
# | |
# This program is free software; you can redistribute it and/or | |
# modify it under the terms of the GNU General Public License | |
# as published by the Free Software Foundation; either version 2 | |
# of the License, or (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU General Public License for more details. | |
# | |
# You should have received a copy of the GNU General Public License | |
# along with this program; if not, write to the Free Software | |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | |
# | |
# Forked from: https://github.com/hojel/service.subtitles.gomtv/blob/3a7342961e140eaf8250659b0ac6158ce5e6bc5c/resources/lib | |
import codecs
import HTMLParser
import os
import re
import sys
from collections import defaultdict
from operator import itemgetter

from BeautifulSoup import BeautifulSoup
# Language suffix used for the output file when the subtitle has one track.
defaultLangCode = 'kor'
# Font family written into the "Default" ASS style.
defaultFontName = 'sans-serif'
# Maps the class name of a SAMI <P> tag to the language code used in the
# output file name.  Callers look keys up with ``.upper()``, so every key
# that is meant to match must be upper case.
langCode = {
    'KRCC': 'kor', 'KOCC': 'kor', 'KR': 'kor', 'KO': 'kor',
    'KOREANSC': 'kor', 'KRC': 'kor',
    'ENCC': 'eng', 'EGCC': 'eng', 'EN': 'eng', 'ENGLISHSC': 'eng',
    'ENUSCC': 'eng', 'ERCC': 'eng',
    # Original mixed-case spelling; unreachable through an upper-cased
    # lookup but kept so that direct lookups keep working.
    'EnglishSC': 'eng',
    'CNCC': 'chi', 'JPCC': 'jpn', 'UNKNOWNCC': 'und',
    'COMMENTARY': 'commentary',
}
# Fixed [Script Info] header placed at the very top of every generated file.
scriptInfo = (
    '[Script Info]\n'
    ';This is an Advanced Sub Station Alpha v4+ script.\n'
    ';Converted by smi2ass\n'
    'ScriptType: v4.00+\n'
    'Collisions: Normal\n'
    'PlayResX: 384\n'
    'PlayResY: 288\n'
    'Timer: 100.0000\n'
)

# [V4+ Styles] section; the single "Default" style uses defaultFontName.
styles = (
    '\n'
    '[V4+ Styles]\n'
    'Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, '
    'OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, '
    'ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, '
    'Alignment, MarginL, MarginR, MarginV, Encoding\n'
    'Style: Default,' + defaultFontName + ',22,&H00ffffff,&H0000ffff,'
    '&H00000000,&H80000000,0,0,0,0,100,100,0,0.00,1,1,1,2,20,20,20,1\n'
)

# [Events] section header; the Dialogue lines are appended after it.
events = (
    '\n'
    '[Events]\n'
    'Format: Layer, Start, End, Style, Actor, MarginL, MarginR, MarginV, '
    'Effect, Text\n'
)
# Colour-name lookup used when a <font color=...> value is not a hex triplet.
# These are the CSS3 extended colour keywords (values are '#rrggbb') plus the
# non-standard extra 'scarlet'.
css3_names_to_hex = {
    'aliceblue': '#f0f8ff',
    'antiquewhite': '#faebd7',
    'aqua': '#00ffff',
    'aquamarine': '#7fffd4',
    'azure': '#f0ffff',
    'beige': '#f5f5dc',
    'bisque': '#ffe4c4',
    'black': '#000000',
    'blanchedalmond': '#ffebcd',
    'blue': '#0000ff',
    'blueviolet': '#8a2be2',
    'brown': '#a52a2a',
    'burlywood': '#deb887',
    'cadetblue': '#5f9ea0',
    'chartreuse': '#7fff00',
    'chocolate': '#d2691e',
    'coral': '#ff7f50',
    'cornflowerblue': '#6495ed',
    'cornsilk': '#fff8dc',
    'crimson': '#dc143c',
    'cyan': '#00ffff',
    'darkblue': '#00008b',
    'darkcyan': '#008b8b',
    'darkgoldenrod': '#b8860b',
    'darkgray': '#a9a9a9',
    'darkgrey': '#a9a9a9',
    'darkgreen': '#006400',
    'darkkhaki': '#bdb76b',
    'darkmagenta': '#8b008b',
    'darkolivegreen': '#556b2f',
    'darkorange': '#ff8c00',
    'darkorchid': '#9932cc',
    'darkred': '#8b0000',
    'darksalmon': '#e9967a',
    'darkseagreen': '#8fbc8f',
    'darkslateblue': '#483d8b',
    'darkslategray': '#2f4f4f',
    'darkslategrey': '#2f4f4f',
    'darkturquoise': '#00ced1',
    'darkviolet': '#9400d3',
    'deeppink': '#ff1493',
    'deepskyblue': '#00bfff',
    'dimgray': '#696969',
    'dimgrey': '#696969',
    'dodgerblue': '#1e90ff',
    'firebrick': '#b22222',
    'floralwhite': '#fffaf0',
    'forestgreen': '#228b22',
    'fuchsia': '#ff00ff',
    'gainsboro': '#dcdcdc',
    'ghostwhite': '#f8f8ff',
    'gold': '#ffd700',
    'goldenrod': '#daa520',
    'gray': '#808080',
    'grey': '#808080',
    'green': '#008000',
    'greenyellow': '#adff2f',
    'honeydew': '#f0fff0',
    'hotpink': '#ff69b4',
    'indianred': '#cd5c5c',
    'indigo': '#4b0082',
    'ivory': '#fffff0',
    'khaki': '#f0e68c',
    'lavender': '#e6e6fa',
    'lavenderblush': '#fff0f5',
    'lawngreen': '#7cfc00',
    'lemonchiffon': '#fffacd',
    'lightblue': '#add8e6',
    'lightcoral': '#f08080',
    'lightcyan': '#e0ffff',
    'lightgoldenrodyellow': '#fafad2',
    'lightgray': '#d3d3d3',
    'lightgrey': '#d3d3d3',
    'lightgreen': '#90ee90',
    'lightpink': '#ffb6c1',
    'lightsalmon': '#ffa07a',
    'lightseagreen': '#20b2aa',
    'lightskyblue': '#87cefa',
    'lightslategray': '#778899',
    'lightslategrey': '#778899',
    'lightsteelblue': '#b0c4de',
    'lightyellow': '#ffffe0',
    'lime': '#00ff00',
    'limegreen': '#32cd32',
    'linen': '#faf0e6',
    'magenta': '#ff00ff',
    'maroon': '#800000',
    'mediumaquamarine': '#66cdaa',
    'mediumblue': '#0000cd',
    'mediumorchid': '#ba55d3',
    'mediumpurple': '#9370db',  # was '#9370d8', which is not the CSS3 value
    'mediumseagreen': '#3cb371',
    'mediumslateblue': '#7b68ee',
    'mediumspringgreen': '#00fa9a',
    'mediumturquoise': '#48d1cc',
    'mediumvioletred': '#c71585',
    'midnightblue': '#191970',
    'mintcream': '#f5fffa',
    'mistyrose': '#ffe4e1',
    'moccasin': '#ffe4b5',
    'navajowhite': '#ffdead',
    'navy': '#000080',
    'oldlace': '#fdf5e6',
    'olive': '#808000',
    'olivedrab': '#6b8e23',
    'orange': '#ffa500',
    'orangered': '#ff4500',
    'orchid': '#da70d6',
    'palegoldenrod': '#eee8aa',
    'palegreen': '#98fb98',
    'paleturquoise': '#afeeee',
    'palevioletred': '#db7093',  # was '#d87093', which is not the CSS3 value
    'papayawhip': '#ffefd5',
    'peachpuff': '#ffdab9',
    'peru': '#cd853f',
    'pink': '#ffc0cb',
    'plum': '#dda0dd',
    'powderblue': '#b0e0e6',
    'purple': '#800080',
    'red': '#ff0000',
    'rosybrown': '#bc8f8f',
    'royalblue': '#4169e1',
    'saddlebrown': '#8b4513',
    'salmon': '#fa8072',
    'sandybrown': '#f4a460',
    'scarlet': '#9c0606',
    'seagreen': '#2e8b57',
    'seashell': '#fff5ee',
    'sienna': '#a0522d',
    'silver': '#c0c0c0',
    'skyblue': '#87ceeb',
    'slateblue': '#6a5acd',
    'slategray': '#708090',
    'slategrey': '#708090',
    'snow': '#fffafa',
    'springgreen': '#00ff7f',
    'steelblue': '#4682b4',
    'tan': '#d2b48c',
    'teal': '#008080',
    'thistle': '#d8bfd8',
    'tomato': '#ff6347',
    'turquoise': '#40e0d0',
    'violet': '#ee82ee',
    'wheat': '#f5deb3',
    'white': '#ffffff',
    'whitesmoke': '#f5f5f5',
    'yellow': '#ffff00',
    'yellowgreen': '#9acd32',
}
# Unicode space-like characters that smi2ass() swaps for placeholders before
# parsing, one list element per character.
spaceChars = list(
    u'\u00A0\u180E'
    u'\u2000\u2001\u2002\u2003\u2004\u2005\u2006'
    u'\u2007\u2008\u2009\u200A\u200B'
    u'\u202F\u205F\u3000'
)
def smi2ass(smi_sgml):
    """Convert the raw contents of a SAMI file into ASS documents.

    smi_sgml is the undecoded file content.  The result is a dict mapping
    the name returned by multiLanguageSeperation() for each language track
    to the UTF-8 encoded ASS text of that track.  Tracks that yield no
    dialogue lines are left out.
    """
    # Detect the character encoding and decode to a unicode string.
    text = toUnicodeString(smi_sgml)

    # Turn every line break (CRLF first, then lone LF and CR) into a space.
    for lineBreak in (u'\u000D\u000A', u'\u000A', u'\u000D'):
        text = text.replace(lineBreak, u' ')

    # Hide special space characters behind a placeholder so that
    # BeautifulSoup cannot remove them.
    for spaceChar in spaceChars:
        text = text.replace(spaceChar, 'smi2ass_unicode(%d)' % ord(spaceChar))

    substitutions = (
        # Spaces next to a tag become a placeholder for U+0020 so they are
        # not stripped when the tag is replaced later ...
        (r'> +<', '>smi2ass_unicode(32)<'),
        (r'> +', '>smi2ass_unicode(32)'),
        (r' +<', 'smi2ass_unicode(32)<'),
        # ... except directly inside <rt> tags, where they are dropped.
        (r'<rt>(smi2ass_unicode\([0-9]+\))+', '<rt>'),
        (r'(smi2ass_unicode\([0-9]+\))+</rt>', '</rt>'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Parse with BeautifulSoup and collect the <sync> elements.
    syncTags = BeautifulSoup(text).findAll('sync')

    # Split a multi-language subtitle into one list of lines per language.
    linesByLang, outputNames = multiLanguageSeperation(syncTags)

    converted = {}
    for position, lang in enumerate(linesByLang):
        dialogueLines = smiToassSynax(linesByLang[lang])
        if not dialogueLines:
            continue
        document = scriptInfo + styles + events + ''.join(dialogueLines)
        converted[outputNames[position]] = document.encode('utf-8')
    return converted
# (tag name, ASS override written before the text, override written after it).
# The order is the order in which the tags are flattened.
_INLINE_STYLES = (
    ('b', '{\\b1}', '{\\b0}'),
    ('i', '{\\i1}', '{\\i0}'),
    ('u', '{\\u1}', '{\\u0}'),
    ('s', '{\\s1}', '{\\s0}'),
    # Ruby text is rendered at half size.
    ('rt', '{\\fscx50}{\\fscy50} ', ' {\\fscx100}{\\fscy100}'),
)


def _sync_start_ms(syncTag):
    """Return the ``start`` attribute of a <SYNC> tag in ms, or None.

    Any fractional part is dropped.  None is returned when the attribute
    is missing or is not a number (e.g. '<SYNC .', '<SYNC Start=479501??>').
    """
    try:
        return int(re.sub(r'\..*$', '', syncTag['start']))
    except (KeyError, TypeError, ValueError):
        return None


def _to_ass_colour(value):
    """Return *value* as the 'bbggrr' hex digits ASS expects, or None.

    *value* is the text of an HTML ``color`` attribute: anything containing
    six consecutive hex digits (with or without '#'), or a colour name.
    """
    value = value.lower()
    match = re.search('[0-9a-f]{6}', value)
    if match is not None:
        rgb = match.group(0)
    else:
        named = css3_names_to_hex.get(value)
        if named is None:  # unknown name, e.g. the typo 'skybule'
            return None
        rgb = named.lstrip('#')
    # ASS stores colours as blue-green-red, so swap whole bytes.  Reversing
    # the string character by character would also swap the two digits
    # inside every byte.
    return rgb[4:6] + rgb[2:4] + rgb[0:2]


def smiToassSynax (sln):
    """Turn the <SYNC> tags of one language track into ASS Dialogue lines.

    sln is a time-ordered sequence of BeautifulSoup <sync> tags.  Each line
    is shown from its own start time until the start time of the next line,
    so the last tag never produces output.  Lines with an unusable start
    time, without a <P> tag, or with blank text are skipped.

    Returns a list of 'Dialogue: ...' strings, each ending in a newline.
    """
    htmlParser = HTMLParser.HTMLParser()
    asslines = []
    for lineIndex, item in enumerate(sln):
        if lineIndex + 1 >= len(sln):
            break  # the last line has no following line to end it
        startMs = _sync_start_ms(item)
        endMs = _sync_start_ms(sln[lineIndex + 1])
        if startMs is None or endMs is None:
            continue
        tcstart = ms2timecode(startMs)
        tcend = ms2timecode(endMs)

        # bad case: <SYNC Start=41991><P Class=KRCC><SYNC Start=43792><P Class=KRCC>
        pTag = item.find('p')
        if not pTag:
            continue

        # '\N' is the ASS hard line break.
        for lineBreak in pTag.findAll('br'):
            lineBreak.replaceWith('\\N')

        for tagName, opening, closing in _INLINE_STYLES:
            for tag in pTag.findAll(tagName):
                if len(tag.text) != 0:
                    tag.replaceWith(opening + tag.text + closing)
                else:
                    tag.extract()

        for font in pTag.findAll('font'):
            try:
                colorValue = font['color']
            except KeyError:  # bad case: '<font size=30>'
                continue
            assColour = _to_ass_colour(colorValue)
            if assColour is None:
                # Unknown colour: report it and keep the text uncoloured.
                print(colorValue.lower())
                font.replaceWith(font.text)
            else:
                font.replaceWith(
                    '{\\c&H' + assColour + '&}' + font.text + '{\\c}')

        contents = pTag.text
        # Turn the placeholders inserted by smi2ass() back into characters.
        contents = re.sub(r'smi2ass_unicode\(([0-9]+)\)', r'&#\1;', contents)
        contents = htmlParser.unescape(contents)
        if len(contents.strip()) != 0:
            line = 'Dialogue: 0,%s,%s,Default,,0000,0000,0000,,%s\n' % (
                tcstart, tcend, contents)
            asslines.append(line)
    return asslines
# Byte-order marks and how to decode what follows them:
# (BOM bytes, codec name, octet permutation or None).
# The four-byte marks come first because two of them begin with the bytes
# of a UTF-16 mark and would otherwise never be reached.
_BOM_ENCODINGS = (
    (codecs.BOM_UTF32_LE, 'utf-32le', None),     # FF FE 00 00
    (codecs.BOM_UTF32_BE, 'utf-32be', None),     # 00 00 FE FF
    (b'\xFE\xFF\x00\x00', 'utf-32be', (2, 3, 0, 1)),  # UCS-4, octet order 3412
    (b'\x00\x00\xFF\xFE', 'utf-32be', (1, 0, 3, 2)),  # UCS-4, octet order 2143
    (codecs.BOM_UTF8, 'utf-8', None),            # EF BB BF
    (codecs.BOM_UTF16_LE, 'utf-16le', None),     # FF FE
    (codecs.BOM_UTF16_BE, 'utf-16be', None),     # FE FF
)


def _reorder_ucs4(payload, order):
    """Rearrange every 4-octet group of *payload* into big-endian order.

    order[i] is the position, inside each incoming group, of the octet that
    belongs at position i of the big-endian group.  Trailing octets that do
    not fill a whole group are dropped.
    """
    usable = len(payload) - len(payload) % 4
    source = bytearray(payload[:usable])
    result = bytearray(usable)
    for position, origin in enumerate(order):
        result[position::4] = source[origin::4]
    return bytes(result)


def toUnicodeString(aBuf):
    """Decode raw subtitle bytes into a unicode string.

    The encoding is taken from a leading byte-order mark (UTF-8, UTF-16,
    UTF-32, or UCS-4 in an unusual octet order); the mark itself is not part
    of the result.  Data without a mark is decoded as CP949.  Undecodable
    bytes are ignored.  Input that is already text is returned unchanged.
    """
    if not isinstance(aBuf, bytes):
        return aBuf
    for bom, encoding, octetOrder in _BOM_ENCODINGS:
        if aBuf.startswith(bom):
            payload = aBuf[len(bom):]
            if octetOrder is not None:
                # Python has no codec for these octet orders, so normalise
                # the data to UTF-32BE first.
                payload = _reorder_ucs4(payload, octetOrder)
            return payload.decode(encoding, 'ignore')
    return aBuf.decode('cp949', 'ignore')
def ms2timecode(ms):
    """Format an integer number of milliseconds as an ASS time code.

    The result has the form 'H:MM:SS.cc', where cc is hundredths of a
    second.  The hundredths are truncated, not rounded, so the field never
    reaches 100.
    """
    hours, remainder = divmod(ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    seconds, remainder = divmod(remainder, 1000)
    centiseconds = remainder // 10
    return '%01d:%02d:%02d.%02d' % (hours, minutes, seconds, centiseconds)
def multiLanguageSeperation(smiLines):
    """Group <SYNC> lines into language tracks by the class of their <P> tag.

    smiLines is the sequence of BeautifulSoup <sync> tags of the whole file.

    Returns a tuple (tracks, names):
      tracks -- defaultdict mapping a <P> class name to that track's <sync>
                tags, sorted by start time.
      names  -- list parallel to the iteration order of tracks, holding the
                name to use in each output file name.  It is '' when only
                one track exists.

    A track holding less than 10% of the lines of the largest track is
    treated as a misused class name.  With exactly two tracks it is merged
    into the other one.  With more tracks, and at least two such small
    tracks, a small track is merged into the largest bigger track of the
    same language, kept as it is when no such track exists, or dropped when
    its class name is not a known language.
    """
    multiLanguageDict = defaultdict(list)
    # A line whose <P> tag has no usable class stays in the track of the
    # line before it; 'unknown' is used until a class has been seen.
    languageTag = 'unknown'
    for lineIndex, subtitleLine in enumerate(smiLines):
        try:
            languageTag = subtitleLine.find('p')['class']
        except (KeyError, TypeError):  # bad case: '<SYNC Start=7630><P>'
            print(subtitleLine)
        try:
            timeCode = int(re.sub(r'\..*$', '', subtitleLine['start']))
        except (KeyError, TypeError, ValueError):
            # Without a start time the line cannot be placed on the
            # timeline, so it is reported and left out.
            print(subtitleLine)
            continue
        multiLanguageDict[languageTag].append(
            [subtitleLine, lineIndex, timeCode])

    # Rank the tracks by number of lines, smallest first.
    ranked = sorted(
        ([tag, len(lines)] for tag, lines in multiLanguageDict.items()),
        key=itemgetter(1))
    largestCount = ranked[-1][1] if ranked else 0
    stats = []
    for tag, count in ranked:
        code = tag.upper()
        stats.append({
            'tag': tag,
            # Language name when the class is known, else the class itself.
            'name': langCode.get(code, code),
            'known': code in langCode,
            'minor': float(count) / float(largestCount) < 0.1,
        })
    minorCount = sum(1 for entry in stats if entry['minor'])

    if minorCount > 0 and len(stats) == 2:
        # Only two tracks and one is tiny: fold it into the other.
        smaller, larger = stats[0]['tag'], stats[1]['tag']
        multiLanguageDict[larger].extend(multiLanguageDict.pop(smaller))
    elif minorCount > 1:
        for index, entry in enumerate(stats):
            if not entry['minor']:
                continue
            if not entry['known']:
                # Not a recognised language class: drop the track.
                del multiLanguageDict[entry['tag']]
                continue
            # Find the largest bigger track of the same language.  Only
            # later entries are candidates, so a track is never merged into
            # itself.
            target = None
            for candidate in reversed(stats[index + 1:]):
                if candidate['name'] == entry['name']:
                    target = candidate['tag']
                    break
            if target is not None:
                multiLanguageDict[target].extend(
                    multiLanguageDict.pop(entry['tag']))

    # Sort every track by start time and keep only the tags.
    multiLanguageDictSorted = defaultdict(list)
    for tag, entries in multiLanguageDict.items():
        for entry in sorted(entries, key=itemgetter(2)):
            multiLanguageDictSorted[tag].append(entry[0])

    # Choose the name used in each track's output file name.
    longlang = []
    severalTracks = len(multiLanguageDictSorted) > 1
    for tag in multiLanguageDictSorted:
        if not severalTracks:
            longlang.append('')
            continue
        name = langCode.get(tag.upper())
        if name is None or name in longlang:
            # Unknown class, or the language name is already taken: fall
            # back to the class name.
            longlang.append(tag)
        else:
            longlang.append(name)
    return multiLanguageDictSorted, longlang
if __name__ == '__main__':
    # Usage: smi2ass.py <file.smi>
    # Writes one '<file>.<language>.ass' next to the input per language track.
    if len(sys.argv) < 2:
        sys.stderr.write(
            'Usage: %s <file.smi>\n' % os.path.basename(sys.argv[0]))
        sys.exit(1)
    smiPath = sys.argv[1]
    # Read raw bytes: the encoding is detected later, and text mode could
    # alter UTF-16/UTF-32 data on platforms that translate line endings.
    with open(smiPath, 'rb') as smi_file:
        smi_sgml = smi_file.read()
    assDict = smi2ass(smi_sgml)
    # splitext() only strips an extension of the file name itself, so paths
    # without one, or with a dot in a directory name, are handled correctly.
    basePath = os.path.splitext(smiPath)[0]
    for lang in assDict:
        # An empty name marks a single-track subtitle.
        assPath = '%s.%s.ass' % (basePath, lang or defaultLangCode)
        # The contents are already UTF-8 encoded, so write them as bytes.
        with open(assPath, 'wb') as assfile:
            assfile.write(assDict[lang])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note: A newer version of this script is located at https://github.com/trustin/smi2ass