-
-
Save YungSang/2752897 to your computer and use it in GitHub Desktop.
#!/usr/bin/python | |
# -*- encoding:utf-8 -*- | |
"""Translate Google's Transcript into srt file. | |
Takes google's transcript filename as argument (xml extension required). | |
NB: to get google's transcript, use this URL:
http://video.google.com/timedtext?lang=en&v=VIDEO_ID | |
""" | |
# srt example | |
"""1 | |
00:00:20,672 --> 00:00:24,972 | |
Entre l’Australia et la South America, | |
dans l’Océan South Pacific…""" | |
# Google's transcript example (first tags) | |
"""<?xml version="1.0" encoding="utf-8" ?> | |
<transcript> | |
<text start="11.927" dur="2.483"> | |
This is a matter of National Security.</text>""" | |
import re, sys | |
# Pattern to identify a subtitle and grab start, duration and text.
# The numeric groups accept plain integers ("17") as well as decimals
# ("17.5"). The leading "<" and trailing ">" are optional because main()
# splits the transcript on "><", which removes them from most tags.
pat = re.compile(
    r'<?text start="(\d+(?:\.\d*)?)" dur="(\d+(?:\.\d*)?)">(.*)</text>?',
    re.DOTALL)


def parseLine(text):
    """Parse a subtitle.

    Returns a (start, duration, text) tuple of strings when ``text`` matches
    ``pat`` from its first character, otherwise None.
    """
    m = pat.match(text)
    if m is None:
        return None
    return m.groups()


def formatSrtTime(secTime):
    """Convert a time in seconds (google's transcript) to srt time format.

    ``secTime`` may be a number or a numeric string, with or without a
    fractional part. The result is "HH:MM:SS,mmm", with the milliseconds
    rounded and always written with three digits.
    """
    # Work in whole milliseconds: splitting str(secTime) on "." fails for
    # integers and yields a wrong-width fraction for values such as 14.41
    # or for float sums that carry a rounding error.
    totalMs = int(round(float(secTime) * 1000))
    sec, ms = divmod(totalMs, 1000)
    m, s = divmod(sec, 60)
    h, m = divmod(m, 60)
    return "{:02}:{:02}:{:02},{:03}".format(h, m, s, ms)
def convertHtml(text):
    """Decode the HTML character references found in a subtitle's text.

    Every named or numeric reference is replaced by its character, e.g.
    &#39; to '
    &quot; to "
    &amp; to &
    """
    # Imported locally so this function does not rely on the file's
    # top-level import line.
    from html import unescape
    return unescape(text)
def printSrtLine(i, elms):
    """Build a subtitle in srt format.

    ``i`` is the subtitle's sequence number and ``elms`` holds its start
    time, duration and text, in that order. Despite the name, nothing is
    printed: the formatted entry is returned as a string made of the number,
    the "start --> end" line, the text and a closing blank line.
    """
    start = elms[0]
    end = float(elms[0]) + float(elms[1])
    # Entries are separated by a blank line in the output below, so a newline
    # captured at either end of the text would leave a stray empty line
    # inside the entry; trim the surrounding whitespace.
    body = convertHtml(elms[2]).strip()
    return "{}\n{} --> {}\n{}\n\n".format(
        i, formatSrtTime(start), formatSrtTime(end), body)
# Transcript path, read from the first command-line argument as soon as the
# module is loaded (this line raises IndexError when no argument is given).
fileName = sys.argv[1]
def main(fileName):
    """Parse google's transcript and write the converted data in srt format.

    ``fileName`` is the path of the transcript to read. The srt file is
    written to the same path with a trailing ".xml" extension (any letter
    case) replaced by ".srt"; when the path has no such extension, ".srt" is
    appended, so the input file is never overwritten.
    """
    # Imported locally so this function does not rely on the file's
    # top-level import line.
    import os.path

    # Read the file named by the argument, not sys.argv, so that main() can
    # be called with any path. The transcript declares itself as UTF-8.
    with open(fileName, 'r', encoding='utf-8') as infile:
        content = infile.read()

    # Only the extension is rewritten; ".xml" elsewhere in the path is kept.
    root, ext = os.path.splitext(fileName)
    srtfileName = (root if ext.lower() == '.xml' else fileName) + '.srt'

    i = 0
    with open(srtfileName, 'w', encoding='utf-8') as outfile:
        # Split the buffer to get one string per tag.
        for text in content.split('><'):
            parsed = parseLine(text)
            if parsed:
                i += 1
                outfile.write(printSrtLine(i, parsed))
    print('DONE ({})'.format(srtfileName))
# Convert the transcript named on the command line when run as a script.
if __name__ == "__main__":
    main(fileName)
The program cannot capture the 'start' and/or 'dur' values when a value is an integer, so such subtitles are missing from the output file. For example, in the YouTube video with ID v2jpnyKPH64, the second subtitle has the timestamp 17 (not 17.00, 17. or 17.0, but 17) and it is missing from the output. To solve the problem, the sub-patterns inside the parentheses should be changed from:
(\d+\.\d+)
to:
(\d+\.?\d*)
so the correct regular expression in the line #27 should be:
pat = re.compile(r'<?text start="(\d+\.?\d*)" dur="(\d+\.?\d*)">(.*)</text>?')
As a consequence, the split method in line #39 will not find a '.' in integer timestamps (or durations) and the program will raise an exception. The solution is to insert:
secTime = float(secTime)
just before line #39 to add '.0' to the integer numbers caught by the regexps.
Convert a youtube transcript in srt subtitle « Python recipes « ActiveState Code
http://code.activestate.com/recipes/577459-convert-a-youtube-transcript-in-srt-subtitle/