-
-
Save eunicode/43cac5a60faf71f342b331a89719c107 to your computer and use it in GitHub Desktop.
Simple Python Script for Extracting Text from an SRT File
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Creates a readable text file from an SRT file.
"""
import re, sys | |
def is_time_stamp(l):
    """
    Return True if `l` starts like an SRT time stamp.

    A string qualifies when its first two characters are numeric and its
    third character is a colon, e.g. "00:01:02,500 --> 00:01:04,000".
    Strings shorter than three characters are never time stamps.
    """
    # Slice (l[2:3]) instead of indexing (l[2]) so that a short numeric
    # string such as "12" yields False rather than raising IndexError.
    return l[:2].isnumeric() and l[2:3] == ':'
def has_letters(line):
    """
    Return True if `line` contains at least one alphabetic character.

    Any Unicode letter counts (str.isalpha), not only ASCII a-z/A-Z, so
    subtitle text written in non-Latin scripts is recognised as text.
    Digits, punctuation and symbols do not count as letters.
    """
    return any(ch.isalpha() for ch in line)
def has_no_text(line):
    """
    Return True if `line` should be discarded as carrying no subtitle text.

    A line is discarded when, after stripping surrounding whitespace, it is
    empty, purely numeric, a time stamp (per is_time_stamp), or wrapped
    entirely in parentheses. It is also discarded when the line contains no
    letters at all (per has_letters).
    """
    stripped = line.strip()
    # Checked in order; the empty test must come first because the
    # parenthesis test below inspects the first and last characters.
    if not stripped or stripped.isnumeric() or is_time_stamp(stripped):
        return True
    # Whole-line parentheticals, e.g. "(door slams)", are treated as non-text.
    if stripped.startswith('(') and stripped.endswith(')'):
        return True
    return not has_letters(line)
def is_lowercase_letter_or_comma(letter):
    """
    Return True if `letter` is a comma or an alphabetic string that is
    unchanged by lower-casing; otherwise return False.
    """
    if letter == ',':
        return True
    # Compare against lower() rather than using islower(): caseless
    # alphabetic characters are unchanged by lower() and so also qualify.
    return letter.isalpha() and letter == letter.lower()
def clean_up(lines):
    """
    Drop every non-text line and try to re-join sentences that were
    broken across several lines.

    The first element of `lines` is always skipped. A kept line whose first
    character is a lowercase letter or a comma is appended to the previously
    kept line; any other kept line starts a new entry. Returns the new list.
    """
    kept = []
    for current in lines[1:]:
        if has_no_text(current):
            continue
        if kept and is_lowercase_letter_or_comma(current[0]):
            # Continuation of the previous entry: strip that entry's
            # surrounding whitespace (including its newline) and join the
            # two with a single space.
            previous = kept.pop()
            kept.append(previous.strip() + ' ' + current)
        else:
            # Anything else begins a new entry.
            kept.append(current)
    return kept
def main(args):
    """
    Extract the text of an SRT file into a .txt file beside it.

    args[1]: file name
    args[2]: encoding. Default: utf-8.
    - If you get a lot of [?]s replacing characters,
    - you probably need to change file_encoding to 'cp1252'

    The output path is the input path with its extension replaced by
    '.txt'. The output is always written as UTF-8 so that any character
    read from the input (including the replacement character produced by
    errors='replace') can be encoded.

    Raises SystemExit with a usage message if no file name is given.
    """
    # Local import: only this function needs os, for splitting the extension.
    import os

    if len(args) < 2:
        raise SystemExit('usage: python srt_to_txt.py file_name.srt [encoding]')

    file_name = args[1]
    file_encoding = args[2] if len(args) > 2 else 'utf-8'

    with open(file_name, encoding=file_encoding, errors='replace') as f:
        lines = f.readlines()

    new_lines = clean_up(lines)

    # splitext handles extensions of any length (or none), unlike slicing
    # off a fixed four characters.
    new_file_name = os.path.splitext(file_name)[0] + '.txt'
    with open(new_file_name, 'w', encoding='utf-8') as f:
        f.writelines(new_lines)
# Script entry point: run the conversion only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    # main() reads the file name and optional encoding from argv[1] and argv[2].
    main(sys.argv)
""" | |
NOTES
* Run from the command line as:
    python srt_to_txt.py file_name.srt cp1252
* Creates file_name.txt with the text extracted from file_name.srt.
* The script assumes that lines beginning with a lowercase letter or a comma
  are part of the previous line, and that lines beginning with any other
  character start a new line. This won't always be correct.
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
To use: `python3 srt_to_txt.py <file_name.srt> <cp1252>`, where `cp1252` is the file's encoding (optional; the default is utf-8).