Skip to content

Instantly share code, notes, and snippets.

@ndunn219
Last active October 2, 2024 09:48
Show Gist options
  • Save ndunn219/62263ce1fb59fda08656be7369ce329b to your computer and use it in GitHub Desktop.
Save ndunn219/62263ce1fb59fda08656be7369ce329b to your computer and use it in GitHub Desktop.
Simple Python Script for Extracting Text from an SRT File
"""
Creates readable text file from SRT file.
"""
import re, sys
def is_time_stamp(l):
    """
    Return True if ``l`` looks like the start of an SRT time stamp.

    A string qualifies when its first two characters are numeric and the
    third character is a colon (e.g. ``00:01:02,500 --> 00:01:04,000``).
    Strings shorter than three characters return False.
    """
    # Slice (l[2:3]) rather than index (l[2]): the slice is '' for short
    # input such as '12', so the check fails cleanly instead of raising
    # IndexError.
    return l[:2].isnumeric() and l[2:3] == ':'
def has_letters(line):
    """
    Return True if ``line`` contains at least one alphabetic character.

    Uses Unicode-aware ``str.isalpha`` rather than an ASCII-only pattern,
    so text in non-Latin scripts (Cyrillic, Greek, CJK, ...) counts as
    letters too. Digits, punctuation and whitespace never count.
    """
    return any(ch.isalpha() for ch in line)
def has_no_text(line):
    """
    Return True if ``line`` carries no subtitle text worth keeping.

    A line is treated as text-free when, after stripping surrounding
    whitespace, it is empty, entirely numeric, starts like a time stamp,
    or is wrapped in parentheses; or when the line contains no letters
    at all.
    """
    stripped = line.strip()
    # Blank lines and purely numeric lines.
    if not stripped or stripped.isnumeric():
        return True
    # Lines that start like a time stamp.
    if is_time_stamp(stripped):
        return True
    # Lines fully enclosed in parentheses.
    if stripped.startswith('(') and stripped.endswith(')'):
        return True
    # Anything left must contain at least one letter to count as text.
    return not has_letters(line)
def is_lowercase_letter_or_comma(letter):
    """
    Return True if ``letter`` is a lowercase letter or a comma.

    ``str.islower`` is used instead of comparing against ``letter.lower()``
    because the comparison is also true for letters that have no case at
    all (e.g. CJK or Hebrew characters), which are not lowercase.
    """
    if letter == ',':
        return True
    return letter.isalpha() and letter.islower()
def clean_up(lines):
    """
    Drop every non-text line and try to re-join sentences that were
    split across several lines.

    Returns a new list of the lines that were kept. A kept line whose
    first character is a lowercase letter or a comma is glued onto the
    previously kept line, separated by a single space.
    """
    kept = []
    # The first element of ``lines`` is always skipped (presumably the
    # index line of the first cue -- verify against the input format).
    for current in lines[1:]:
        if has_no_text(current):
            continue
        if kept and is_lowercase_letter_or_comma(current[0]):
            # Continuation: strip the previous line's trailing newline
            # and append this line to it.
            kept[-1] = kept[-1].strip() + ' ' + current
            continue
        # Otherwise this line starts a new entry.
        kept.append(current)
    return kept
def main(args):
    """
    Convert the SRT file named in ``args`` to a plain-text file.

    args[1]: file name
    args[2]: encoding. Default: utf-8.
    - If you get a lot of [?]s replacing characters,
    - you probably need to change file_encoding to 'cp1252'

    The result is written next to the input file, with the input's
    extension replaced by '.txt', and is always encoded as UTF-8.
    Exits with a usage message if no file name is given.
    """
    import os  # local import: only needed to split off the extension

    if len(args) < 2:
        # Fail with a usage hint rather than an opaque IndexError.
        raise SystemExit('Usage: python srt_to_txt.py file_name.srt [encoding]')

    file_name = args[1]
    file_encoding = args[2] if len(args) > 2 else 'utf-8'
    # errors='replace' keeps going on undecodable bytes instead of aborting.
    with open(file_name, encoding=file_encoding, errors='replace') as f:
        lines = f.readlines()
    new_lines = clean_up(lines)

    # splitext handles extensions of any length (or none), unlike a fixed
    # four-character slice.
    root, _ext = os.path.splitext(file_name)
    new_file_name = root + '.txt'
    # An explicit encoding avoids UnicodeEncodeError on platforms whose
    # default encoding cannot represent every decoded character.
    with open(new_file_name, 'w', encoding='utf-8') as f:
        f.writelines(new_lines)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)
"""
NOTES
* Run from command line as
** python srt_to_txt.py file_name.srt cp1252
* Creates file_name.txt with extracted text from file_name.srt
* Script assumes that lines beginning with lowercase letters or commas
  are part of the previous line, and that lines beginning with any other
  character start a new line. This won't always be correct.
"""
@shuantsu-zz
Copy link

I adapted the script to use on mine. Thanks a lot!

https://github.com/shuantsu/srt_extract

@eunicode
Copy link

eunicode commented Jul 9, 2021

run your code like:
python srt_to_txt.py file_name.srt cp1252

Thank you so much for this clarifying comment!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment