Last active
October 2, 2024 09:48
-
-
Save ndunn219/62263ce1fb59fda08656be7369ce329b to your computer and use it in GitHub Desktop.
Simple Python Script for Extracting Text from an SRT File
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Creates readable text file from SRT file. | |
""" | |
import re, sys | |
def is_time_stamp(l):
    """
    Return True if `l` starts like an SRT timing line, i.e. two numeric
    characters followed by a colon (as in '00:01:15,000 --> 00:01:18,000').

    Only the first three characters are examined; the rest of the line
    is not validated.
    """
    # Slice (l[2:3]) rather than index (l[2]) so that inputs shorter than
    # three characters yield False instead of raising IndexError.
    return l[:2].isnumeric() and l[2:3] == ':'
def has_letters(line):
    """
    Return True if `line` contains at least one alphabetic character.

    Letters are detected with str.isalpha(), so accented and non-Latin
    letters count as well as plain ASCII ones; digits, punctuation and
    whitespace do not.
    """
    return any(ch.isalpha() for ch in line)
def has_no_text(line):
    """
    Return True if `line` should be discarded as carrying no subtitle text.

    A line is discarded when, after stripping surrounding whitespace, it is
    empty, is purely numeric, looks like a time stamp, or is wrapped
    in parentheses from first to last character; or when the line contains
    no letters at all.
    """
    stripped = line.strip()
    # Blank and purely numeric lines are rejected first; this also guarantees
    # `stripped` is non-empty before the checks below.
    if not stripped or stripped.isnumeric():
        return True
    if is_time_stamp(stripped):
        return True
    wrapped_in_parens = stripped.startswith('(') and stripped.endswith(')')
    return wrapped_in_parens or not has_letters(line)
def is_lowercase_letter_or_comma(letter):
    """
    Return True if `letter` is a comma or a lowercase alphabetic character.

    Used to decide whether a line continues the previous one.
    """
    if letter == ',':
        return True
    # islower() is required in addition to isalpha(): comparing against
    # letter.lower() alone is also true for letters that have no case at
    # all (e.g. CJK characters), which are not "lowercase".
    return letter.isalpha() and letter.islower()
def clean_up(lines):
    """
    Drop every non-text line and try to re-join sentences that were
    split across several lines.

    The first element of `lines` is always skipped (presumably the leading
    SRT counter line -- confirm against the input files). A kept line that
    starts with a lowercase letter or a comma is glued onto the previously
    kept line, separated by a single space; any other kept line starts a
    new entry. Returns the resulting list of lines.
    """
    merged = []
    for raw in lines[1:]:
        if has_no_text(raw):
            continue
        # Only a line that follows an already-kept line can be a continuation.
        continues_previous = bool(merged) and is_lowercase_letter_or_comma(raw[0])
        if continues_previous:
            # Strip the previous entry so its trailing newline is replaced
            # by the joining space.
            merged[-1] = ' '.join((merged[-1].strip(), raw))
        else:
            merged.append(raw)
    return merged
def main(args):
    """
    Convert the SRT file named in `args` into a plain-text file.

    args[1]: file name
    args[2]: encoding. Default: utf-8.
        - If you get a lot of [?]s replacing characters,
        - you probably need to change file_encoding to 'cp1252'

    The output is written next to the input, with the input's extension
    replaced by '.txt', using the same encoding that was used for reading.
    Exits with a usage message when no file name is supplied.
    """
    import os  # local import: only this function needs it

    if len(args) < 2:
        sys.exit('usage: python srt_to_txt.py file_name.srt [encoding]')

    file_name = args[1]
    file_encoding = 'utf-8' if len(args) < 3 else args[2]
    with open(file_name, encoding=file_encoding, errors='replace') as f:
        lines = f.readlines()

    new_lines = clean_up(lines)

    # splitext rather than file_name[:-4], so names whose extension is not
    # exactly three characters (or that have none) are handled correctly.
    new_file_name = os.path.splitext(file_name)[0] + '.txt'

    # Write with the encoding used for reading instead of the platform
    # default, which may be unable to represent the text. errors='replace'
    # is needed because decoding with errors='replace' can introduce
    # U+FFFD, which encodings such as cp1252 cannot encode.
    with open(new_file_name, 'w', encoding=file_encoding, errors='replace') as f:
        f.writelines(new_lines)
# Script entry point: hand the raw command-line arguments to main().
if __name__ == "__main__":
    main(sys.argv)
""" | |
NOTES | |
* Run from command line as | |
** python srt_to_txt.py file_name.srt cp1252 | |
* Creates file_name.txt with extracted text from file_name.srt | |
* Script assumes that lines beginning with lowercase letters or commas | |
* are part of the previous line and lines beginning with any other character | |
* are new lines. This won't always be correct. | |
""" |
Thanks for the help. There is a problem: you open the input file with file_encoding
but don't save the output with it, which will cause an error.
Thanks. It works perfectly!
Hi. I cannot seem to get this to work. There are no errors generated, but it doesn't output the formatted file.
run your code like:
python srt_to_txt.py file_name.srt cp1252
I adapted the script to use on mine. Thanks a lot!
run your code like:
python srt_to_txt.py file_name.srt cp1252
Thank you so much for this clarifying comment!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I wrote this to create readable/printable text files to follow along with El ministerio del tiempo in Spanish. The SRT files are available at https://www.opensubtitles.org/es/ssearch/sublanguageid-spa/idmovie-192528