Skip to content

Instantly share code, notes, and snippets.

@ClearlyKyle
Created January 6, 2022 11:08
Show Gist options
  • Save ClearlyKyle/cda863dfd2b6d4e389bf217f6b5a3181 to your computer and use it in GitHub Desktop.
Save ClearlyKyle/cda863dfd2b6d4e389bf217f6b5a3181 to your computer and use it in GitHub Desktop.
SRT_Sentences_with_Spacy
import os
import pysrt
import spacy
# Convert every .srt subtitle file in the current directory into a plain
# text file holding one sentence per line, using spaCy's Russian model for
# sentence segmentation. Results are written to ./<output_folder_name>/.
output_folder_name = "txt"


def output_name_for(srt_name):
    """Return the ``.txt`` file name that corresponds to *srt_name*.

    Only the final extension is replaced, so dotted names such as
    ``show.s01.e02.srt`` keep their full stem instead of collapsing to
    ``show.txt`` (which made different episodes overwrite each other).
    """
    return os.path.splitext(srt_name)[0] + ".txt"


# exist_ok tolerates an already-present folder, while real failures
# (e.g. permission denied) are reported instead of being swallowed.
os.makedirs(output_folder_name, exist_ok=True)

srt_files = [name for name in os.listdir(".") if name.endswith(".srt")]

# Loading the model is expensive: do it once for all files, and skip it
# entirely when there is nothing to convert.
nlp = spacy.load('ru_core_news_sm') if srt_files else None

for file in srt_files:
    print("Converting: ", file)

    subs = pysrt.open(file)

    # Flatten each cue onto one line and separate cues with a space,
    # because a sentence may span several time stamps.
    all_text = " ".join(sub.text.replace('\n', ' ') for sub in subs)

    tokens = nlp(all_text)
    sentences = [sent.text.strip() for sent in tokens.sents]

    output_path = os.path.join(output_folder_name, output_name_for(file))
    with open(output_path, encoding='utf-8', mode='w') as output_file:
        output_file.writelines("%s\n" % sentence for sentence in sentences)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment