Created
January 6, 2022 11:08
-
-
Save ClearlyKyle/cda863dfd2b6d4e389bf217f6b5a3181 to your computer and use it in GitHub Desktop.
SRT_Sentences_with_Spacy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import pysrt | |
| import spacy | |
# Folder (relative to the current directory) that receives the .txt files.
output_folder_name = "txt"


def output_name_for(srt_name):
    """Return the ``.txt`` file name corresponding to *srt_name*.

    Only the final extension is replaced, so dots inside the base name
    (``Show.S01E02.srt``) are kept. Cutting at the first dot would map
    every episode of such a series onto the same output file.
    """
    return os.path.splitext(srt_name)[0] + ".txt"


def join_subtitle_text(subs):
    """Flatten subtitle entries into a single string without line breaks.

    *subs* is any iterable of objects exposing a ``text`` attribute (the
    items yielded by ``pysrt.open``). Line breaks inside an entry become
    spaces, and each entry is followed by a space because a sentence may
    span several time stamps.
    """
    return "".join(str(sub.text).replace("\n", " ") + " " for sub in subs)


def main(model_name="ru_core_news_sm"):
    """Write one sentence-per-line ``.txt`` file for each ``.srt`` file.

    Every file in the current directory whose name ends with ``.srt`` is
    read with pysrt, its text is split into sentences by the spaCy pipeline
    *model_name*, and the stripped sentences are written as UTF-8 to
    ``<output_folder_name>/<base name>.txt``.
    """
    # exist_ok tolerates an existing folder but still reports real failures
    # (e.g. permission denied) instead of silently ignoring them.
    os.makedirs(output_folder_name, exist_ok=True)

    # Loading a spaCy pipeline is expensive: do it once, and only if there
    # is at least one subtitle file to process.
    nlp = None

    for file in os.listdir("."):
        if not file.endswith(".srt"):
            continue

        print("Converting: ", file)
        subs = pysrt.open(file)

        if nlp is None:
            nlp = spacy.load(model_name)

        doc = nlp(join_subtitle_text(subs))

        output_path = os.path.join(output_folder_name, output_name_for(file))
        with open(output_path, encoding="utf-8", mode="w") as output_file:
            output_file.writelines(
                "%s\n" % sent.text.strip() for sent in doc.sents
            )


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment