Created
December 6, 2019 17:23
-
-
Save iandow/c603fb356e768ea09e07e57f7df2dc33 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Translate `transcript` from `source_lang` to `target_lang` with Amazon Translate.
# The transcript is split into sentences and the sentences are grouped into
# chunks small enough for a single translate_text request, so that no sentence
# is ever cut in half. The result is left in `translated_text`.

# Tell the NLTK data loader to look for resource files in /tmp/
# (guarded so repeated runs in the same process don't add duplicate entries).
if "/tmp/" not in nltk.data.path:
    nltk.data.path.append("/tmp/")
# Download NLTK tokenizers to /tmp/
# We use /tmp because that's where AWS Lambda provides write access to the local file system.
nltk.download('punkt', download_dir='/tmp/')
# Load the English language tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Split input text into a list of sentences
sentences = tokenizer.tokenize(transcript)
print("Input text length: " + str(len(transcript)))
print("Number of sentences: " + str(len(sentences)))

# Amazon Translate limits the request text by its UTF-8 byte size, not by its
# character count, so the budget is measured in encoded bytes. 4000 keeps us
# comfortably below the service limit.
max_chunk_bytes = 4000

# Group whole sentences into chunks that fit the byte budget.
source_chunks = []
chunk_sentences = []
chunk_bytes = 0
for sentence in sentences:
    sentence_bytes = len(sentence.encode('utf-8'))
    # A sentence appended to a non-empty chunk also costs one joining space.
    needed_bytes = sentence_bytes + (1 if chunk_sentences else 0)
    # Flush only a non-empty chunk: an empty Text is rejected by translate_text.
    if chunk_sentences and chunk_bytes + needed_bytes > max_chunk_bytes:
        source_chunks.append(' '.join(chunk_sentences))
        chunk_sentences = []
        chunk_bytes = 0
        needed_bytes = sentence_bytes
    # NOTE: a single sentence larger than the budget is still sent on its own,
    # unsplit; the service decides whether it is acceptable.
    chunk_sentences.append(sentence)
    chunk_bytes += needed_bytes
# Keep the final, partially filled chunk (absent when the transcript is empty).
if chunk_sentences:
    source_chunks.append(' '.join(chunk_sentences))

# Translate each chunk and collect the results in order.
translated_parts = []
for source_text_chunk in source_chunks:
    print("Translation input text length: " + str(len(source_text_chunk)))
    translation_chunk = translate_client.translate_text(
        Text=source_text_chunk,
        SourceLanguageCode=source_lang,
        TargetLanguageCode=target_lang,
    )
    # The response is a dict; the translated string lives under "TranslatedText".
    translated_chunk_text = translation_chunk["TranslatedText"]
    print("Translation output text length: " + str(len(translated_chunk_text)))
    translated_parts.append(translated_chunk_text)

# Join once at the end instead of growing a string inside the loop.
translated_text = ' '.join(translated_parts)
print("Final translation text length: " + str(len(translated_text)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment