Created
December 6, 2019 17:23
-
-
Save iandow/c603fb356e768ea09e07e57f7df2dc33 to your computer and use it in GitHub Desktop.
Revisions
-
iandow created this gist
Dec 6, 2019. There are no files selected for viewing.
# Split `transcript` into sentences with NLTK, pack the sentences into chunks
# that fit comfortably inside one Amazon Translate request, translate each
# chunk, and join the results into `translated_text`.
#
# Names this snippet expects to already be in scope: `nltk`, `transcript`,
# `translate_client`, `source_lang`, `target_lang`.

# Tell the NLTK data loader to look for resource files in /tmp/
nltk.data.path.append("/tmp/")
# Download NLTK tokenizers to /tmp/
# We use /tmp because that's where AWS Lambda provides write access to the local file system.
nltk.download('punkt', download_dir='/tmp/')
# Load the English language tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Split input text into a list of sentences
sentences = tokenizer.tokenize(transcript)
print("Input text length: " + str(len(transcript)))
print("Number of sentences: " + str(len(sentences)))

# The Translate request size limit is expressed in UTF-8 bytes rather than
# characters, so the budget is measured in encoded bytes. 4000 keeps the same
# safety margin as before; for pure-ASCII text bytes and characters coincide.
MAX_CHUNK_BYTES = 4000

# Greedily pack whole sentences into chunks that stay under the budget.
source_chunks = []
pending_sentences = []
pending_bytes = 0
for sentence in sentences:
    # Every sentence after the first costs one extra byte for the joining space.
    added_bytes = len(sentence.encode("utf-8")) + (1 if pending_sentences else 0)
    if pending_sentences and pending_bytes + added_bytes >= MAX_CHUNK_BYTES:
        # The chunk is full: close it and start a new one with this sentence.
        source_chunks.append(' '.join(pending_sentences))
        pending_sentences = [sentence]
        pending_bytes = len(sentence.encode("utf-8"))
    else:
        # NOTE: a single sentence larger than the budget still becomes its own
        # chunk unchanged; it is never split mid-sentence, so the service may
        # reject it if it exceeds the real API limit.
        pending_sentences.append(sentence)
        pending_bytes += added_bytes
if pending_sentences:
    source_chunks.append(' '.join(pending_sentences))

# Translate chunk by chunk. Empty chunks are never produced above, so the
# service is never called with empty text (including for an empty transcript).
translated_chunks = []
for source_text_chunk in source_chunks:
    print("Translation input text length: " + str(len(source_text_chunk)))
    translation_chunk = translate_client.translate_text(
        Text=source_text_chunk,
        SourceLanguageCode=source_lang,
        TargetLanguageCode=target_lang,
    )
    # The response is a dict; measure the translated text, not the dict itself.
    translated_chunk_text = translation_chunk["TranslatedText"]
    print("Translation output text length: " + str(len(translated_chunk_text)))
    translated_chunks.append(translated_chunk_text)

translated_text = ' '.join(translated_chunks)
print("Final translation text length: " + str(len(translated_text)))