Last active
September 24, 2019 14:31
-
-
Save GabrielSGoncalves/8e339baee85e05fc97feb8f2533151b9 to your computer and use it in GitHub Desktop.
Third part of the NLP analysis for the Medium article on AWS ML/AI tools
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 10) Function to get text from the JSON file generated using Amazon Transcribe
def get_text_from_json(bucket, key):
    """
    Download a JSON document from S3 and return the transcript it contains.

    Parameters
    bucket: Name of the S3 bucket, passed to get_object as Bucket
    key: Key of the JSON object in that bucket, passed to get_object as Key

    Return
    The value stored at results -> transcripts[0] -> transcript
    """
    s3_client = boto3.client('s3')
    response = s3_client.get_object(Bucket=bucket, Key=key)
    # The body is read in full and parsed as a single JSON document.
    payload = json.loads(response['Body'].read())
    first_transcript = payload.get('results').get('transcripts')[0]
    return first_transcript.get('transcript')
# 11) Reading the original transcription from the JSON file
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding (which is not UTF-8 everywhere).
with open('original_transcripts.json', 'r', encoding='utf-8') as f:
    original_transcriptions = json.load(f)
# 12) Function to process text
def process_text(text):
    """
    Lowercase a text, run it through the module-level spaCy pipeline and
    keep only the lemmas of the tokens that survive filtering.

    Tokens are dropped when their text is a stop word, when they are
    punctuation, or when their lemma is the '-PRON-' pronoun placeholder.

    Parameters
    text (str): Any given text

    Return
    str: Remaining lemmas joined by single spaces
    """
    parsed = nlp(text.lower())
    stop_words = nlp.Defaults.stop_words
    kept_lemmas = [
        token.lemma_
        for token in parsed
        if token.text not in stop_words
        and not token.is_punct
        and token.lemma_ != '-PRON-'
    ]
    return " ".join(kept_lemmas)
# 13) Iterate over the speakers comparing the transcription texts using spaCy
nlp = spacy.load('en_core_web_lg')
for index, row in df_audio.iterrows():
    # Reference transcript, looked up by the DataFrame index.
    reference_text = original_transcriptions.get(index)
    original_transcription = nlp(process_text(reference_text))

    # Transcript stored in S3 under the key held in the row.
    transcribed_text = get_text_from_json(bucket_name, row.json_transcription)
    transcribe_transcription = nlp(process_text(transcribed_text))

    # Score the two processed documents and store the result on the same row.
    w2v_similaraty = original_transcription.similarity(transcribe_transcription)
    df_audio.at[index, 'w2v_text_similarity'] = w2v_similaraty
    print(f"Processed Word2vec Similiraty for {index}'s speech: {w2v_similaraty}")
# 14) Iterate over the speakers to get Word Movers distance using spaCy and wmd
nlp = spacy.load('en_core_web_lg')
# Append the wmd hook as the last pipeline component.
# NOTE(review): presumably this makes Doc.similarity() return a Word Mover's
# Distance rather than a cosine similarity - confirm against the wmd docs.
nlp.add_pipe(wmd.WMD.SpacySimilarityHook(nlp), last=True)
for index, row in df_audio.iterrows():
    # Reference transcript, looked up by the DataFrame index.
    reference_text = original_transcriptions.get(index)
    original_transcription = nlp(process_text(reference_text))

    # Transcript stored in S3 under the key held in the row.
    transcribed_text = get_text_from_json(bucket_name, row.json_transcription)
    transcribe_transcription = nlp(process_text(transcribed_text))

    # Score the two processed documents and store the result on the same row.
    wmd_similaraty = original_transcription.similarity(transcribe_transcription)
    df_audio.at[index, 'wmd_similarity'] = wmd_similaraty
    print(f"Word Movers Distance Similiraty for {index}'s speech: {wmd_similaraty}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment