GabrielSGoncalves · September 24, 2019 14:31
diff --git a/nlp_aws_medium_part3.py b/nlp_aws_medium_part3.py
 # 10) Function to get text from the JSON file generated using Amazon Transcribe
 def get_text_from_json(bucket, key):
     """
     Download a JSON object from S3 and return the transcript text in it.

     Parameters
     bucket (str): Name of the S3 bucket that holds the object
     key (str): Key of the JSON object inside the bucket

     Return
     str: Value found under results -> transcripts[0] -> transcript
          (None when the final 'transcript' key is absent)
     """
     s3 = boto3.client('s3')
     # Named `response` rather than `object` so the builtin is not shadowed.
     response = s3.get_object(Bucket=bucket, Key=key)
     payload = response['Body'].read()
     data = json.loads(payload)
     return data.get('results').get('transcripts')[0].get('transcript')

 # 11) Reading the original transcription from the JSON file
 # JSON text is UTF-8 by specification; pin the encoding so the read does
 # not depend on the platform's default locale encoding.
 with open('original_transcripts.json', 'r', encoding='utf-8') as f:
     original_transcriptions = json.load(f)

 # 12) Function to process text
 def process_text(text):
     """
     Lowercase a text, run it through the global spaCy pipeline `nlp`,
     and keep only the lemmas of the tokens worth comparing.

     A token is dropped when its text is one of the pipeline's default
     stop words, when it is punctuation, or when its lemma is '-PRON-'.

     Parameters
     text (str): Any given text

     Return
     str: Lemmas of the kept tokens joined by single spaces
     """
     stop_words = nlp.Defaults.stop_words
     lemmas = [
         token.lemma_
         for token in nlp(text.lower())
         if not (
             token.text in stop_words
             or token.is_punct
             or token.lemma_ == '-PRON-'
         )
     ]
     return " ".join(lemmas)
   
    
 # 13) Iterate over the speakers comparing the transcription texts using spaCy
 # Load the model once; process_text() reads this module-level `nlp`.
 nlp = spacy.load('en_core_web_lg')
 for index, row in df_audio.iterrows():
     # Reference text, looked up by the row's index label.
     reference_text = process_text(original_transcriptions.get(index))
     original_transcription = nlp(reference_text)

     # Transcript stored in the JSON object referenced by this row.
     transcribed_text = process_text(
         get_text_from_json(bucket_name, row['json_transcription'])
     )
     transcribe_transcription = nlp(transcribed_text)

     # Score the two processed documents and record the result on the row.
     w2v_similaraty = original_transcription.similarity(transcribe_transcription)
     df_audio.at[index, 'w2v_text_similarity'] = w2v_similaraty
     print(f'Processed Word2vec Similiraty for {index}\'s speech: {w2v_similaraty}')

    
 # 14) Iterate over the speakers to compute the Word Mover's Distance using spaCy and wmd
 # Fresh pipeline with the WMD hook appended as its last component;
 # process_text() picks up this rebinding of the module-level `nlp`.
 nlp = spacy.load('en_core_web_lg')
 wmd_hook = wmd.WMD.SpacySimilarityHook(nlp)
 nlp.add_pipe(wmd_hook, last=True)
 for index, row in df_audio.iterrows():
     # Reference text, looked up by the row's index label.
     reference_text = process_text(original_transcriptions.get(index))
     original_transcription = nlp(reference_text)

     # Transcript stored in the JSON object referenced by this row.
     transcribed_text = process_text(
         get_text_from_json(bucket_name, row['json_transcription'])
     )
     transcribe_transcription = nlp(transcribed_text)

     # NOTE(review): with the WMD hook installed, .similarity() presumably
     # returns a distance (lower = closer) rather than a similarity - confirm.
     wmd_similaraty = original_transcription.similarity(transcribe_transcription)
     df_audio.at[index, 'wmd_similarity'] = wmd_similaraty
     print(f'Word Movers Distance Similiraty for {index}\'s speech: {wmd_similaraty}')
	# 10) Function to get text from the JSON file generated using Amazon Transcribe
	def get_text_from_json(bucket, key):
	    """
	    Download a JSON object from S3 and return the transcript text in it.

	    Parameters
	    bucket (str): Name of the S3 bucket that holds the object
	    key (str): Key of the JSON object inside the bucket

	    Return
	    str: Value found under results -> transcripts[0] -> transcript
	         (None when the final 'transcript' key is absent)
	    """
	    s3 = boto3.client('s3')
	    # Named `response` rather than `object` so the builtin is not shadowed.
	    response = s3.get_object(Bucket=bucket, Key=key)
	    payload = response['Body'].read()
	    data = json.loads(payload)
	    return data.get('results').get('transcripts')[0].get('transcript')

	# 11) Reading the original transcription from the JSON file
	# JSON text is UTF-8 by specification; pin the encoding so the read does
	# not depend on the platform's default locale encoding.
	with open('original_transcripts.json', 'r', encoding='utf-8') as f:
	    original_transcriptions = json.load(f)

	# 12) Function to process text
	def process_text(text):
	    """
	    Process text by removing stop words, punctuation,
	    pronouns and performing lemmatization on tokens.

	    Parameters
	    text (str): Any given text

	    Return
	    str: Processed text (kept lemmas joined by single spaces)
	    """
	    doc = nlp(text.lower())
	    # Look the stop-word set up once instead of on every token.
	    stop_words = nlp.Defaults.stop_words
	    result = []
	    for token in doc:
	        if token.text in stop_words:
	            continue
	        if token.is_punct:
	            continue
	        # Tokens whose lemma is the '-PRON-' placeholder are dropped.
	        if token.lemma_ == '-PRON-':
	            continue
	        result.append(token.lemma_)
	    return " ".join(result)


	# 13) Iterate over the speakers comparing the transcription texts using spaCy
	# Load the model once; process_text() reads this module-level `nlp`.
	nlp = spacy.load('en_core_web_lg')
	for index, row in df_audio.iterrows():
	    # Reference text (looked up by the row's index label) and the
	    # transcript from the row's JSON object, both run through process_text().
	    original_transcription = nlp(process_text(original_transcriptions.get(index)))
	    transcribe_transcription = nlp(process_text(get_text_from_json(bucket_name, row.json_transcription)))
	    w2v_similaraty = original_transcription.similarity(transcribe_transcription)
	    # Record the score on the same row of the DataFrame.
	    df_audio.at[index, 'w2v_text_similarity'] = w2v_similaraty
	    print(f'Processed Word2vec Similiraty for {index}\'s speech: {w2v_similaraty}')


	# 14) Iterate over the speakers to compute the Word Mover's Distance using spaCy and wmd
	# Fresh pipeline with the WMD hook appended as its last component;
	# process_text() picks up this rebinding of the module-level `nlp`.
	nlp = spacy.load('en_core_web_lg')
	nlp.add_pipe(wmd.WMD.SpacySimilarityHook(nlp), last=True)
	for index, row in df_audio.iterrows():
	    original_transcription = nlp(process_text(original_transcriptions.get(index)))
	    transcribe_transcription = nlp(process_text(get_text_from_json(bucket_name, row.json_transcription)))
	    # NOTE(review): with the WMD hook installed, .similarity() presumably
	    # returns a distance (lower = closer) rather than a similarity - confirm.
	    wmd_similaraty = original_transcription.similarity(transcribe_transcription)
	    # Record the score on the same row of the DataFrame.
	    df_audio.at[index, 'wmd_similarity'] = wmd_similaraty
	    print(f'Word Movers Distance Similiraty for {index}\'s speech: {wmd_similaraty}')