GabrielSGoncalves · September 24, 2019 14:29
diff --git a/nlp_aws_medium_part1.py b/nlp_aws_medium_part1.py
 from __future__ import print_function

 import json
 import logging
 import os
 import subprocess
 import time
 from datetime import date

 import boto3
 import matplotlib as plt
 import pandas as pd
 import seaborn as sns
 import spacy
 import wmd
 from botocore.exceptions import ClientError

 # 1) Map each speaker to the YouTube URL of the speech to analyse.
 # Insertion order is churchill, reagan, luther_king, macarthur, kennedy.
 dict_urls_youtube = dict(
     churchill='https://www.youtube.com/watch?v=s_LncVnecLA',
     reagan='https://www.youtube.com/watch?v=5MDFX-dNtsM&t=6s',
     luther_king='https://www.youtube.com/watch?v=I47Y6VHc3Ms',
     macarthur='https://www.youtube.com/watch?v=_42_aLGkRpg&t=19s',
     kennedy='https://www.youtube.com/watch?v=QAmHcdwKgtQ',
 )


 # 2) Download each speech in mp3 format using youtube-dl.
 # The command is run without a shell and the URL is passed as a single argv
 # entry. Interpolating it into a shell string is unsafe here because some
 # URLs contain '&' (e.g. '...&t=6s'), which a shell treats as the
 # "run in background" operator: the download would be detached and the
 # script would carry on before the mp3 exists.
 for speaker, url in dict_urls_youtube.items():
     print(f'Downloading audio file from the link: {url}')
     try:
         completed = subprocess.run(
             ['youtube-dl', '-x', '--audio-format', 'mp3', url],
             check=False,
         )
     except OSError as err:
         # youtube-dl missing or not executable: report and keep going, as the
         # original best-effort loop did not abort on failures either.
         print(f'Could not run youtube-dl for {speaker}: {err}')
         continue
     if completed.returncode != 0:
         print(f'youtube-dl exited with status {completed.returncode} '
               f'for {speaker} ({url})')


 # 3) Build the DataFrame that collects per-speech information through the
 # analysis: the dict keys become the index (orient='index') and the URLs go
 # into a single 'youtube_urls' column.
 df_audio = pd.DataFrame.from_dict(
     data=dict_urls_youtube,
     columns=['youtube_urls'],
     orient='index',
 )


 # 4) Record, for each speaker, the name of the matching file found in the
 # current directory. Each entry pairs a keyword searched in the lower-cased
 # file name with the df_audio row it belongs to; keywords are tried in this
 # order and only the first match is used for a given file.
 speaker_by_keyword = (
     ('churchill', 'churchill'),
     ('reagan', 'reagan'),
     ('king', 'luther_king'),
     ('macarthur', 'macarthur'),
     ('kennedy', 'kennedy'),
 )
 for audio_file in os.listdir('.'):
     lowered_name = audio_file.lower()
     for keyword, speaker in speaker_by_keyword:
         if keyword in lowered_name:
             df_audio.at[speaker, 'filename'] = audio_file
             break
	from __future__ import print_function
	import boto3
	import os
	import time
	import pandas as pd
	import matplotlib as plt
	import logging
	from botocore.exceptions import ClientError
	from datetime import date
	import json
	import seaborn as sns
	import spacy
	import wmd

	# 1) Speaker name -> YouTube URL of the speech to download and analyse.
	dict_urls_youtube = {
	    'churchill': 'https://www.youtube.com/watch?v=s_LncVnecLA',
	    'reagan': 'https://www.youtube.com/watch?v=5MDFX-dNtsM&t=6s',
	    'luther_king': 'https://www.youtube.com/watch?v=I47Y6VHc3Ms',
	    'macarthur': 'https://www.youtube.com/watch?v=_42_aLGkRpg&t=19s',
	    'kennedy': 'https://www.youtube.com/watch?v=QAmHcdwKgtQ',
	}


	# 2) Download each speech in mp3 format using youtube-dl.
	# The loop body is indented under the `for` (it had been flattened to the
	# same level, which does not parse). The command runs without a shell and
	# receives the URL as one argv entry, because URLs such as '...&t=6s'
	# contain '&', which a shell would interpret as the background operator.
	for speaker, url in dict_urls_youtube.items():
	    print(f'Downloading audio file from the link: {url}')
	    try:
	        completed = subprocess.run(
	            ['youtube-dl', '-x', '--audio-format', 'mp3', url],
	            check=False,
	        )
	    except OSError as err:
	        # youtube-dl missing or not executable: report and move on to the
	        # next speech instead of aborting the whole script.
	        print(f'Could not run youtube-dl for {speaker}: {err}')
	        continue
	    if completed.returncode != 0:
	        print(f'youtube-dl exited with status {completed.returncode} '
	              f'for {speaker} ({url})')


	# 3) Create the DataFrame used to store information through the analysis;
	# rows are indexed by the dict keys and the URLs fill 'youtube_urls'.
	df_audio = pd.DataFrame.from_dict(
	    dict_urls_youtube, orient='index', columns=['youtube_urls'])


	# 4) Link the name of each file in the current directory to its speaker.
	# Nesting is restored here: the `for`/`if`/`elif` bodies had been flattened
	# to a single indentation level, which is a syntax error. The mapping goes
	# from a keyword searched in the lower-cased file name to the df_audio row
	# label; keywords are tried in insertion order and the first match wins,
	# mirroring the original if/elif chain.
	row_for_keyword = {
	    'churchill': 'churchill',
	    'reagan': 'reagan',
	    'king': 'luther_king',
	    'macarthur': 'macarthur',
	    'kennedy': 'kennedy',
	}
	for audio_file in os.listdir('.'):
	    name_lower = audio_file.lower()
	    for keyword, row_label in row_for_keyword.items():
	        if keyword in name_lower:
	            df_audio.at[row_label, 'filename'] = audio_file
	            break