Last active
September 24, 2019 14:29
-
-
Save GabrielSGoncalves/2997b36db55763de8ff6d0c292c1f268 to your computer and use it in GitHub Desktop.
First part of the NLP analysis for the Medium article on AWS ML/AI tools
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function

import json
import logging
import os
import subprocess
import time
from datetime import date

import boto3
import matplotlib as plt
import pandas as pd
import seaborn as sns
import spacy
import wmd
from botocore.exceptions import ClientError
# 1) Create a dictionary with the URL of each speech on YouTube.
#    Keys are the speaker identifiers used as row labels later on; values are
#    the video URLs (some carry a "&t=..." start-time query parameter).
dict_urls_youtube = {
    'churchill': 'https://www.youtube.com/watch?v=s_LncVnecLA',
    'reagan': 'https://www.youtube.com/watch?v=5MDFX-dNtsM&t=6s',
    'luther_king': 'https://www.youtube.com/watch?v=I47Y6VHc3Ms',
    'macarthur': 'https://www.youtube.com/watch?v=_42_aLGkRpg&t=19s',
    'kennedy': 'https://www.youtube.com/watch?v=QAmHcdwKgtQ',
}
# 2) Download each speech in mp3 format using youtube-dl.
#    The command is passed as an argument list (no shell), so the "&" inside
#    URLs such as "...&t=6s" reaches youtube-dl intact instead of being
#    interpreted by the shell as "run in background, then execute t=6s".
#    NOTE: if the youtube-dl executable is not installed, subprocess.run
#    raises FileNotFoundError.
for url in dict_urls_youtube.values():
    print(f'Downloading audio file from the link: {url}')
    completed = subprocess.run(
        ['youtube-dl', '-x', '--audio-format', 'mp3', url],
        check=False,  # keep going with the remaining URLs on failure
    )
    if completed.returncode != 0:
        print(f'youtube-dl exited with status {completed.returncode} '
              f'for the link: {url}')
# 3) Create a DataFrame to store the information through the analysis.
#    One row per key of dict_urls_youtube (used as the index) and a single
#    'youtube_urls' column holding the corresponding URL.
df_audio = pd.DataFrame.from_dict(
    dict_urls_youtube,
    orient='index',
    columns=['youtube_urls'],
)
# 4) Link the name of each downloaded audio file to its speaker.
#    Ordered (keyword, speaker) pairs: the first keyword found in the
#    lower-cased file name wins, exactly like an if/elif chain would.
speaker_keywords = (
    ('churchill', 'churchill'),
    ('reagan', 'reagan'),
    ('king', 'luther_king'),
    ('macarthur', 'macarthur'),
    ('kennedy', 'kennedy'),
)
# Sorted so that, when several files match the same speaker, the file that
# ends up recorded does not depend on the arbitrary order of os.listdir().
for audio_file in sorted(os.listdir('.')):
    lowered_name = audio_file.lower()
    # Only mp3 files are candidates: without this filter any file in the
    # working directory whose name merely contains a keyword (this script,
    # a leftover partial download, "working_notes.txt" for 'king', ...)
    # would be stored as that speaker's audio file.
    if not lowered_name.endswith('.mp3'):
        continue
    for keyword, speaker in speaker_keywords:
        if keyword in lowered_name:
            # .at creates the 'filename' column on first assignment.
            df_audio.at[speaker, 'filename'] = audio_file
            break
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment