Last active
September 18, 2019 16:41
-
-
Save GabrielSGoncalves/adf68fd40ae8b4b9e8644a8f80e02e4e to your computer and use it in GitHub Desktop.
Second part of the NLP analysis for the Medium article on AWS ML/AI tools
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 5) Create the S3 bucket that will hold the audio files.
# `bucket_name` and `client_s3` are reused by the later steps of this script.
bucket_name = 'medium-nlp-aws'
client_s3 = boto3.client(service_name='s3')
client_s3.create_bucket(
    Bucket=bucket_name,
)
# 6) Upload every file listed in df_audio['filename'] to the bucket.
# The local file name doubles as the object key.
for local_name in df_audio['filename']:
    print(local_name)
    client_s3.upload_file(
        Filename=local_name,
        Bucket=bucket_name,
        Key=local_name,
    )
# 7) Define the file URLs on the bucket using S3 convention for file paths.
# The URL is built purely from the bucket name and the object key, so no
# per-row S3 request is needed (the previous get_bucket_location call created
# a new client on every iteration and its result was never used).
for index, filename in df_audio['filename'].items():
    # Spaces in the key are written as '+' in the object URL.
    object_url = f"https://{bucket_name}.s3.amazonaws.com/{filename.replace(' ', '+')}"
    df_audio.at[index, 'url'] = object_url
    print(object_url)
# 8) Function to start Amazon Transcribe job
def start_transcription(bucket, job_name, file_url, wait_process=True,
                        media_format='mp3', language_code='en-US',
                        poll_interval=20):
    """Start an Amazon Transcribe job and optionally wait for it to end.

    Args:
        bucket: Name of the S3 bucket passed as ``OutputBucketName``.
        job_name: Value passed as ``TranscriptionJobName``.
        file_url: Location of the media file, passed as ``MediaFileUri``.
        wait_process: When true, poll the job until its status is
            ``COMPLETED`` or ``FAILED`` before returning.
        media_format: Value passed as ``MediaFormat`` (default ``'mp3'``).
        language_code: Value passed as ``LanguageCode`` (default ``'en-US'``).
        poll_interval: Seconds to sleep between status checks.

    Returns:
        The last ``get_transcription_job`` response when ``wait_process`` is
        true, otherwise ``None`` (the job is left running in the background).
    """
    client_transcribe = boto3.client('transcribe')
    client_transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': file_url},
        MediaFormat=media_format,
        LanguageCode=language_code,
        OutputBucketName=bucket)

    # Fire-and-forget mode: there is no status to report yet, so return
    # explicitly instead of falling through to an unbound `status`.
    if not wait_process:
        return None

    while True:
        status = client_transcribe.get_transcription_job(TranscriptionJobName=job_name)
        job_state = status['TranscriptionJob']['TranscriptionJobStatus']
        if job_state in ('COMPLETED', 'FAILED'):
            break
        print("Not ready yet...")
        time.sleep(poll_interval)

    if job_state == 'FAILED':
        # Do not report a failed job as a finished transcription.
        reason = status['TranscriptionJob'].get('FailureReason', 'unknown reason')
        print(f'Transcription failed: {reason}')
    else:
        print('Transcription finished')
    return status
# 9) Iterate over the audio files URLs on S3 and call start_transcription
today = date.today().strftime("%d%m%Y")
for index, row in df_audio.iterrows():
    # Build the job name once and reuse it everywhere. The JSON key recorded
    # below is derived from the job name, so the name given to Transcribe must
    # match it exactly. The previous call appended an undefined `var` suffix,
    # which raised NameError and disagreed with the recorded file name.
    job_name = f'{index}_speech_{today}'
    print(job_name, row.url)
    # Jobs are started without waiting so that they run concurrently.
    start_transcription(bucket_name, job_name, row.url, wait_process=False)
    df_audio.at[index, 'transcription_url'] = f"https://{bucket_name}.s3.amazonaws.com/{job_name}.json"
    df_audio.at[index, 'json_transcription'] = f"{job_name}.json"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment