Skip to content

Instantly share code, notes, and snippets.

@shotasenga
Created June 22, 2019 18:49
Show Gist options
  • Save shotasenga/e5f6951ac194dcffcb24c36920e86463 to your computer and use it in GitHub Desktop.
Save shotasenga/e5f6951ac194dcffcb24c36920e86463 to your computer and use it in GitHub Desktop.
Create a transcript from an MP3 file using the GCP Speech-to-Text API
import os
import re
import wave
import tempfile
from pydub import AudioSegment
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
# Scratch directory that mp3_to_wav uses when building its output wav path.
# Created once, at import time; nothing in the code shown here removes it.
WORK_DIR = tempfile.mkdtemp()
def mp3_to_text(fpath):
    """
    Text from an mp3 file by using GCP Speech to text API

    The mp3 is first converted to a wav file (mp3_to_wav) and that wav
    file is then transcribed (wav_to_text).

    Params:
        (str) fpath: path to the mp3 file
    Returns:
        (str[]) recognized texts, as returned by wav_to_text
    """
    return wav_to_text(mp3_to_wav(fpath))
def wav_to_text(fpath):
    """
    Text from a wave file by using GCP Speech to text API

    The whole file is read into memory and sent in a single recognize
    request, declared as LINEAR16 / en-US at the file's own frame rate.

    Params:
        (str) fpath: path to the wav file
    Returns:
        (str[]) recognized texts: the transcript of the first alternative
        of every result in the response, in response order
    """
    sample_rate = get_frame_rate(fpath)
    speech_client = speech.SpeechClient()

    with open(fpath, 'rb') as wav_file:
        audio = types.RecognitionAudio(content=wav_file.read())

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        language_code='en-US'
    )
    response = speech_client.recognize(config, audio)

    # Only the first alternative of each result is kept.
    return [result.alternatives[0].transcript for result in response.results]
def get_frame_rate(fpath):
    """
    Get frame rate of a wave file

    Params:
        (str | os.PathLike) fpath: path to a wav file. Anything that is
        neither a str nor path-like is handed to wave.open unchanged, so an
        already-open binary file object keeps working.
    Returns:
        (int) the frame rate reported by the wave module for this file
    Raises:
        wave.Error: if the wave module cannot parse the file as wav
        OSError: if the path cannot be opened
    """
    # wave.open only opens the file itself when it is given a str; other
    # objects are treated as file objects. Convert path-like objects (e.g.
    # pathlib.Path) to str so they are opened rather than misread.
    source = os.fspath(fpath) if isinstance(fpath, os.PathLike) else fpath
    with wave.open(source, 'rb') as wav_file:
        return wav_file.getframerate()
def mp3_to_wav(fpath):
    """
    Convert mp3 to a single-channel wav and returns the path of the created file

    The wav file is written inside WORK_DIR and is named after the input
    file with its extension replaced by ".wav".

    Params:
        (str) fpath: path to the mp3 file
    Returns:
        (str) Created wav file path
    """
    # splitext swaps the extension regardless of its case (".MP3" too) and
    # guarantees the output name ends in ".wav".
    stem, _ = os.path.splitext(os.path.basename(fpath))

    # os.path.join supplies the path separator. WORK_DIR comes from
    # tempfile.mkdtemp(), whose result has no trailing separator, so plain
    # string concatenation would create the file *beside* the directory
    # (e.g. "/tmp/tmpabc2.wav") instead of inside it.
    new_fpath = os.path.join(WORK_DIR, stem + ".wav")

    sound = AudioSegment.from_mp3(fpath)
    sound = sound.set_channels(1)
    sound.export(new_fpath, format="wav", parameters=["-ac", "1"])
    return new_fpath
if __name__ == "__main__":
    # Demo run: transcribe a sample mp3 and print the list of recognized texts.
    sample_mp3 = "audio/002/2.mp3"
    print(mp3_to_text(sample_mp3))
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true
[dev-packages]
[packages]
google-cloud-speech = "*"
pydub = "*"
[requires]
python_version = "3.7"