Created
June 22, 2019 18:49
-
-
Save shotasenga/e5f6951ac194dcffcb24c36920e86463 to your computer and use it in GitHub Desktop.
Create a transcript from an MP3 file using the GCP Speech-to-Text API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import wave | |
import tempfile | |
from pydub import AudioSegment | |
from google.cloud import speech | |
from google.cloud.speech import enums | |
from google.cloud.speech import types | |
# Scratch directory used by mp3_to_wav for the wav files it creates.
# It is created once, as a side effect of importing this module; nothing in
# the visible code removes it afterwards.
WORK_DIR = tempfile.mkdtemp()
def mp3_to_text(fpath):
    """
    Transcribe an mp3 file by using the GCP Speech-to-Text API.

    The mp3 is converted to a wav file with mp3_to_wav, and that wav file
    is then handed to wav_to_text.

    Params:
        (str) fpath: path to the mp3 file
    Returns:
        (str[]) recognized texts
    """
    return wav_to_text(mp3_to_wav(fpath))
def wav_to_text(fpath):
    """
    Transcribe a wave file by using the GCP Speech-to-Text API.

    The whole file is read into memory and sent in one recognize request,
    configured as LINEAR16 audio in US English with the frame rate taken
    from the file itself.

    NOTE(review): a single synchronous recognize request is presumably
    only suitable for short recordings -- confirm against the API limits.

    Params:
        (str) fpath: path to the wav file
    Returns:
        (str[]) transcript of the first alternative of each result
    """
    sample_rate = get_frame_rate(fpath)
    stt_client = speech.SpeechClient()

    with open(fpath, 'rb') as wav_file:
        audio = types.RecognitionAudio(content=wav_file.read())

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        language_code='en-US'
    )

    response = stt_client.recognize(config, audio)
    # Only the first alternative of every result is kept.
    return [result.alternatives[0].transcript for result in response.results]
def get_frame_rate(fpath):
    """
    Read the frame rate (sampling frequency) of a wave file.

    Params:
        (str) fpath: path to a wav file
    Returns:
        (int) the frame rate
    """
    with wave.open(fpath, 'rb') as wav_reader:
        return wav_reader.getframerate()
def mp3_to_wav(fpath):
    """
    Convert an mp3 file to a mono wav file and return the new file's path.

    The wav file is written into WORK_DIR and named after the input file,
    with its extension replaced by ".wav".

    Params:
        (str) fpath: path to the mp3 file
    Returns:
        (str) Created wav file path
    """
    # Swap the extension with splitext rather than a case-sensitive ".mp3"
    # regex, so names such as "a.MP3" also end up with a ".wav" suffix.
    stem = os.path.splitext(os.path.basename(fpath))[0]
    # WORK_DIR comes from tempfile.mkdtemp() and has no trailing separator;
    # plain string concatenation would create the file next to the temp
    # directory instead of inside it.
    new_fpath = os.path.join(WORK_DIR, stem + ".wav")

    sound = AudioSegment.from_mp3(fpath)
    # Downmix to a single channel before exporting.
    sound = sound.set_channels(1)
    sound.export(new_fpath, format="wav", parameters=["-ac", "1"])
    return new_fpath
if __name__ == "__main__":
    import sys

    # Transcribe the mp3 given as the first command-line argument; without
    # an argument, fall back to the sample path the script always used.
    mp3_path = sys.argv[1] if len(sys.argv) > 1 else "audio/002/2.mp3"
    print(mp3_to_text(mp3_path))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]

[packages]
# The script imports google.cloud.speech.enums and google.cloud.speech.types,
# which were removed in google-cloud-speech 2.0, so stay on the 1.x series.
google-cloud-speech = "<2.0.0"
pydub = "*"

[requires]
python_version = "3.7"
Author
shotasenga
commented
Jun 22, 2019
- https://cloud.google.com/speech-to-text/docs/reference/libraries
- https://towardsdatascience.com/how-to-use-google-speech-to-text-api-to-transcribe-long-audio-files-1c886f4eb3e9
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment