Created
July 10, 2024 04:46
-
-
Save titipata/efd0d7a1836a6389acfdc4720ca7c0ad to your computer and use it in GitHub Desktop.
Thonburian Whisper demo with Gradio > 4.0.0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import gradio as gr | |
import pytube as pt | |
from transformers import pipeline | |
from huggingface_hub import model_info | |
MODEL_NAME = "biodatlab/whisper-th-medium-combined"  # this always needs to stay in line 8 :D sorry for the hackiness
lang = "th"  # language handed to the tokenizer when building the decoder prompt below

# Run on CUDA device 0 when torch reports CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Module-level speech-recognition pipeline for MODEL_NAME, built once at import
# time and shared by the transcription handlers in this file.
pipe = pipeline(
    model=MODEL_NAME,
    task="automatic-speech-recognition",
    device=device,
    chunk_length_s=30,
)

# Install the tokenizer's decoder prompt ids for `lang` + "transcribe" on the
# model config.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
    language=lang,
    task="transcribe",
)
def transcribe(microphone, file_upload):
    """Transcribe a microphone recording or an uploaded audio file.

    Args:
        microphone: Audio recorded from the microphone, or ``None`` when
            nothing was recorded.
        file_upload: Uploaded audio, or ``None`` when nothing was uploaded.

    Returns:
        The transcribed text. If both inputs are provided, the microphone
        recording is transcribed and the text is prefixed with a warning.
        If neither is provided, an error message string is returned and the
        model is not invoked.
    """
    has_microphone = microphone is not None
    has_upload = file_upload is not None

    # Guard clause: nothing to transcribe.
    if not (has_microphone or has_upload):
        return "ERROR: You have to either use the microphone or upload an audio file"

    notice = ""
    if has_microphone and has_upload:
        notice = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording takes precedence over the upload.
    audio_input = microphone if has_microphone else file_upload
    result = pipe(
        audio_input,
        generate_kwargs={"language": "<|th|>", "task": "transcribe"},
        batch_size=16,
    )
    return notice + result["text"]
def _return_yt_html_embed(yt_url): | |
video_id = yt_url.split("?v=")[-1] | |
HTML_str = ( | |
f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>' | |
" </center>" | |
) | |
return HTML_str | |
def yt_transcribe(yt_url):
    """Download the audio of a YouTube video and transcribe it.

    Args:
        yt_url: URL of the YouTube video.

    Returns:
        A ``(html_embed, text)`` tuple: the iframe HTML for the video and the
        transcription text produced by the pipeline.

    Raises:
        gr.Error: If the video exposes no audio-only stream.
    """
    # Imported at function scope so the top-of-file layout is left untouched.
    import tempfile

    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)

    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        raise gr.Error("No audio-only stream is available for this YouTube video.")

    # Download into a per-call temporary directory: a fixed "audio.mp3" in the
    # working directory would be overwritten by concurrent requests and was
    # never removed afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = stream.download(output_path=tmp_dir, filename="audio.mp3")
        text = pipe(
            audio_path,
            generate_kwargs={"language": "<|th|>", "task": "transcribe"},
            batch_size=16,
        )["text"]

    return html_embed_str, text
# Two-tab Gradio UI: one tab for microphone/file transcription, one for YouTube.
# NOTE(review): confirm `gr.themes.HuggingFace` exists in the installed Gradio
# release; it is kept exactly as the original had it.
with gr.Blocks(theme=gr.themes.HuggingFace()) as demo:
    gr.Markdown("# Whisper Demo Thai 🇹🇭")

    with gr.Tab("Transcribe Audio"):
        gr.Markdown(
            "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the fine-tuned"
            f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
            " of arbitrary length."
        )
        with gr.Row():
            with gr.Column():
                # Gradio 4 replaced the `source` string with a `sources` list.
                audio_mic = gr.Audio(sources=["microphone"], type="filepath", label="Microphone Input")
                audio_file = gr.Audio(sources=["upload"], type="filepath", label="Audio File Upload")
            with gr.Column():
                text_output = gr.Textbox(label="Transcription Output")
        transcribe_btn = gr.Button("Transcribe")
        transcribe_btn.click(fn=transcribe, inputs=[audio_mic, audio_file], outputs=text_output)

    with gr.Tab("Transcribe YouTube"):
        gr.Markdown(
            "Transcribe long-form YouTube videos with the click of a button! Demo uses the fine-tuned checkpoint:"
            f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
            " arbitrary length."
        )
        with gr.Row():
            with gr.Column():
                yt_url_input = gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")
            with gr.Column():
                yt_html_output = gr.HTML(label="Video")
                yt_text_output = gr.Textbox(label="Transcription Output")
        yt_transcribe_btn = gr.Button("Transcribe YouTube Video")
        yt_transcribe_btn.click(fn=yt_transcribe, inputs=yt_url_input, outputs=[yt_html_output, yt_text_output])

# `launch(enable_queue=True)` is not accepted by Gradio 4; enable the request
# queue through `queue()` instead, then start the server.
demo.queue()
demo.launch()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment