|
|
|
''' |
|
Usage, see README.md |
|
''' |
|
|
|
from dotenv import load_dotenv, find_dotenv |
|
_ = load_dotenv(find_dotenv(".env")) # read local .env file |
|
|
|
import os, base64, sys, threading, argparse |
|
import sounddevice as sd |
|
import soundfile as sf |
|
|
|
# Command-line interface: one mandatory option, --audio, naming the clip to analyse.
parser = argparse.ArgumentParser(
    description="User arguments for audio analysis",
)
parser.add_argument(
    '--audio',
    required=True,
    type=str,
    help='Your audio clip path, must be in mp3 format.',
)
args = parser.parse_args()
|
|
|
from openai import OpenAI |
|
# Configure the OpenAI client purely from environment variables; any variable
# that is not set is passed through as None.
_org_id = os.environ.get("OPENAI_ORG_ID")
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url=os.environ.get("OPENAI_PROXY"),
    organization=_org_id,
)

# The O3 review step returns early without an organization id, so tell the user now.
if _org_id is None:
    print('O3 feedback: Ignored, for you did not set the OPENAI_ORG_ID in .env file.')
|
|
|
############################################################ |
|
# Pass 1: ask the audio model for the numeric "Quick Stats" summary (text reply only).
print(f'Input: {args.audio}')

# The clip is sent inline as base64 text, so read the raw bytes and encode them once.
with open(args.audio, "rb") as audio_file:
    audio_bytes = audio_file.read()
audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")

model = 'gpt-4o-audio-preview'
print(f'Model: {model}')

systemPrompt = '''
You are an IELTS English fluency coach using the 4/3/2 exercise method proposed by Paul Nation in "Teaching ESL/EFL Listening and Speaking."

You will analyze the uploaded English audio clips, examining the content to identify key factors affecting spoken fluency.

You should go though the audio clip from beginning to end, then find the most important issue that affects the fluency.

You should only output the Quick Stats, no other information.

OUTPUT TEMPLATE

Quick Stats
• Length (sec): X
• Words per Minute (≈): X
• Silent Pauses ≥0.5 s (count): X
• Fillers (“uh/um/like”) (count): X
• IELTS Score (4-9): X
• CELPIP Score (4-12): X
'''

# The user turn pairs a short instruction with the encoded clip itself.
stats_messages = [
    {"role": "system", "content": systemPrompt},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "This is an audio clip for analysis. Please provide feedback on the spoken fluency."},
            {"type": "input_audio", "input_audio": {"data": audio_base64, "format": "mp3"}},
        ],
    },
]
response = client.chat.completions.create(
    model=model,
    messages=stats_messages,
    modalities=["text"],
    temperature=1,
    max_completion_tokens=2048,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)

# Text-only modality: the answer is the message content.
text = response.choices[0].message.content
print(f'\n{text}')
|
|
|
############################################################ |
|
# Pass 2: ask the audio model for the single most impactful fluency issue and
# request the answer as both text and speech.
model = 'gpt-4o-audio-preview'
systemPrompt = '''
You are an IELTS English fluency coach using the 4/3/2 exercise method proposed by Paul Nation in "Teaching ESL/EFL Listening and Speaking."

You will analyze the uploaded English audio clips, examining the content to identify key factors affecting spoken fluency.

You should go though the audio clip from beginning to end, then find the most important issue that affects the fluency.

Your feedback is specific and targeted, helping users improve their speaking ability in real communication. You provide only one most important piece of feedback.

For each of your feedback, you must give what I said as examples, should never give feedback without example. Then you should provide examples for how to improve it.

Your feedback should focus on fluency, not on accuracy, grammar, or vocabulary.

You should only output the Most Impactful Fluency Issue, no other information.

OUTPUT TEMPLATE

Most Impactful Fluency Issue
Issue Type: “…”
Example: “…”
Better version: “…”
'''

# Gather the request arguments first so the call itself stays a one-liner.
issue_request = dict(
    model=model,
    messages=[
        {"role": "system", "content": systemPrompt},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "This is an audio clip for analysis. Please provide feedback on the spoken fluency."},
                {"type": "input_audio", "input_audio": {"data": audio_base64, "format": "mp3"}},
            ],
        },
    ],
    modalities=["text", "audio"],
    audio={"voice": "alloy", "format": "wav"},
    temperature=1,
    max_completion_tokens=2048,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)
response = client.chat.completions.create(**issue_request)

# Both the printed text (the transcript) and the base64 WAV payload are read
# from the message's audio object.
spoken_reply = response.choices[0].message.audio
text = spoken_reply.transcript
print(f'\n{text}')

audio_data = spoken_reply.data
audio_bytes = base64.b64decode(audio_data)

# Persist the spoken feedback so it can be played back from disk.
with open("response.wav", "wb") as out_file:
    out_file.write(audio_bytes)
|
|
|
def play_audio(file_path):
    """Play the audio file at ``file_path`` and return once playback has finished.

    The file is read with soundfile as float32 samples and played through
    sounddevice at the sample rate reported for the file.
    """
    samples, sample_rate = sf.read(file_path, dtype='float32')
    sd.play(samples, samplerate=sample_rate)
    # Block here until sounddevice reports the clip is done.
    sd.wait()
|
|
|
# Play the saved spoken feedback on its own thread so the script can carry on meanwhile.
play_thread = threading.Thread(
    target=play_audio,
    kwargs={"file_path": './response.wav'},
)
play_thread.start()
|
|
|
############################################################ |
|
def o3_feedback(feedback=None):
    """Ask the o3 model to judge the audio-analysis feedback, streaming its answer.

    Returns immediately, doing nothing, when ``OPENAI_ORG_ID`` is not set in
    the environment.

    Args:
        feedback: The "Most Impactful Fluency Issue" text to be reviewed.
            When omitted, the module-level ``text`` (the result printed by
            the audio-analysis step) is used if it exists and is non-empty;
            otherwise a built-in sample is reviewed, which was the previous
            fixed behaviour.

    Returns:
        None. The streamed answer is written to stdout as it arrives.
    """
    if os.environ.get("OPENAI_ORG_ID") is None:
        return

    print('\nO3 Feedback:')

    # Previously a hard-coded sample shadowed the real analysis result, so o3
    # always reviewed the same canned text. Prefer the caller's text, then the
    # script's own analysis output, and only then the sample.
    if feedback is None:
        feedback = globals().get("text")
    if not feedback:
        feedback = '''
Most Impactful Fluency Issue
Issue Type: Hesitation and Repetition
Example: "We visited, uh, Niagara Falls because, uh, it is very near the, uh, retreat event."
Better version: “We visited Niagara Falls because it's very near the retreat event.”
'''

    # Use O3 to analyze the result.
    model = 'o3'
    systemPrompt = '''
You are an IELTS English fluency coach using the 4/3/2 exercise method proposed by Paul Nation in "Teaching ESL/EFL Listening and Speaking."

You will help me about the 4/3/2 training.

You will response in less than 300 words.

Please answer in Chinese.
'''
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": systemPrompt},
            {"role": "user", "content": [
                {"type": "text", "text": f"{feedback}"},
                {"type": "text", "text": "I received feedback from the audio analysis. Is this feedback reasonable, and should this issue be the main focus?"},
            ]},
        ],
        response_format={"type": "text"},
        reasoning_effort="medium",
        stream=True,
    )

    for chunk in response:
        # A streamed chunk may arrive with an empty ``choices`` list; indexing
        # it would raise IndexError and abort the output mid-answer.
        if not chunk.choices:
            continue
        content = chunk.choices[0].delta.content
        if content:
            # Flush each piece so the answer appears as it is generated.
            sys.stdout.write(content)
            sys.stdout.flush()
|
|
|
# Run the O3 review on its own thread; the playback thread was started earlier,
# so the two can run side by side.
o3_thread = threading.Thread(target=o3_feedback)
o3_thread.start()

############################################################
# Wait for both background workers before finishing with a trailing newline.
for _worker in (play_thread, o3_thread):
    _worker.join()
print()