winlinvip · June 8, 2025 12:36
diff --git a/.env b/.env
 # Required.
 OPENAI_API_KEY=your-key
 # Optional. For O3 feedback, you need to verify your organization at https://platform.openai.com/settings/organization/general.
 OPENAI_ORG_ID=your-verified-organization
 # Optional.
 OPENAI_PROXY=https://api.openai.com/v1
diff --git a/README.md b/README.md
diff --git a/main.py b/main.py

 '''
 Usage, see README.md
 '''

 from dotenv import load_dotenv, find_dotenv
 _ = load_dotenv(find_dotenv(".env")) # read local .env file

 import os, base64, sys, threading, argparse
 import sounddevice as sd
 import soundfile as sf

 # Command-line interface: the single required --audio option is the path of the clip to analyze.
 parser = argparse.ArgumentParser(description="User arguments for audio analysis")
 parser.add_argument('--audio', type=str, required=True, help='Your audio clip path, must be in mp3 format.')
 args = parser.parse_args()

 # Build the API client from environment variables (load_dotenv above reads the .env file).
 # os.environ.get returns None for an unset variable, so unset options are passed as None.
 from openai import OpenAI
 client = OpenAI(
  api_key=os.environ.get("OPENAI_API_KEY"),
  base_url=os.environ.get("OPENAI_PROXY"),
  organization=os.environ.get("OPENAI_ORG_ID"),
 )

 # Tell the user up front when no organization ID is configured; o3_feedback() returns early in that case.
 if os.environ.get("OPENAI_ORG_ID") is None:
  print('O3 feedback: Ignored, for you did not set the OPENAI_ORG_ID in .env file.')

 ############################################################
 # Read the whole clip and base64-encode it; the encoded string is sent as the
 # input_audio data of both audio requests in this script.
 print(f'Input: {args.audio}')
 with open(args.audio, "rb") as audio_file:
    audio_bytes = audio_file.read()
    audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")

 model = 'gpt-4o-audio-preview'
 print(f'Model: {model}')

 # First request: the system prompt asks for the "Quick Stats" template only,
 # and the reply is requested as text (modalities=["text"]).
 systemPrompt = '''
 You are an IELTS English fluency coach using the 4/3/2 exercise method proposed by Paul Nation in "Teaching ESL/EFL Listening and Speaking."

 You will analyze the uploaded English audio clips, examining the content to identify key factors affecting spoken fluency.

 You should go though the audio clip from beginning to end, then find the most important issue that affects the fluency.

 You should only output the Quick Stats, no other information.

 OUTPUT TEMPLATE

 Quick Stats
 • Length (sec): X
 • Words per Minute (≈): X
 • Silent Pauses ≥0.5 s (count): X
 • Fillers (“uh/um/like”) (count): X
 • IELTS Score (4-9): X
 • CELPIP Score (4-12): X
 '''
 response = client.chat.completions.create(
  model=model,
  messages=[
    {"role": "system", "content": systemPrompt},
    {"role": "user", "content": [
        {"type": "text", "text": "This is an audio clip for analysis. Please provide feedback on the spoken fluency."},
        {"type": "input_audio", "input_audio": { "data": audio_base64, "format": "mp3" }},
    ]}
  ],
  modalities=["text"],
  temperature=1,
  max_completion_tokens=2048,
  top_p=1,
  frequency_penalty=0,
  presence_penalty=0
 )

 # Print the text content of the first choice.
 text = response.choices[0].message.content
 print(f'\n{text}')

 ############################################################
 # Second request: same clip, but the system prompt asks for the single most
 # impactful fluency issue, and the reply is requested as text plus spoken audio.
 model = 'gpt-4o-audio-preview'
 systemPrompt = '''
 You are an IELTS English fluency coach using the 4/3/2 exercise method proposed by Paul Nation in "Teaching ESL/EFL Listening and Speaking."

 You will analyze the uploaded English audio clips, examining the content to identify key factors affecting spoken fluency.

 You should go though the audio clip from beginning to end, then find the most important issue that affects the fluency.

 Your feedback is specific and targeted, helping users improve their speaking ability in real communication. You provide only one most important piece of feedback.

 For each of your feedback, you must give what I said as examples, should never give feedback without example. Then you should provide examples for how to improve it.

 Your feedback should focus on fluency, not on accuracy, grammar, or vocabulary.

 You should only output the Most Impactful Fluency Issue, no other information.

 OUTPUT TEMPLATE

 Most Impactful Fluency Issue
 Issue Type: “…”
 Example: “…”
 Better version: “…”
 '''
 response = client.chat.completions.create(
  model=model,
  messages=[
    {"role": "system", "content": systemPrompt},
    {"role": "user", "content": [
        {"type": "text", "text": "This is an audio clip for analysis. Please provide feedback on the spoken fluency."},
        {"type": "input_audio", "input_audio": { "data": audio_base64, "format": "mp3" }},
    ]}
  ],
  modalities=["text", "audio"],
  audio= {
    "voice": "alloy",
    "format": "wav"
  },
  temperature=1,
  max_completion_tokens=2048,
  top_p=1,
  frequency_penalty=0,
  presence_penalty=0
 )

 # For this request the printed text is read from the transcript of the audio reply.
 text = response.choices[0].message.audio.transcript
 print(f'\n{text}')

 # The audio payload is base64 text; decode it and save it as response.wav
 # (a relative path, so it lands in the current working directory).
 audio_data = response.choices[0].message.audio.data
 audio_bytes = base64.b64decode(audio_data)

 with open("response.wav", "wb") as out_file:
    out_file.write(audio_bytes)

 def play_audio(file_path):
    '''Play the audio file at file_path and return once playback has finished.'''
    samples, sample_rate = sf.read(file_path, dtype='float32')
    sd.play(samples, sample_rate)
    # Block the calling thread until the playback started above is done.
    sd.wait()

 # Play the saved reply on a worker thread; it is joined at the end of the script.
 play_thread = threading.Thread(target=play_audio, args=('./response.wav',))
 play_thread.start()

 ############################################################
 def o3_feedback(feedback=None):
     '''
     Ask the o3 model whether the fluency feedback is reasonable and stream its answer to stdout.

     feedback: The feedback text to review. When omitted, the module-level `text`
         (the transcript printed by the audio analysis) is used, so the review is
         about the clip that was actually analyzed rather than a canned sample.

     Returns None. Does nothing when OPENAI_ORG_ID is not set in the environment.
     '''
     if os.environ.get("OPENAI_ORG_ID") is None:
         return

     # Default to the feedback produced by the audio analysis for this run.
     if feedback is None:
         feedback = text

     print('\nO3 Feedback:')

     # Use O3 to analyze the result.
     model = 'o3'
     systemPrompt = '''
  You are an IELTS English fluency coach using the 4/3/2 exercise method proposed by Paul Nation in "Teaching ESL/EFL Listening and Speaking." 

  You will help me about the 4/3/2 training.

  You will response in less than 300 words.

  Please answer in Chinese.
  '''
     response = client.chat.completions.create(
         model=model,
         messages=[
             {"role": "system", "content": systemPrompt},
             {"role": "user", "content": [
                 {"type": "text", "text": f"{feedback}"},
                 {"type": "text", "text": "I received feedback from the audio analysis. Is this feedback reasonable, and should this issue be the main focus?"},
             ]}
         ],
         response_format={"type": "text"},
         reasoning_effort="medium",
         stream=True,
     )

     # Write each streamed piece as soon as it arrives.
     for chunk in response:
         # A streamed chunk may carry an empty choices list (seen with some
         # endpoints/proxies); skip it instead of failing on choices[0].
         if not chunk.choices:
             continue
         content = chunk.choices[0].delta.content
         if content:
             sys.stdout.write(content)
             sys.stdout.flush()

 # Run the O3 review on its own thread; play_thread was started before it.
 o3_thread = threading.Thread(target=o3_feedback, args=())
 o3_thread.start()

 ############################################################
 # Wait for both worker threads to finish, then print a final newline.
 play_thread.join()
 o3_thread.join()
 print('')
diff --git a/requirements.txt b/requirements.txt
 annotated-types==0.7.0
 anyio==4.9.0
 certifi==2025.4.26
 cffi==1.17.1
 distro==1.9.0
 h11==0.16.0
 httpcore==1.0.9
 httpx==0.28.1
 idna==3.10
 jiter==0.10.0
 numpy==2.3.0
 openai==1.84.0
 pycparser==2.22
 pydantic==2.11.5
 pydantic_core==2.33.2
 python-dotenv==1.1.0
 sniffio==1.3.1
 sounddevice==0.5.2
 soundfile==0.13.1
 tqdm==4.67.1
 typing-inspection==0.4.1
 typing_extensions==4.14.0
	# Required.
	OPENAI_API_KEY=your-key
	# Optional. For O3 feedback, you need to verify your organization at https://platform.openai.com/settings/organization/general.
	OPENAI_ORG_ID=your-verified-organization
	# Optional.
	OPENAI_PROXY=https://api.openai.com/v1

	'''
	Usage, see README.md
	'''

	from dotenv import load_dotenv, find_dotenv
	_ = load_dotenv(find_dotenv(".env")) # read local .env file

	import os, base64, sys, threading, argparse
	import sounddevice as sd
	import soundfile as sf

	# Command-line interface: the single required --audio option is the path of the clip to analyze.
	parser = argparse.ArgumentParser(description="User arguments for audio analysis")
	parser.add_argument('--audio', type=str, required=True, help='Your audio clip path, must be in mp3 format.')
	args = parser.parse_args()

	# Build the API client from environment variables; os.environ.get returns None
	# for an unset variable, so unset options are passed as None.
	from openai import OpenAI
	client = OpenAI(
	    api_key=os.environ.get("OPENAI_API_KEY"),
	    base_url=os.environ.get("OPENAI_PROXY"),
	    organization=os.environ.get("OPENAI_ORG_ID"),
	)

	# Tell the user up front when no organization ID is configured.
	if os.environ.get("OPENAI_ORG_ID") is None:
	    print('O3 feedback: Ignored, for you did not set the OPENAI_ORG_ID in .env file.')

	############################################################
	# Read the whole clip and base64-encode it; the encoded string is sent as the
	# input_audio data of the audio requests.
	print(f'Input: {args.audio}')
	with open(args.audio, "rb") as audio_file:
	    audio_bytes = audio_file.read()
	    audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")

	model = 'gpt-4o-audio-preview'
	print(f'Model: {model}')

	# First request: the system prompt asks for the "Quick Stats" template only,
	# and the reply is requested as text (modalities=["text"]).
	systemPrompt = '''
	You are an IELTS English fluency coach using the 4/3/2 exercise method proposed by Paul Nation in "Teaching ESL/EFL Listening and Speaking."

	You will analyze the uploaded English audio clips, examining the content to identify key factors affecting spoken fluency.

	You should go though the audio clip from beginning to end, then find the most important issue that affects the fluency.

	You should only output the Quick Stats, no other information.

	OUTPUT TEMPLATE

	Quick Stats
	• Length (sec): X
	• Words per Minute (≈): X
	• Silent Pauses ≥0.5 s (count): X
	• Fillers (“uh/um/like”) (count): X
	• IELTS Score (4-9): X
	• CELPIP Score (4-12): X
	'''
	response = client.chat.completions.create(
	    model=model,
	    messages=[
	        {"role": "system", "content": systemPrompt},
	        {"role": "user", "content": [
	            {"type": "text", "text": "This is an audio clip for analysis. Please provide feedback on the spoken fluency."},
	            {"type": "input_audio", "input_audio": { "data": audio_base64, "format": "mp3" }},
	        ]}
	    ],
	    modalities=["text"],
	    temperature=1,
	    max_completion_tokens=2048,
	    top_p=1,
	    frequency_penalty=0,
	    presence_penalty=0
	)

	# Print the text content of the first choice.
	text = response.choices[0].message.content
	print(f'\n{text}')

	############################################################
	# Second request: same clip, but the system prompt asks for the single most
	# impactful fluency issue, and the reply is requested as text plus spoken audio.
	model = 'gpt-4o-audio-preview'
	systemPrompt = '''
	You are an IELTS English fluency coach using the 4/3/2 exercise method proposed by Paul Nation in "Teaching ESL/EFL Listening and Speaking."

	You will analyze the uploaded English audio clips, examining the content to identify key factors affecting spoken fluency.

	You should go though the audio clip from beginning to end, then find the most important issue that affects the fluency.

	Your feedback is specific and targeted, helping users improve their speaking ability in real communication. You provide only one most important piece of feedback.

	For each of your feedback, you must give what I said as examples, should never give feedback without example. Then you should provide examples for how to improve it.

	Your feedback should focus on fluency, not on accuracy, grammar, or vocabulary.

	You should only output the Most Impactful Fluency Issue, no other information.

	OUTPUT TEMPLATE

	Most Impactful Fluency Issue
	Issue Type: “…”
	Example: “…”
	Better version: “…”
	'''
	response = client.chat.completions.create(
	    model=model,
	    messages=[
	        {"role": "system", "content": systemPrompt},
	        {"role": "user", "content": [
	            {"type": "text", "text": "This is an audio clip for analysis. Please provide feedback on the spoken fluency."},
	            {"type": "input_audio", "input_audio": { "data": audio_base64, "format": "mp3" }},
	        ]}
	    ],
	    modalities=["text", "audio"],
	    audio= {
	        "voice": "alloy",
	        "format": "wav"
	    },
	    temperature=1,
	    max_completion_tokens=2048,
	    top_p=1,
	    frequency_penalty=0,
	    presence_penalty=0
	)

	# For this request the printed text is read from the transcript of the audio reply.
	text = response.choices[0].message.audio.transcript
	print(f'\n{text}')

	# The audio payload is base64 text; decode it and save it as response.wav
	# (a relative path, so it lands in the current working directory).
	audio_data = response.choices[0].message.audio.data
	audio_bytes = base64.b64decode(audio_data)

	with open("response.wav", "wb") as out_file:
	    out_file.write(audio_bytes)

	def play_audio(file_path):
	    '''Play the audio file at file_path and return once playback has finished.'''
	    data, fs = sf.read(file_path, dtype='float32')
	    sd.play(data, fs)
	    # Block the calling thread until the playback started above is done.
	    sd.wait()

	# Play the saved reply on a worker thread; it is joined at the end of the script.
	play_thread = threading.Thread(target=play_audio, args=('./response.wav',))
	play_thread.start()

	############################################################
	def o3_feedback(feedback=None):
	    '''
	    Ask the o3 model whether the fluency feedback is reasonable and stream its answer to stdout.

	    feedback: The feedback text to review. When omitted, the module-level `text`
	        (the transcript printed by the audio analysis) is used, so the review is
	        about the clip that was actually analyzed rather than a canned sample.

	    Returns None. Does nothing when OPENAI_ORG_ID is not set in the environment.
	    '''
	    if os.environ.get("OPENAI_ORG_ID") is None:
	        return

	    # Default to the feedback produced by the audio analysis for this run.
	    if feedback is None:
	        feedback = text

	    print('\nO3 Feedback:')

	    # Use O3 to analyze the result.
	    model = 'o3'
	    systemPrompt = '''
	You are an IELTS English fluency coach using the 4/3/2 exercise method proposed by Paul Nation in "Teaching ESL/EFL Listening and Speaking."

	You will help me about the 4/3/2 training.

	You will response in less than 300 words.

	Please answer in Chinese.
	'''
	    response = client.chat.completions.create(
	        model=model,
	        messages=[
	            {"role": "system", "content": systemPrompt},
	            {"role": "user", "content": [
	                {"type": "text", "text": f"{feedback}"},
	                {"type": "text", "text": "I received feedback from the audio analysis. Is this feedback reasonable, and should this issue be the main focus?"},
	            ]}
	        ],
	        response_format={"type": "text"},
	        reasoning_effort="medium",
	        stream=True,
	    )

	    # Write each streamed piece as soon as it arrives.
	    for chunk in response:
	        # A streamed chunk may carry an empty choices list (seen with some
	        # endpoints/proxies); skip it instead of failing on choices[0].
	        if not chunk.choices:
	            continue
	        content = chunk.choices[0].delta.content
	        if content:
	            sys.stdout.write(content)
	            sys.stdout.flush()

	# Run the O3 review on its own thread; play_thread was started before it.
	o3_thread = threading.Thread(target=o3_feedback, args=())
	o3_thread.start()

	############################################################
	# Wait for both worker threads to finish, then print a final newline.
	play_thread.join()
	o3_thread.join()
	print('')
	annotated-types==0.7.0
	anyio==4.9.0
	certifi==2025.4.26
	cffi==1.17.1
	distro==1.9.0
	h11==0.16.0
	httpcore==1.0.9
	httpx==0.28.1
	idna==3.10
	jiter==0.10.0
	numpy==2.3.0
	openai==1.84.0
	pycparser==2.22
	pydantic==2.11.5
	pydantic_core==2.33.2
	python-dotenv==1.1.0
	sniffio==1.3.1
	sounddevice==0.5.2
	soundfile==0.13.1
	tqdm==4.67.1
	typing-inspection==0.4.1
	typing_extensions==4.14.0