voice recognition phonemes recognition vosk
===================================vosk=======================================
vosk-android-demo:
https://github.com/alphacep/vosk-android-demo
training:
https://alphacephei.com/vosk/lm
chinese model:
https://github.com/alphacep/vosk-api/issues/318
all models:
https://alphacephei.com/vosk/models
coding by voice:
https://www.youtube.com/watch?v=Qk1mGbIJx3s
stream-live-android-audio-to-server:
https://stackoverflow.com/questions/15349987/stream-live-android-audio-to-server
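vosk also has a python API; a minimal offline-recognition sketch, assuming the vosk pip package and a model unpacked to ./model (e.g. a small chinese model from https://alphacephei.com/vosk/models), with a 16kHz mono 16-bit PCM wav:
######################vosk_test.py (sketch) begin:##################
import wave, json
from vosk import Model, KaldiRecognizer

wf = wave.open('a.wav', 'rb')                    # 16kHz mono 16-bit PCM
model = Model('model')
rec = KaldiRecognizer(model, wf.getframerate())
while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    rec.AcceptWaveform(data)
print(json.loads(rec.FinalResult())['text'])
######################vosk_test.py (sketch) end##################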
===================================whisper=======================================
$ git clone --depth 1 https://github.com/ggerganov/whisper.cpp
$ bash ./models/download-ggml-model.sh base   # base.en for english only, base for other languages
$ make
$ ecasound -f:16,1,16000 -i alsa -o a.wav
$ ./main -l zh -m ./models/ggml-base.bin -f a.wav
$ make stream
$ ./stream -l zh -m ./models/ggml-base.bin -t 8 --step 1000 --length 2000
中文漢字
( 字幕:J Chong )
絕對一致
起来,不愿做努力的人们。
# output is slow (results arrive with a 3-second delay) and messy (the middle line "( 字幕:J Chong )" appeared while I stayed silent and said nothing)
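the same transcription can also be done from python with the openai-whisper package (pip install openai-whisper) instead of whisper.cpp; a minimal sketch:
######################whisper_test.py (sketch) begin:##################
import whisper

model = whisper.load_model('base')                   # the multilingual "base" model, as above
result = model.transcribe('a.wav', language='zh')    # roughly equivalent to ./main -l zh
print(result['text'])
######################whisper_test.py (sketch) end##################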
===================================rhubarb=======================================
# play a sound:
$ ecasound a.wav
# list/adjust alsa devices:
$ alsamixer
$ git clone --depth 1 https://github.com/DanielSWolf/rhubarb-lip-sync.git
$ sudo apt install libboost-all-dev
$ cmake . && make
# modify the code:
# $ nano ./rhubarb/src/lib/rhubarbLib.cpp
# recognize phonemes:
$ ./rhubarb/rhubarb -r phonetic a.wav
$ ./rhubarb/rhubarb --consoleLevel Debug -r phonetic a.wav 2>&1 | grep "##phone"
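a sketch to collect those "##phone" debug lines from python; the exact line format depends on the rhubarbLib.cpp edit above, so only plain line filtering (the same as the grep) is assumed:
######################rhubarb_phones.py (sketch) begin:##################
import subprocess

out = subprocess.run(['./rhubarb/rhubarb', '--consoleLevel', 'Debug', '-r', 'phonetic', 'a.wav'],
                     capture_output=True, text=True)
# the 2>&1 above implies the debug log goes to stderr, the result to stdout
phones = [line for line in (out.stdout + out.stderr).splitlines() if '##phone' in line]
print('\n'.join(phones))
######################rhubarb_phones.py (sketch) end##################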
=====================================allosaurus===================================
$ pip install allosaurus
$ ln -s /usr/local/lib/python3.10/dist-packages/nvidia_cudnn_cu11-8.5.0.96-py3.10-linux-x86_64.egg/nvidia/cudnn/ /usr/local/lib/python3.10/dist-packages/nvidia_cuda_nvrtc_cu11-11.7.99-py3.10-linux-x86_64.egg/nvidia/cudnn/
# recognize phonemes:
$ python -m allosaurus.run -i ../nihao_hello_jack.wav
ij i x ɒ h ɛ ɾ ɒ ij a
$ python -m allosaurus.run --lang cmn -i ../nihao_hello_jack.wav
i x a k l ɤ k a
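the same recognition via the allosaurus python API (as documented in its README):
######################allosaurus_test.py (sketch) begin:##################
from allosaurus.app import read_recognizer

model = read_recognizer()                                  # universal phone inventory
print(model.recognize('../nihao_hello_jack.wav'))          # -> ij i x ɒ h ɛ ɾ ɒ ij a
print(model.recognize('../nihao_hello_jack.wav', 'cmn'))   # mandarin inventory, like --lang cmn
######################allosaurus_test.py (sketch) end##################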
==============================use pipewire to replace pulseaudio======================
$ sudo apt-cache policy pipewire
pipewire:
  Installed: 0.3.63-1
  Candidate: 0.3.63-1
  Version table:
 *** 0.3.63-1 500
        500 https://kali.download/kali kali-rolling/main amd64 Packages
$ sudo apt install pipewire pipewire-pulse wireplumber pulseaudio-utils libspa-0.2-bluetooth
ref: https://wiki.debian.org/PipeWire
$ systemctl --user restart pipewire pipewire-pulse
# switch back to pulseaudio:
$ sudo apt install pulseaudio-module-bluetooth
$ systemctl --user --now disable pipewire pipewire-pulse
$ systemctl --user --now disable pipewire-pulse.socket pipewire.socket
$ systemctl --user --now enable pulseaudio.service pulseaudio.socket
$ systemctl --user unmask pulseaudio pulseaudio.socket
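to check which server is actually behind the pulse socket (a sketch; a server name like "PulseAudio (on PipeWire ...)" means pipewire-pulse is serving):
######################check_audio_server.py (sketch) begin:##################
import subprocess

info = subprocess.run(['pactl', 'info'], capture_output=True, text=True).stdout
print([line for line in info.splitlines() if line.startswith('Server Name')])
######################check_audio_server.py (sketch) end##################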
====================edge-tts====================
pipx install edge-tts
edge-playback --voice zh-CN-XiaoxiaoNeural --text "你好啊, 我们都是中国人, 说中文的中国人"
edge-playback --voice zh-CN-shaanxi-XiaoniNeural --text "你好啊, 我们都是中国人, 说中文的中国人"
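edge-tts also has a python API; a minimal sketch saving the same voice to a file:
######################edge_tts_test.py (sketch) begin:##################
import asyncio
import edge_tts

async def main():
    tts = edge_tts.Communicate('你好啊, 我们都是中国人, 说中文的中国人', voice='zh-CN-XiaoxiaoNeural')
    await tts.save('hello.mp3')    # play it afterwards, e.g. with ecasound or mpv

asyncio.run(main())
######################edge_tts_test.py (sketch) end##################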
=================baidu paddlepaddle+paddlespeech=================================
$ pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple
$ pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple
$ pip install --user paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple
$ pip install --upgrade requests
$ pip install protobuf==3.20.0
$ pip install paddleaudio==1.0.1
$ paddlespeech asr --lang zh --input ../qilai_buyuanzuo_zhongwenhanzi.wav
起来不愿做努力的人们中文汉字起来不愿做奴隶的人们啊
# no punctuation breaks; recognition accuracy is higher than whisper, but latency is far worse than whisper's 3 seconds: at least 25 seconds
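the same recognition via the paddlespeech python API instead of the CLI (a sketch following its README; the default model is the chinese one the CLI uses above):
######################paddlespeech_test.py (sketch) begin:##################
from paddlespeech.cli.asr.infer import ASRExecutor

asr = ASRExecutor()
print(asr(audio_file='../qilai_buyuanzuo_zhongwenhanzi.wav'))
######################paddlespeech_test.py (sketch) end##################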
=================paddlespeech c++=================================
$ mkdir wenetspeech
# wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz
$ tar -xzvf ~/.paddlespeech/models/conformer_wenetspeech-zh-16k/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz -C wenetspeech
$ git clone https://github.com/diyism/FastASR.git
$ ./FastASR/scripts/paddlespeech_convert.py wenetspeech/exp/conformer/checkpoints/wenetspeech.pdparams
$ sudo apt-get install libfftw3-dev libfftw3-single3
$ sudo apt-get install libopenblas-dev
$ cd FastASR/ ; mkdir build; cd build; cmake ..; make
$ cp ../../wenet_params.bin ../models/paddlespeech_cli/
$ ./examples/paddlespeech_cli ../models/paddlespeech_cli/ ../../../qilai_buyuanzuo_zhongwenhanzi.wav
起来不愿出我努力的人们中文汉字起来不愿做奴隶的人们啊
# the c++ version is indeed fast: only 2 seconds
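a rough wall-clock check of that 2-second figure (a sketch; same binary and paths as above):
######################time_fastasr.py (sketch) begin:##################
import subprocess, time

t0 = time.time()
out = subprocess.run(['./examples/paddlespeech_cli', '../models/paddlespeech_cli/',
                      '../../../qilai_buyuanzuo_zhongwenhanzi.wav'],
                     capture_output=True, text=True)
print(out.stdout)
print(f'{time.time() - t0:.1f}s')
######################time_fastasr.py (sketch) end##################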
=================voice spectrogram=================================
the voice spectrogram made by chrome music lab is very vivid (https://musiclab.chromeexperiments.com/spectrogram/);
the effect is not obvious in firefox or chrome on a linux pc (probably because of the bluetooth microphone),
but it is very obvious in chrome on an android phone: the initial and the final of each hanyu pinyin syllable are actually separated;
saying the pinyin ge- te- ke- he- continuously at the phone, the spectrogram clearly splits into 8 columns
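the same spectrogram can be drawn locally with scipy + matplotlib to inspect the initial/final separation (a sketch, assuming a mono wav recording of ge- te- ke- he-):
######################spectrogram.py (sketch) begin:##################
import matplotlib.pyplot as plt
from scipy.io import wavfile

rate, samples = wavfile.read('a.wav')    # mono 16-bit wav, e.g. recorded with ecasound above
plt.specgram(samples, Fs=rate, NFFT=1024, noverlap=512)
plt.xlabel('time (s)')
plt.ylabel('frequency (Hz)')
plt.show()
######################spectrogram.py (sketch) end##################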
===========================YonaVox training===================
record with google recorder on the phone, share the file to google drive, download it locally as input.mp4, convert it to wav, then upload it to the ColabData folder on google drive:
ffmpeg -i input.mp4 -vn -acodec pcm_s16le -ar 8820 -ac 1 output.wav
a simpler way is to open colab on the phone and record directly:
https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be
https://colab.research.google.com/github/facebookresearch/WavAugment/blob/main/examples/python/WavAugment_walkthrough.ipynb
===========================split a whole-sentence wav file into per-pinyin wav files: mfa===================
wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh
chmod 744 Miniconda3-py310_23.1.0-1-Linux-x86_64.sh
./Miniconda3-py310_23.1.0-1-Linux-x86_64.sh
#open a new terminal
conda install -c conda-forge montreal-forced-aligner
mfa model download acoustic mandarin_mfa
mfa model download dictionary mandarin_pinyin
mkdir ./my_corpus
cp nihao_zhongguo.wav ./my_corpus/
echo 'ni2 hao3 ren2 men2 zhong1 guo2 han4 zi4' > ./my_corpus/nihao_zhongguo.lab
mfa validate ./my_corpus mandarin_pinyin
mfa align ./my_corpus mandarin_pinyin mandarin_mfa ./output
cat output/nihao_zhongguo.TextGrid
pip install pydub
pip install textgrid
subl split.py
######################split.py begin:##################
import textgrid
import os
from pydub import AudioSegment

# extract the pinyin labels and their time ranges from the TextGrid file
def extract_pinyin_time_info(textgrid_file):
    tg = textgrid.TextGrid.fromFile(textgrid_file)
    pinyin_tier = tg[0]  # assume the pinyin tier is the first one
    pinyin_time_info = [(item.mark.strip(), item.minTime, item.maxTime) for item in pinyin_tier]
    return pinyin_time_info

# split the audio file according to the pinyin time ranges
def split_audio_by_pinyin(wav_file, pinyin_time_info, output_dir):
    audio = AudioSegment.from_wav(wav_file)
    for i, (pinyin, start_time, end_time) in enumerate(pinyin_time_info):
        if pinyin:
            start_ms = start_time * 1000
            end_ms = end_time * 1000
            pinyin_audio = audio[start_ms:end_ms]
            pinyin_audio.export(os.path.join(output_dir, f"{i}-{pinyin}.wav"), format="wav")

# main program
def main():
    wav_file = './my_corpus/nihao_zhongguo.wav'
    textgrid_file = './output/nihao_zhongguo.TextGrid'
    output_dir = './split_wavs'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    pinyin_time_info = extract_pinyin_time_info(textgrid_file)
    split_audio_by_pinyin(wav_file, pinyin_time_info, output_dir)

if __name__ == "__main__":
    main()
######################split.py end##################
python split.py
ecasound split_wavs/1-ni2.wav
unfortunately mfa cannot split the pinyin accurately even for very clear speech, and when short phrases are repeated in the later part of the recording, the errors land consistently in the same places
===========================split a whole-sentence wav file into per-pinyin wav files: aeneas===================
sudo apt install espeak libespeak-dev
pip install aeneas
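aeneas reads the fragments from a plain-text file, one per line; the contents of ./my_corpus/nihao_zhongguo.txt are not shown here, so the pinyin lines below (mirroring the mfa .lab file above) are an assumption:
######################make_txt.py (sketch) begin:##################
# assumed contents of ./my_corpus/nihao_zhongguo.txt: one syllable per line
syllables = ['ni2', 'hao3', 'ren2', 'men2', 'zhong1', 'guo2', 'han4', 'zi4']
with open('./my_corpus/nihao_zhongguo.txt', 'w') as f:
    f.write('\n'.join(syllables))
######################make_txt.py (sketch) end##################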
python -m aeneas.tools.execute_task ./my_corpus/nihao_zhongguo.wav ./my_corpus/nihao_zhongguo.txt "task_language=cmn|os_task_file_format=json|is_text_type=plain" ./nihao_zhongguo.json
subl aeneas_split.py
######################aeneas_split.py begin:##################
import json
import os
from pydub import AudioSegment

# extract the pinyin labels and their time ranges from the aeneas json map
def extract_pinyin_time_info(json_file):
    with open(json_file, 'r') as f:
        data = json.load(f)
    fragments = data['fragments']
    pinyin_time_info = [(f['lines'][0], float(f['begin']), float(f['end'])) for f in fragments]
    return pinyin_time_info

# split the audio file according to the pinyin time ranges
def split_audio_by_pinyin(wav_file, map_file, output_dir):
    audio = AudioSegment.from_wav(wav_file)
    pinyin_time_info = extract_pinyin_time_info(map_file)
    for i, (pinyin, begin, end) in enumerate(pinyin_time_info):
        if pinyin:
            start_ms = begin * 1000
            end_ms = end * 1000
            pinyin_audio = audio[start_ms:end_ms]
            filename = f"{i}-{pinyin}.wav"
            filepath = os.path.join(output_dir, filename)
            pinyin_audio.export(filepath, format="wav")

# main program
def main():
    wav_file = './my_corpus/nihao_zhongguo.wav'
    map_file = './nihao_zhongguo.json'
    output_dir = './split_wavs'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    split_audio_by_pinyin(wav_file, map_file, output_dir)

if __name__ == "__main__":
    main()
######################aeneas_split.py end##################
aeneas is very simple and effective, and much more accurate than mfa: every chinese syllable is split correctly except the first two (the 1st file is ambient noise, and the 2nd file contains both the 1st and 2nd syllables)
===========================split a whole-sentence wav file into per-pinyin wav files: whisper-timestamped===================
# python 3.11 seems unsupported; switch to 3.10 first:
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
pip3 install dtw-python
pip3 install git+https://github.com/openai/whisper
pip3 install --upgrade --no-deps --force-reinstall git+https://github.com/linto-ai/whisper-timestamped
whisper_timestamped --vad True ./my_corpus/nihao_zhongguo.wav --model small --language zh
very accurate; the output is "你好,人們,中國,漢字"
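the same run via the whisper_timestamped python API (per its README); result['segments'][i]['words'] carries per-word start/end times, which is what makes the per-syllable split possible:
######################whisper_ts_test.py (sketch) begin:##################
import whisper_timestamped as whisper

audio = whisper.load_audio('./my_corpus/nihao_zhongguo.wav')
model = whisper.load_model('small')
result = whisper.transcribe(model, audio, language='zh', vad=True)
for seg in result['segments']:
    for w in seg['words']:
        print(w['text'], w['start'], w['end'])
######################whisper_ts_test.py (sketch) end##################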
============================sherpa onnx keyword spotter microphone=======================
$ git clone https://github.com/k2-fsa/sherpa-onnx
$ cd sherpa-onnx
$ mkdir build
$ cd build
$ cmake -DCMAKE_BUILD_TYPE=Release ..
$ make -j6
$ cd ..
$ pip install .
$ cp ./build/bin/sherpa-onnx-keyword-spotter-microphone ~/miniconda3/bin/
$ wget https://github.com/k2-fsa/sherpa-onnx/releases/download/kws-models/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
$ tar xf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
$ cd sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01
$ sherpa-onnx-keyword-spotter-microphone \
  --tokens=tokens.txt \
  --encoder=encoder-epoch-12-avg-2-chunk-16-left-64.onnx \
  --decoder=decoder-epoch-12-avg-2-chunk-16-left-64.onnx \
  --joiner=joiner-epoch-12-avg-2-chunk-16-left-64.onnx \
  --provider=cpu \
  --num-threads=8 \
  --keywords-threshold=0.08 \
  --keywords-file=../keywords.txt 2>&1 | grep start_time
0:{"start_time":0.00, "keyword": "jiang3", "timestamps": [0.96,
1:{"start_time":0.00, "keyword": "you3", "timestamps": [1.36, 1.40],
2:{"start_time":0.00, "keyword": "bei4", "timestamps": [1.80, 1.84],
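a sketch to turn those "N:{...}" lines into python dicts; the sample line below is hypothetical and complete, unlike the truncated output above:
######################parse_kws.py (sketch) begin:##################
import json

line = '0:{"start_time":0.00, "keyword": "jiang3", "timestamps": [0.96, 1.00]}'
evt = json.loads(line.split(':', 1)[1])    # drop the leading "N:" index
print(evt['keyword'], evt['timestamps'][0])
######################parse_kws.py (sketch) end##################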