Created
December 6, 2013 21:56
-
-
Save DevJohnC/7832790 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // | |
| // Author: John Carruthers (johnc@frag-labs.com) | |
| // | |
| // Copyright (C) 2013 John Carruthers | |
| // | |
| // Permission is hereby granted, free of charge, to any person obtaining | |
| // a copy of this software and associated documentation files (the | |
| // "Software"), to deal in the Software without restriction, including | |
| // without limitation the rights to use, copy, modify, merge, publish, | |
| // distribute, sublicense, and/or sell copies of the Software, and to | |
| // permit persons to whom the Software is furnished to do so, subject to | |
| // the following conditions: | |
| // | |
| // The above copyright notice and this permission notice shall be | |
| // included in all copies or substantial portions of the Software. | |
| // | |
| // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
| // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
| // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
| // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE | |
| // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |
| // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION | |
| // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
| // | |
| using System; | |
| using FragLabs.Aural; | |
| using FragLabs.Aural.IO; | |
| using FragLabs.Aural.Processing; | |
| using FragLabs.Aural.Processing.Filters; | |
| using FragLabs.Aural.Encoding; | |
| using FragLabs.Aural.Encoding.Opus; | |
| using FragLabs.Adjutant.Network; | |
| using FragLabs.AdjutantOS.API; | |
| using FragLabs.AdjutantOS.API.Log; | |
| using FragLabs.AdjutantOS.API.Network; | |
| namespace FragLabs.AdjutantOS.Client.Audio | |
| { | |
| /// <summary> | |
| /// Listens for audio input and sends to the server for speech recognition. | |
| /// </summary> | |
| public class SpeechRecognitionListener | |
| { | |
// Sample rate, in Hz, requested from capture devices by ListenAll (8 kHz).
private const int DefaultSampleRate = 8000;

// Converts a natural logarithm into decibels: 20 * log10(x) == (20 / ln 10) * ln(x).
private readonly double _dbScale = 20.0 / Math.Log(10);

// Audio source whose AudioReceived event drives this listener.
private readonly IAudioInput _audioInput;

// Audio format description sent to the server when a stream is opened.
private readonly int _samplerate;
private readonly int _channels;
private readonly int _bitdepth;

// Level, in dB, compared against the loudest FFT bin of each buffer.
private readonly float _dbLevel;

// How long the input may stay below _dbLevel before the stream is closed.
private readonly TimeSpan _quiteTimeout;

// Device name, used in log messages only.
private readonly string _name;

// FFT instance and the number of points it was initialised for.
private readonly FFT _fft;
private readonly uint _fftSize;

// NOTE(review): never read anywhere in this class; candidate for removal.
private int _readPos = 0;

// Current position in the Waiting -> Recording -> Quite state machine.
private AudioInputState _state = AudioInputState.Waiting;

// Timestamp taken when the state last changed to Quite.
private DateTime _whenWentQuiet = DateTime.MinValue;

// Identifier of the stream currently being sent; Guid.Empty after a stream is closed.
private Guid _streamId;

// Opus encoder and its reusable output buffer; both are created once in the constructor.
private readonly OpusEncoder _encoder;
private readonly byte[] _encodeBuffer;
/// <summary>
/// Listen on all available audio inputs.
/// </summary>
/// <param name="initDbLevel">Decibel level handed to every listener as its <c>dbLevel</c>.</param>
/// <param name="continueDbLevel">Currently unused; kept so existing callers keep compiling.</param>
/// <param name="quiteTimeout">Quiet timeout handed to every listener.</param>
/// <returns>
/// One listener per device that could be opened. Devices that fail to open are logged and
/// skipped, so the array never contains null entries. Empty when device enumeration fails.
/// </returns>
public static SpeechRecognitionListener[] ListenAll(float initDbLevel, float continueDbLevel, TimeSpan quiteTimeout)
{
    try
    {
        var devices = OpenALInput.GetInputDevices();
        var listeners = new SpeechRecognitionListener[devices.Length];
        var opened = 0;
        foreach (var device in devices)
        {
            try
            {
                Logger.Write(LogLevel.Debug, "Opening {0} for speech recognition", device);
                // Decide before opening: a device that is already open in another sample rate needs downsampling.
                var downsample = (OpenALInput.IsOpen(device) && OpenALInput.OpenFormat(device).SampleRate != DefaultSampleRate);
                IAudioInput input = OpenALInput.OpenDevice(device, new AudioFormat { BitDepth = 16, Channels = 1, SampleRate = DefaultSampleRate }, 4800);
                if (downsample)
                {
                    input = new BasicDownSampler(input, input.Format.SampleRate, DefaultSampleRate);
                }

                // Advance the index only after construction succeeded. `listeners[opened++] = new ...`
                // would bump the index before the constructor runs and leave a null hole on failure.
                listeners[opened] =
                    new SpeechRecognitionListener(input, DefaultSampleRate, 1, 16, initDbLevel, quiteTimeout, device);
                opened++;
            }
            catch (Exception ex)
            {
                // Best effort: one broken device must not prevent the others from being used,
                // but the reason is recorded instead of being silently dropped.
                Logger.Write(LogLevel.Debug, "Failed opening {0} for speech recognition", device + " (" + ex.Message + ")");
            }
        }

        // Trim the slots reserved for devices that failed to open.
        Array.Resize(ref listeners, opened);
        return listeners;
    }
    catch (Exception ex)
    {
        Logger.Write(LogLevel.Debug, "Failed listing audio inputs for speech recognition: {0}", ex.Message);
        return new SpeechRecognitionListener[0];
    }
}
/// <summary>
/// Creates a listener on <paramref name="audioInput"/> and immediately starts reading from it.
/// </summary>
/// <param name="audioInput">Audio source to listen on.</param>
/// <param name="samplerate">Sample rate used for the Opus encoder and reported to the server.</param>
/// <param name="channels">Channel count used for the Opus encoder and reported to the server.</param>
/// <param name="bitdepth">Bit depth reported to the server.</param>
/// <param name="dbLevel">Decibel level at or above which a buffer starts or continues a recording.</param>
/// <param name="quiteTimeout">How long the input may stay below <paramref name="dbLevel"/> before the stream is closed.</param>
/// <param name="name">Name used in log messages.</param>
/// <exception cref="ArgumentNullException"><paramref name="audioInput"/> is null.</exception>
public SpeechRecognitionListener(IAudioInput audioInput, int samplerate, int channels, int bitdepth,
    float dbLevel, TimeSpan quiteTimeout, string name)
{
    if (audioInput == null)
    {
        throw new ArgumentNullException("audioInput");
    }

    _audioInput = audioInput;
    _samplerate = samplerate;
    _channels = channels;
    _bitdepth = bitdepth;
    _dbLevel = dbLevel;
    _quiteTimeout = quiteTimeout;
    _name = name;
    _encoder = new OpusEncoder(samplerate, channels, Application.Voip);

    // 512-point FFT: the FFT is initialised with the exponent, _fftSize holds the point count.
    var logN = LogN(512);
    _fftSize = (uint)1 << (int)logN;
    _fft = new FFT();
    _fft.init(logN);

    var encoderFrameSize = _encoder.PermittedFrameSizes[2]; // 10ms of audio
    // The encode buffer is large enough to hold the entire non-encoded PCM frame.
    _encodeBuffer = new byte[_encoder.FrameSizeInBytes(encoderFrameSize)];

    // Subscribe only once everything the handler touches is initialised; an input that is
    // already delivering audio could otherwise invoke the handler on a half-built object.
    _audioInput.AudioReceived += HandleAudioReceived;
    StartRead(encoderFrameSize);
}
/// <summary>
/// Returns the integer base-2 logarithm of <paramref name="input"/>, rounded down.
/// </summary>
/// <param name="input">Value to take the logarithm of.</param>
/// <returns>floor(log2(input)); 0 when <paramref name="input"/> is 0 or 1.</returns>
private static uint LogN(uint input)
{
    // Pure integer arithmetic: dividing two floating-point logarithms and truncating can
    // land one below the true exponent, and is undefined for 0.
    uint exponent = 0;
    while ((input >>= 1) != 0)
    {
        exponent++;
    }
    return exponent;
}
/// <summary>
/// Asks the audio input to start reading.
/// </summary>
/// <param name="sampleCount">Sample count forwarded unchanged to the audio input.</param>
private void StartRead(int sampleCount)
{
    var source = _audioInput;
    source.StartReading(sampleCount);
}
/// <summary>
/// Handles a buffer of captured audio: measures its loudest FFT bin in decibels, advances the
/// Waiting / Recording / Quite state machine, and forwards the encoded buffer to the server
/// while a stream is open.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data carrying the PCM buffer and its sample count.</param>
private void HandleAudioReceived(object sender, AudioReceivedEventArgs e)
{
    var samples = Convert16BitToDouble(e.Buffer);

    // The FFT was initialised for _fftSize points, so both arrays must be exactly that long.
    // Shorter input is zero padded, longer input is truncated.
    // NOTE(review): assumes FFT.run transforms the arrays in place - confirm against FFT.
    var fftLength = (int)_fftSize;
    var real = new double[fftLength];
    var imaginary = new double[fftLength];
    // Array.Copy counts elements; Buffer.BlockCopy counts bytes and would copy only 1/8 of the samples.
    Array.Copy(samples, real, Math.Min(samples.Length, fftLength));
    _fft.run(real, imaginary);

    // Loudest bin in dB. Start from -infinity so levels below 0 dB can be reported as well.
    var db = double.NegativeInfinity;
    var binCount = fftLength / 2;
    for (var i = 0; i < binCount; i++)
    {
        var re = real[i];
        var im = imaginary[i];
        // Epsilon keeps the logarithm finite for a bin with zero magnitude.
        var binDb = _dbScale * Math.Log(Math.Sqrt(re * re + im * im) + double.Epsilon);
        if (binDb > db)
        {
            db = binDb;
        }
    }

    switch (_state)
    {
        // if above db level start recording
        case AudioInputState.Waiting:
            if (db >= _dbLevel)
            {
                StartRecording();
            }
            break;
        // if the db level is too low set quiet state and timestamp
        case AudioInputState.Recording:
            if (db < _dbLevel)
            {
                GoneQuite();
            }
            break;
        // if above db level reset recording state, if timed out stop recording
        case AudioInputState.Quite:
            if (db >= _dbLevel)
            {
                _state = AudioInputState.Recording;
            }
            else if (DateTime.Now - _whenWentQuiet > _quiteTimeout)
            {
                StopRecording();
            }
            break;
    }

    // Read the server reference once so the null check and the send use the same instance.
    var server = DeviceRepository.Server;
    var streamOpen = _state == AudioInputState.Recording || _state == AudioInputState.Quite;
    if (streamOpen && server != null)
    {
        server.Client.Send(new Message
        {
            IsFromServer = false,
            IsResponse = false,
            Words = new[]
            {
                new [] { (byte)Command.SpeechRecognizerPacket },
                _streamId.ToByteArray(),
                Encode(e.Buffer, 0, e.SampleCount)
            }
        }, true);
    }
}
/// <summary>
/// Runs PCM audio through the encoder and returns the result as a right-sized array.
/// </summary>
/// <param name="srcPcm">Source pcm.</param>
/// <param name="offset">Offset into <paramref name="srcPcm"/>, forwarded to the encoder.</param>
/// <param name="sampleCount">Sample count, forwarded to the encoder.</param>
/// <returns>A new array whose length is the value returned by the encoder.</returns>
private byte[] Encode(byte[] srcPcm, int offset, int sampleCount)
{
    // The shared scratch buffer is reused for every call; the caller gets its own copy.
    var written = _encoder.Encode(srcPcm, offset, _encodeBuffer, 0, sampleCount);
    var packet = new byte[written];
    Array.Copy(_encodeBuffer, packet, written);
    return packet;
}
/// <summary>
/// Switches to the Quite state and remembers when that happened.
/// </summary>
private void GoneQuite()
{
    var now = DateTime.Now;
    _whenWentQuiet = now;
    _state = AudioInputState.Quite;
}
/// <summary>
/// Opens a new speech recognition stream: assigns a fresh stream id, switches to the
/// Recording state and sends the start message with the audio format to the server.
/// Does nothing when no server is available.
/// </summary>
private void StartRecording()
{
    if (DeviceRepository.Server == null)
    {
        return;
    }

    _streamId = Guid.NewGuid();
    _state = AudioInputState.Recording;
    Logger.Write(LogLevel.Debug, "Opening speech recognition stream from {0}", _name);

    // Command byte, stream id, then sample rate (int), channels (short) and bit depth (short).
    var words = new[]
    {
        new [] { (byte)Command.SpeechRecognizerStart },
        _streamId.ToByteArray(),
        BitConverter.GetBytes(_samplerate),
        BitConverter.GetBytes((short)_channels),
        BitConverter.GetBytes((short)_bitdepth)
    };
    DeviceRepository.Server.Client.Send(new Message
    {
        IsFromServer = false,
        IsResponse = false,
        Words = words
    }, true);
}
/// <summary>
/// Closes the current speech recognition stream: sends the end message when a server is
/// available, then clears the stream id and returns to the Waiting state.
/// </summary>
private void StopRecording()
{
    Logger.Write(LogLevel.Debug, "Closing speech recognition stream from {0}", _name);

    if (DeviceRepository.Server != null)
    {
        // Command byte followed by the id of the stream being closed.
        var words = new[]
        {
            new [] { (byte)Command.SpeechRecognizerEnd },
            _streamId.ToByteArray()
        };
        DeviceRepository.Server.Client.Send(new Message
        {
            IsFromServer = false,
            IsResponse = false,
            Words = words
        }, true);
    }

    // Reset local state whether or not the server could be told.
    _state = AudioInputState.Waiting;
    _streamId = Guid.Empty;
}
/// <summary>
/// Converts 16-bit signed PCM samples into doubles in the range [-1, 1).
/// </summary>
/// <param name="input">Raw PCM bytes, two bytes per sample in the platform byte order used by BitConverter.</param>
/// <returns>One double per complete 16-bit sample; a trailing odd byte is ignored.</returns>
private static double[] Convert16BitToDouble(byte[] input)
{
    // Samples are always read as 16-bit values, so the count must be based on two bytes per
    // sample. Deriving it from a configurable bit depth overruns the buffer at 8 bits and
    // divides by zero below that.
    const int bytesPerSample = sizeof(short);
    var sampleCount = input.Length / bytesPerSample;
    var output = new double[sampleCount];
    for (var n = 0; n < sampleCount; n++)
    {
        short sample = BitConverter.ToInt16(input, n * bytesPerSample);
        output[n] = sample / 32768.0;
    }
    return output;
}
| } | |
/// <summary>
/// States of the speech recognition listener.
/// </summary>
internal enum AudioInputState
{
    /// <summary>No stream is open; waiting for input at or above the configured level.</summary>
    Waiting = 0,

    /// <summary>A stream is open and the last buffer was at or above the configured level.</summary>
    Recording = 1,

    /// <summary>A stream is open but the input dropped below the configured level.</summary>
    Quite = 2
}
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment