DevJohnC · December 6, 2013 21:56
diff --git a/SpeechRecognitionListener.cs b/SpeechRecognitionListener.cs
 //  
 //  Author: John Carruthers (johnc@frag-labs.com)
 //  
 //  Copyright (C) 2013 John Carruthers
 //  
 //  Permission is hereby granted, free of charge, to any person obtaining
 //  a copy of this software and associated documentation files (the
 //  "Software"), to deal in the Software without restriction, including
 //  without limitation the rights to use, copy, modify, merge, publish,
 //  distribute, sublicense, and/or sell copies of the Software, and to
 //  permit persons to whom the Software is furnished to do so, subject to
 //  the following conditions:
 //   
 //  The above copyright notice and this permission notice shall be
 //  included in all copies or substantial portions of the Software.
 //   
 //  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 //  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 //  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 //  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 //  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 //  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 //  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 //  

 using System;
 using FragLabs.Aural;
 using FragLabs.Aural.IO;
 using FragLabs.Aural.Processing;
 using FragLabs.Aural.Processing.Filters;
 using FragLabs.Aural.Encoding;
 using FragLabs.Aural.Encoding.Opus;
 using FragLabs.Adjutant.Network;
 using FragLabs.AdjutantOS.API;
 using FragLabs.AdjutantOS.API.Log;
 using FragLabs.AdjutantOS.API.Network;

 namespace FragLabs.AdjutantOS.Client.Audio
 {
    /// <summary>
    /// Listens for audio input and sends to the server for speech recognition.
    /// </summary>
    /// <remarks>
    /// Every received buffer is run through an FFT to find the level of its loudest
    /// frequency bin. That level drives a small state machine (waiting, recording,
    /// quiet). While a stream is open each received buffer is encoded with the Opus
    /// encoder and sent to the server together with the stream id.
    /// </remarks>
    public class SpeechRecognitionListener
    {
        private const int DefaultSampleRate = 8000; //  default to 8kHz
        private const int DefaultChannels = 1;
        private const int DefaultBitDepth = 16;

        //  Convert16BitToDouble always decodes 16 bit samples, i.e. two bytes each
        private const int BytesPerSample = 2;

        //  number of points the FFT is initialised for
        private const uint FftLength = 512;

        //  20 / ln(10): scales a natural logarithm so the result is 20 * log10(x)
        private static readonly double DbScale = 20d / Math.Log(10);

        private readonly IAudioInput _audioInput;
        private readonly int _samplerate;
        private readonly int _channels;
        private readonly int _bitdepth;
        private readonly float _dbLevel;
        private readonly TimeSpan _quiteTimeout;
        private readonly string _name;
        private readonly FFT _fft;
        private readonly uint _fftSize;
        private readonly OpusEncoder _encoder;
        private readonly byte[] _encodeBuffer;
        private AudioInputState _state = AudioInputState.Waiting;
        private DateTime _whenWentQuiet = DateTime.MinValue; //  always a UTC timestamp
        private Guid _streamId;

        /// <summary>
        /// Listen on all available audio inputs.
        /// </summary>
        /// <param name="initDbLevel">Level passed to every listener as its trigger threshold.</param>
        /// <param name="continueDbLevel">Currently unused; kept so existing callers keep compiling.</param>
        /// <param name="quiteTimeout">How long a listener may stay quiet before it closes its stream.</param>
        /// <returns>
        /// One listener for each device that could be opened. Devices that fail are logged
        /// and left out, so the array never contains null entries. An empty array is
        /// returned when the devices cannot be enumerated at all.
        /// </returns>
        public static SpeechRecognitionListener[] ListenAll(float initDbLevel, float continueDbLevel, TimeSpan quiteTimeout)
        {
            try
            {
                var devices = OpenALInput.GetInputDevices();
                var listeners = new SpeechRecognitionListener[devices.Length];
                var opened = 0;
                foreach (var device in devices)
                {
                    try
                    {
                        Logger.Write(LogLevel.Debug, "Opening {0} for speech recognition", device);
                        //  a device that is already open keeps its current format, which may need downsampling
                        var downsample = (OpenALInput.IsOpen(device) && OpenALInput.OpenFormat(device).SampleRate != DefaultSampleRate);
                        IAudioInput input = OpenALInput.OpenDevice(
                            device,
                            new AudioFormat { BitDepth = DefaultBitDepth, Channels = DefaultChannels, SampleRate = DefaultSampleRate },
                            4800);
                        if (downsample)
                        {
                            input = new BasicDownSampler(input, input.Format.SampleRate, DefaultSampleRate);
                        }

                        //  advance the index only after construction succeeded so a failing
                        //  device cannot leave a null hole in the result
                        listeners[opened] = new SpeechRecognitionListener(
                            input, DefaultSampleRate, DefaultChannels, DefaultBitDepth, initDbLevel, quiteTimeout, device);
                        opened++;
                    }
                    catch (Exception ex)
                    {
                        Logger.Write(LogLevel.Debug, "Failed opening {0} for speech recognition", device);
                        Logger.Write(LogLevel.Debug, "Speech recognition device error: {0}", ex.Message);
                    }
                }

                Array.Resize(ref listeners, opened);
                return listeners;
            }
            catch (Exception ex)
            {
                Logger.Write(LogLevel.Debug, "Failed listing audio inputs for speech recognition: {0}", ex.Message);
                return new SpeechRecognitionListener[0];
            }
        }

        /// <summary>
        /// Creates a listener on <paramref name="audioInput"/> and immediately starts reading from it.
        /// </summary>
        /// <param name="audioInput">Audio source to listen on.</param>
        /// <param name="samplerate">Sample rate given to the encoder and announced to the server.</param>
        /// <param name="channels">Channel count given to the encoder and announced to the server.</param>
        /// <param name="bitdepth">Bit depth announced to the server.</param>
        /// <param name="dbLevel">Level at or above which audio counts as speech.</param>
        /// <param name="quiteTimeout">How long the input may stay below <paramref name="dbLevel"/> before the stream is closed.</param>
        /// <param name="name">Name used in log messages.</param>
        /// <exception cref="ArgumentNullException"><paramref name="audioInput"/> is null.</exception>
        public SpeechRecognitionListener(IAudioInput audioInput, int samplerate, int channels, int bitdepth,
            float dbLevel, TimeSpan quiteTimeout, string name)
        {
            if (audioInput == null) throw new ArgumentNullException("audioInput");

            _audioInput = audioInput;
            _samplerate = samplerate;
            _channels = channels;
            _bitdepth = bitdepth;
            _dbLevel = dbLevel;
            _quiteTimeout = quiteTimeout;
            _name = name;

            _encoder = new OpusEncoder(samplerate, channels, Application.Voip);

            var logN = LogN(FftLength);
            _fftSize = (uint)1 << (int)logN;
            _fft = new FFT();
            _fft.init(logN);

            var encoderFrameSize = _encoder.PermittedFrameSizes[2]; //  10ms of audio
            _encodeBuffer = new byte[_encoder.FrameSizeInBytes(encoderFrameSize)]; //  encode buffer is large enough to hold the entire non-encoded PCM frame

            //  subscribe only after everything the handler touches has been initialised
            _audioInput.AudioReceived += HandleAudioReceived;
            StartRead(encoderFrameSize);
        }

        /// <summary>
        /// Integer base 2 logarithm, rounded down. Returns 0 for an input of 0.
        /// </summary>
        private static uint LogN(uint input)
        {
            //  counting shifts avoids the rounding error of Math.Log(x) / Math.Log(2)
            uint exponent = 0;
            while ((input >>= 1) != 0)
            {
                exponent++;
            }
            return exponent;
        }

        /// <summary>
        /// Asks the audio input to start reading with the given sample count.
        /// </summary>
        private void StartRead(int sampleCount)
        {
            _audioInput.StartReading(sampleCount);
        }

        /// <summary>
        /// Handles a buffer of captured audio: updates the recording state from its level
        /// and forwards the audio to the server while a stream is open.
        /// </summary>
        private void HandleAudioReceived(object sender, AudioReceivedEventArgs e)
        {
            UpdateState(MeasurePeakDb(e.Buffer));

            var streaming = _state == AudioInputState.Recording || _state == AudioInputState.Quite;
            //  check for a server before encoding so the encoder is not fed audio that is never sent
            if (streaming && DeviceRepository.Server != null)
            {
                SendToServer(
                    new[] { (byte)Command.SpeechRecognizerPacket },
                    _streamId.ToByteArray(),
                    Encode(e.Buffer, 0, e.SampleCount));
            }
        }

        /// <summary>
        /// Returns the level of the loudest FFT bin in <paramref name="pcm"/>, never less than 0.
        /// </summary>
        private double MeasurePeakDb(byte[] pcm)
        {
            var samples = Convert16BitToDouble(pcm);

            //  The FFT is initialised for _fftSize points, so the work arrays always have
            //  that length; shorter input is zero padded and longer input is truncated.
            var real = new double[_fftSize];
            var imaginary = new double[_fftSize];
            Array.Copy(samples, real, Math.Min(samples.Length, real.Length));
            _fft.run(real, imaginary);

            //  NOTE(review): the peak starts at 0, so levels below 0 are reported as 0 and
            //  a threshold of 0 or less always matches - confirm this floor is intended.
            double peak = 0;
            var binCount = (int)(_fftSize / 2);
            for (var bin = 0; bin < binCount; bin++)
            {
                var re = real[bin];
                var im = imaginary[bin];
                //  double.Epsilon keeps the logarithm finite for a silent bin
                var level = DbScale * Math.Log(Math.Sqrt(re * re + im * im) + double.Epsilon);
                if (level > peak)
                {
                    peak = level;
                }
            }
            return peak;
        }

        /// <summary>
        /// Advances the recording state machine using the measured level.
        /// </summary>
        private void UpdateState(double db)
        {
            switch (_state)
            {
                //  if above db level start recording
                case AudioInputState.Waiting:
                    if (db >= _dbLevel)
                    {
                        StartRecording();
                    }
                    break;
                //  if the db level is too low set quiet state and timestamp
                case AudioInputState.Recording:
                    if (db < _dbLevel)
                    {
                        GoneQuite();
                    }
                    break;
                //  if above db level reset recording state, if timed out stop recording
                case AudioInputState.Quite:
                    if (db >= _dbLevel)
                    {
                        _state = AudioInputState.Recording;
                    }
                    else if (DateTime.UtcNow - _whenWentQuiet > _quiteTimeout)
                    {
                        StopRecording();
                    }
                    break;
            }
        }

        /// <summary>
        /// Sends a client message made of <paramref name="words"/> to the server.
        /// </summary>
        /// <returns>False when no server is available and nothing was sent.</returns>
        private static bool SendToServer(params byte[][] words)
        {
            //  read the server once so it cannot become null between the check and the send
            var server = DeviceRepository.Server;
            if (server == null)
            {
                return false;
            }

            server.Client.Send(new Message
            {
                IsFromServer = false,
                IsResponse = false,
                Words = words
            }, true);
            return true;
        }

        /// <summary>
        /// Encodes PCM audio buffer.
        /// </summary>
        /// <param name="srcPcm">Source pcm.</param>
        /// <param name="offset">Offset into <paramref name="srcPcm"/> handed to the encoder.</param>
        /// <param name="sampleCount">Sample count handed to the encoder.</param>
        /// <returns>A new array holding exactly the bytes the encoder produced.</returns>
        private byte[] Encode(byte[] srcPcm, int offset, int sampleCount)
        {
            var encodeLen = _encoder.Encode(srcPcm, offset, _encodeBuffer, 0, sampleCount);
            var ret = new byte[encodeLen];
            Buffer.BlockCopy(_encodeBuffer, 0, ret, 0, encodeLen);
            return ret;
        }

        /// <summary>
        /// Enters the quiet state and remembers when that happened.
        /// </summary>
        private void GoneQuite()
        {
            _state = AudioInputState.Quite;
            //  UTC so the timeout is not distorted by daylight saving or time zone changes
            _whenWentQuiet = DateTime.UtcNow;
        }

        /// <summary>
        /// Opens a new recognition stream on the server; does nothing when no server is available.
        /// </summary>
        private void StartRecording()
        {
            if (DeviceRepository.Server == null)
            {
                return;
            }

            _streamId = Guid.NewGuid();
            _state = AudioInputState.Recording;
            Logger.Write(LogLevel.Debug, "Opening speech recognition stream from {0}", _name);
            SendToServer(
                new[] { (byte)Command.SpeechRecognizerStart },
                _streamId.ToByteArray(),
                BitConverter.GetBytes(_samplerate),
                BitConverter.GetBytes((short)_channels),
                BitConverter.GetBytes((short)_bitdepth));
        }

        /// <summary>
        /// Closes the current recognition stream and returns to the waiting state.
        /// </summary>
        private void StopRecording()
        {
            Logger.Write(LogLevel.Debug, "Closing speech recognition stream from {0}", _name);
            SendToServer(
                new[] { (byte)Command.SpeechRecognizerEnd },
                _streamId.ToByteArray());
            _streamId = Guid.Empty;
            _state = AudioInputState.Waiting;
        }

        /// <summary>
        /// Converts 16 bit PCM into doubles by dividing every sample by 32768.
        /// </summary>
        /// <param name="input">Raw PCM bytes; a trailing odd byte is ignored.</param>
        private double[] Convert16BitToDouble(byte[] input)
        {
            //  samples are always read as 16 bit values, so the count must use two bytes
            //  per sample regardless of the configured bit depth
            var sampleCount = input.Length / BytesPerSample;
            var output = new double[sampleCount];
            for (var n = 0; n < sampleCount; n++)
            {
                output[n] = BitConverter.ToInt16(input, n * BytesPerSample) / 32768d;
            }
            return output;
        }
    }

    /// <summary>
    /// States a speech recognition listener moves through while monitoring an input.
    /// </summary>
    internal enum AudioInputState
    {
        /// <summary>No stream is open.</summary>
        Waiting = 0,

        /// <summary>A stream is open and audio is being forwarded.</summary>
        Recording = 1,

        /// <summary>A stream is open but the input has dropped below the threshold.</summary>
        Quite = 2
    }
 }
	//
	// Author: John Carruthers (johnc@frag-labs.com)
	//
	// Copyright (C) 2013 John Carruthers
	//
	// Permission is hereby granted, free of charge, to any person obtaining
	// a copy of this software and associated documentation files (the
	// "Software"), to deal in the Software without restriction, including
	// without limitation the rights to use, copy, modify, merge, publish,
	// distribute, sublicense, and/or sell copies of the Software, and to
	// permit persons to whom the Software is furnished to do so, subject to
	// the following conditions:
	//
	// The above copyright notice and this permission notice shall be
	// included in all copies or substantial portions of the Software.
	//
	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
	// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
	// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
	// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
	//

	using System;
	using FragLabs.Aural;
	using FragLabs.Aural.IO;
	using FragLabs.Aural.Processing;
	using FragLabs.Aural.Processing.Filters;
	using FragLabs.Aural.Encoding;
	using FragLabs.Aural.Encoding.Opus;
	using FragLabs.Adjutant.Network;
	using FragLabs.AdjutantOS.API;
	using FragLabs.AdjutantOS.API.Log;
	using FragLabs.AdjutantOS.API.Network;

	namespace FragLabs.AdjutantOS.Client.Audio
	{
	/// <summary>
	/// Listens for audio input and sends to the server for speech recognition.
	/// </summary>
	/// <remarks>
	/// Every received buffer is run through an FFT to find the level of its loudest
	/// frequency bin. That level drives a small state machine (waiting, recording,
	/// quiet). While a stream is open each received buffer is encoded with the Opus
	/// encoder and sent to the server together with the stream id.
	/// </remarks>
	public class SpeechRecognitionListener
	{
		private const int DefaultSampleRate = 8000; // default to 8kHz
		private const int DefaultChannels = 1;
		private const int DefaultBitDepth = 16;

		// Convert16BitToDouble always decodes 16 bit samples, i.e. two bytes each
		private const int BytesPerSample = 2;

		// number of points the FFT is initialised for
		private const uint FftLength = 512;

		// 20 / ln(10): scales a natural logarithm so the result is 20 * log10(x)
		private static readonly double DbScale = 20d / Math.Log(10);

		private readonly IAudioInput _audioInput;
		private readonly int _samplerate;
		private readonly int _channels;
		private readonly int _bitdepth;
		private readonly float _dbLevel;
		private readonly TimeSpan _quiteTimeout;
		private readonly string _name;
		private readonly FFT _fft;
		private readonly uint _fftSize;
		private readonly OpusEncoder _encoder;
		private readonly byte[] _encodeBuffer;
		private AudioInputState _state = AudioInputState.Waiting;
		private DateTime _whenWentQuiet = DateTime.MinValue; // always a UTC timestamp
		private Guid _streamId;

		/// <summary>
		/// Listen on all available audio inputs.
		/// </summary>
		/// <param name="initDbLevel">Level passed to every listener as its trigger threshold.</param>
		/// <param name="continueDbLevel">Currently unused; kept so existing callers keep compiling.</param>
		/// <param name="quiteTimeout">How long a listener may stay quiet before it closes its stream.</param>
		/// <returns>
		/// One listener for each device that could be opened. Devices that fail are logged
		/// and left out, so the array never contains null entries. An empty array is
		/// returned when the devices cannot be enumerated at all.
		/// </returns>
		public static SpeechRecognitionListener[] ListenAll(float initDbLevel, float continueDbLevel, TimeSpan quiteTimeout)
		{
			try
			{
				var devices = OpenALInput.GetInputDevices();
				var listeners = new SpeechRecognitionListener[devices.Length];
				var opened = 0;
				foreach (var device in devices)
				{
					try
					{
						Logger.Write(LogLevel.Debug, "Opening {0} for speech recognition", device);
						// a device that is already open keeps its current format, which may need downsampling
						var downsample = (OpenALInput.IsOpen(device) && OpenALInput.OpenFormat(device).SampleRate != DefaultSampleRate);
						IAudioInput input = OpenALInput.OpenDevice(
							device,
							new AudioFormat { BitDepth = DefaultBitDepth, Channels = DefaultChannels, SampleRate = DefaultSampleRate },
							4800);
						if (downsample)
						{
							input = new BasicDownSampler(input, input.Format.SampleRate, DefaultSampleRate);
						}

						// advance the index only after construction succeeded so a failing
						// device cannot leave a null hole in the result
						listeners[opened] = new SpeechRecognitionListener(
							input, DefaultSampleRate, DefaultChannels, DefaultBitDepth, initDbLevel, quiteTimeout, device);
						opened++;
					}
					catch (Exception ex)
					{
						Logger.Write(LogLevel.Debug, "Failed opening {0} for speech recognition", device);
						Logger.Write(LogLevel.Debug, "Speech recognition device error: {0}", ex.Message);
					}
				}

				Array.Resize(ref listeners, opened);
				return listeners;
			}
			catch (Exception ex)
			{
				Logger.Write(LogLevel.Debug, "Failed listing audio inputs for speech recognition: {0}", ex.Message);
				return new SpeechRecognitionListener[0];
			}
		}

		/// <summary>
		/// Creates a listener on <paramref name="audioInput"/> and immediately starts reading from it.
		/// </summary>
		/// <param name="audioInput">Audio source to listen on.</param>
		/// <param name="samplerate">Sample rate given to the encoder and announced to the server.</param>
		/// <param name="channels">Channel count given to the encoder and announced to the server.</param>
		/// <param name="bitdepth">Bit depth announced to the server.</param>
		/// <param name="dbLevel">Level at or above which audio counts as speech.</param>
		/// <param name="quiteTimeout">How long the input may stay below <paramref name="dbLevel"/> before the stream is closed.</param>
		/// <param name="name">Name used in log messages.</param>
		/// <exception cref="ArgumentNullException"><paramref name="audioInput"/> is null.</exception>
		public SpeechRecognitionListener(IAudioInput audioInput, int samplerate, int channels, int bitdepth,
			float dbLevel, TimeSpan quiteTimeout, string name)
		{
			if (audioInput == null) throw new ArgumentNullException("audioInput");

			_audioInput = audioInput;
			_samplerate = samplerate;
			_channels = channels;
			_bitdepth = bitdepth;
			_dbLevel = dbLevel;
			_quiteTimeout = quiteTimeout;
			_name = name;

			_encoder = new OpusEncoder(samplerate, channels, Application.Voip);

			var logN = LogN(FftLength);
			_fftSize = (uint)1 << (int)logN;
			_fft = new FFT();
			_fft.init(logN);

			var encoderFrameSize = _encoder.PermittedFrameSizes[2]; // 10ms of audio
			_encodeBuffer = new byte[_encoder.FrameSizeInBytes(encoderFrameSize)]; // encode buffer is large enough to hold the entire non-encoded PCM frame

			// subscribe only after everything the handler touches has been initialised
			_audioInput.AudioReceived += HandleAudioReceived;
			StartRead(encoderFrameSize);
		}

		/// <summary>
		/// Integer base 2 logarithm, rounded down. Returns 0 for an input of 0.
		/// </summary>
		private static uint LogN(uint input)
		{
			// counting shifts avoids the rounding error of Math.Log(x) / Math.Log(2)
			uint exponent = 0;
			while ((input >>= 1) != 0)
			{
				exponent++;
			}
			return exponent;
		}

		/// <summary>
		/// Asks the audio input to start reading with the given sample count.
		/// </summary>
		private void StartRead(int sampleCount)
		{
			_audioInput.StartReading(sampleCount);
		}

		/// <summary>
		/// Handles a buffer of captured audio: updates the recording state from its level
		/// and forwards the audio to the server while a stream is open.
		/// </summary>
		private void HandleAudioReceived(object sender, AudioReceivedEventArgs e)
		{
			UpdateState(MeasurePeakDb(e.Buffer));

			var streaming = _state == AudioInputState.Recording || _state == AudioInputState.Quite;
			// check for a server before encoding so the encoder is not fed audio that is never sent
			if (streaming && DeviceRepository.Server != null)
			{
				SendToServer(
					new[] { (byte)Command.SpeechRecognizerPacket },
					_streamId.ToByteArray(),
					Encode(e.Buffer, 0, e.SampleCount));
			}
		}

		/// <summary>
		/// Returns the level of the loudest FFT bin in <paramref name="pcm"/>, never less than 0.
		/// </summary>
		private double MeasurePeakDb(byte[] pcm)
		{
			var samples = Convert16BitToDouble(pcm);

			// The FFT is initialised for _fftSize points, so the work arrays always have
			// that length; shorter input is zero padded and longer input is truncated.
			var real = new double[_fftSize];
			var imaginary = new double[_fftSize];
			Array.Copy(samples, real, Math.Min(samples.Length, real.Length));
			_fft.run(real, imaginary);

			// NOTE(review): the peak starts at 0, so levels below 0 are reported as 0 and
			// a threshold of 0 or less always matches - confirm this floor is intended.
			double peak = 0;
			var binCount = (int)(_fftSize / 2);
			for (var bin = 0; bin < binCount; bin++)
			{
				var re = real[bin];
				var im = imaginary[bin];
				// double.Epsilon keeps the logarithm finite for a silent bin
				var level = DbScale * Math.Log(Math.Sqrt(re * re + im * im) + double.Epsilon);
				if (level > peak)
				{
					peak = level;
				}
			}
			return peak;
		}

		/// <summary>
		/// Advances the recording state machine using the measured level.
		/// </summary>
		private void UpdateState(double db)
		{
			switch (_state)
			{
				// if above db level start recording
				case AudioInputState.Waiting:
					if (db >= _dbLevel)
					{
						StartRecording();
					}
					break;
				// if the db level is too low set quiet state and timestamp
				case AudioInputState.Recording:
					if (db < _dbLevel)
					{
						GoneQuite();
					}
					break;
				// if above db level reset recording state, if timed out stop recording
				case AudioInputState.Quite:
					if (db >= _dbLevel)
					{
						_state = AudioInputState.Recording;
					}
					else if (DateTime.UtcNow - _whenWentQuiet > _quiteTimeout)
					{
						StopRecording();
					}
					break;
			}
		}

		/// <summary>
		/// Sends a client message made of <paramref name="words"/> to the server.
		/// </summary>
		/// <returns>False when no server is available and nothing was sent.</returns>
		private static bool SendToServer(params byte[][] words)
		{
			// read the server once so it cannot become null between the check and the send
			var server = DeviceRepository.Server;
			if (server == null)
			{
				return false;
			}

			server.Client.Send(new Message
			{
				IsFromServer = false,
				IsResponse = false,
				Words = words
			}, true);
			return true;
		}

		/// <summary>
		/// Encodes PCM audio buffer.
		/// </summary>
		/// <param name="srcPcm">Source pcm.</param>
		/// <param name="offset">Offset into <paramref name="srcPcm"/> handed to the encoder.</param>
		/// <param name="sampleCount">Sample count handed to the encoder.</param>
		/// <returns>A new array holding exactly the bytes the encoder produced.</returns>
		private byte[] Encode(byte[] srcPcm, int offset, int sampleCount)
		{
			var encodeLen = _encoder.Encode(srcPcm, offset, _encodeBuffer, 0, sampleCount);
			var ret = new byte[encodeLen];
			Buffer.BlockCopy(_encodeBuffer, 0, ret, 0, encodeLen);
			return ret;
		}

		/// <summary>
		/// Enters the quiet state and remembers when that happened.
		/// </summary>
		private void GoneQuite()
		{
			_state = AudioInputState.Quite;
			// UTC so the timeout is not distorted by daylight saving or time zone changes
			_whenWentQuiet = DateTime.UtcNow;
		}

		/// <summary>
		/// Opens a new recognition stream on the server; does nothing when no server is available.
		/// </summary>
		private void StartRecording()
		{
			if (DeviceRepository.Server == null)
			{
				return;
			}

			_streamId = Guid.NewGuid();
			_state = AudioInputState.Recording;
			Logger.Write(LogLevel.Debug, "Opening speech recognition stream from {0}", _name);
			SendToServer(
				new[] { (byte)Command.SpeechRecognizerStart },
				_streamId.ToByteArray(),
				BitConverter.GetBytes(_samplerate),
				BitConverter.GetBytes((short)_channels),
				BitConverter.GetBytes((short)_bitdepth));
		}

		/// <summary>
		/// Closes the current recognition stream and returns to the waiting state.
		/// </summary>
		private void StopRecording()
		{
			Logger.Write(LogLevel.Debug, "Closing speech recognition stream from {0}", _name);
			SendToServer(
				new[] { (byte)Command.SpeechRecognizerEnd },
				_streamId.ToByteArray());
			_streamId = Guid.Empty;
			_state = AudioInputState.Waiting;
		}

		/// <summary>
		/// Converts 16 bit PCM into doubles by dividing every sample by 32768.
		/// </summary>
		/// <param name="input">Raw PCM bytes; a trailing odd byte is ignored.</param>
		private double[] Convert16BitToDouble(byte[] input)
		{
			// samples are always read as 16 bit values, so the count must use two bytes
			// per sample regardless of the configured bit depth
			var sampleCount = input.Length / BytesPerSample;
			var output = new double[sampleCount];
			for (var n = 0; n < sampleCount; n++)
			{
				output[n] = BitConverter.ToInt16(input, n * BytesPerSample) / 32768d;
			}
			return output;
		}
	}

	/// <summary>
	/// States a speech recognition listener moves through while monitoring an input.
	/// </summary>
	internal enum AudioInputState
	{
		/// <summary>No stream is open.</summary>
		Waiting = 0,

		/// <summary>A stream is open and audio is being forwarded.</summary>
		Recording = 1,

		/// <summary>A stream is open but the input has dropped below the threshold.</summary>
		Quite = 2
	}
	}
No results found