Skip to content

Instantly share code, notes, and snippets.

@DevJohnC
Created December 6, 2013 21:56
Show Gist options
  • Select an option

  • Save DevJohnC/7832790 to your computer and use it in GitHub Desktop.

Select an option

Save DevJohnC/7832790 to your computer and use it in GitHub Desktop.
//
// Author: John Carruthers (johnc@frag-labs.com)
//
// Copyright (C) 2013 John Carruthers
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
using System;
using FragLabs.Aural;
using FragLabs.Aural.IO;
using FragLabs.Aural.Processing;
using FragLabs.Aural.Processing.Filters;
using FragLabs.Aural.Encoding;
using FragLabs.Aural.Encoding.Opus;
using FragLabs.Adjutant.Network;
using FragLabs.AdjutantOS.API;
using FragLabs.AdjutantOS.API.Log;
using FragLabs.AdjutantOS.API.Network;
namespace FragLabs.AdjutantOS.Client.Audio
{
/// <summary>
/// Listens for audio input and sends to the server for speech recognition.
/// </summary>
/// <remarks>
/// Each received buffer is run through an FFT to find its peak level in decibels. That level
/// drives a small state machine (waiting, recording, quiet) which opens a stream on the server,
/// forwards Opus-encoded audio while the stream is open, and closes the stream once the input
/// has stayed quiet for longer than the configured timeout.
/// NOTE(review): the listener never unsubscribes from the input's AudioReceived event and never
/// releases the encoder; confirm whether the owning code needs a shutdown path.
/// </remarks>
public class SpeechRecognitionListener
{
    private const int DefaultSampleRate = 8000; // default to 8kHz

    // Requested FFT window; the effective size is this value rounded down to a power of two.
    private const uint FftWindowSize = 512;

    // PCM handled by this class is 16-bit, i.e. two bytes per sample.
    private const int BytesPerSample = 2;

    // 20 / ln(10): multiplying a natural logarithm by this yields 20 * log10(x).
    private static readonly double DbScale = 20.0 / Math.Log(10);

    private readonly IAudioInput _audioInput;
    private readonly int _samplerate;
    private readonly int _channels;
    private readonly int _bitdepth;
    private readonly float _dbLevel;
    private readonly float _continueDbLevel;
    private readonly TimeSpan _quietTimeout;
    private readonly string _name;
    private readonly FFT _fft;
    private readonly int _fftSize;
    private readonly OpusEncoder _encoder;
    private readonly byte[] _encodeBuffer;
    private AudioInputState _state = AudioInputState.Waiting;
    private DateTime _whenWentQuiet = DateTime.MinValue;
    private Guid _streamId;

    /// <summary>
    /// Listen on all available audio inputs.
    /// </summary>
    /// <param name="initDbLevel">Peak level (dB) at or above which a stream is opened.</param>
    /// <param name="continueDbLevel">Peak level (dB) below which an open stream is treated as quiet.</param>
    /// <param name="quiteTimeout">How long the input may stay quiet before the stream is closed.</param>
    /// <returns>
    /// One listener per device that could be opened. Devices that fail to open are skipped, so the
    /// array never contains null entries. An empty array is returned when enumeration itself fails.
    /// </returns>
    public static SpeechRecognitionListener[] ListenAll(float initDbLevel, float continueDbLevel, TimeSpan quiteTimeout)
    {
        try
        {
            var devices = OpenALInput.GetInputDevices();
            var listeners = new SpeechRecognitionListener[devices.Length];
            var opened = 0;
            foreach (var device in devices)
            {
                try
                {
                    Logger.Write(LogLevel.Debug, "Opening {0} for speech recognition", device);
                    // A device that is already open keeps its own format, so it may need resampling.
                    var downsample = (OpenALInput.IsOpen(device) && OpenALInput.OpenFormat(device).SampleRate != DefaultSampleRate);
                    IAudioInput input = OpenALInput.OpenDevice(device, new AudioFormat { BitDepth = 16, Channels = 1, SampleRate = DefaultSampleRate }, 4800);
                    if (downsample)
                    {
                        input = new BasicDownSampler(input, input.Format.SampleRate, DefaultSampleRate);
                    }

                    // Construct first and only then claim a slot: a throwing constructor must not
                    // leave a null hole in the returned array.
                    var listener = new SpeechRecognitionListener(input, DefaultSampleRate, 1, 16,
                        initDbLevel, continueDbLevel, quiteTimeout, device);
                    listeners[opened++] = listener;
                }
                catch (Exception ex)
                {
                    Logger.Write(LogLevel.Debug, "Failed opening {0} for speech recognition", device);
                    Logger.Write(LogLevel.Debug, "Failure reason: {0}", ex.Message);
                }
            }

            // Trim the slots reserved for devices that could not be opened.
            Array.Resize(ref listeners, opened);
            return listeners;
        }
        catch (Exception ex)
        {
            Logger.Write(LogLevel.Debug, "Failed enumerating audio inputs for speech recognition: {0}", ex.Message);
            return new SpeechRecognitionListener[0];
        }
    }

    /// <summary>
    /// Creates a listener that uses the same level for starting and for continuing a recording.
    /// </summary>
    /// <param name="audioInput">Audio source to listen on.</param>
    /// <param name="samplerate">Sample rate reported to the server and used for the encoder.</param>
    /// <param name="channels">Channel count reported to the server and used for the encoder.</param>
    /// <param name="bitdepth">Bit depth reported to the server.</param>
    /// <param name="dbLevel">Peak level (dB) used both to start and to continue a recording.</param>
    /// <param name="quiteTimeout">How long the input may stay quiet before the stream is closed.</param>
    /// <param name="name">Name used in log messages.</param>
    /// <exception cref="ArgumentNullException"><paramref name="audioInput"/> is null.</exception>
    public SpeechRecognitionListener(IAudioInput audioInput, int samplerate, int channels, int bitdepth,
        float dbLevel, TimeSpan quiteTimeout, string name)
        : this(audioInput, samplerate, channels, bitdepth, dbLevel, dbLevel, quiteTimeout, name)
    {
    }

    /// <summary>
    /// Creates a listener with separate levels for starting and for continuing a recording.
    /// </summary>
    /// <param name="audioInput">Audio source to listen on.</param>
    /// <param name="samplerate">Sample rate reported to the server and used for the encoder.</param>
    /// <param name="channels">Channel count reported to the server and used for the encoder.</param>
    /// <param name="bitdepth">Bit depth reported to the server.</param>
    /// <param name="dbLevel">Peak level (dB) at or above which a recording starts.</param>
    /// <param name="continueDbLevel">Peak level (dB) below which a running recording is treated as quiet.</param>
    /// <param name="quiteTimeout">How long the input may stay quiet before the stream is closed.</param>
    /// <param name="name">Name used in log messages.</param>
    /// <exception cref="ArgumentNullException"><paramref name="audioInput"/> is null.</exception>
    public SpeechRecognitionListener(IAudioInput audioInput, int samplerate, int channels, int bitdepth,
        float dbLevel, float continueDbLevel, TimeSpan quiteTimeout, string name)
    {
        if (audioInput == null)
        {
            throw new ArgumentNullException("audioInput");
        }

        _audioInput = audioInput;
        _samplerate = samplerate;
        _channels = channels;
        _bitdepth = bitdepth;
        _dbLevel = dbLevel;
        _continueDbLevel = continueDbLevel;
        _quietTimeout = quiteTimeout;
        _name = name;
        _encoder = new OpusEncoder(samplerate, channels, Application.Voip);

        var logN = LogN(FftWindowSize);
        _fftSize = 1 << (int)logN;
        _fft = new FFT();
        _fft.init(logN);

        var encoderFrameSize = _encoder.PermittedFrameSizes[2]; // 10ms of audio (per the original author's note)
        _encodeBuffer = new byte[_encoder.FrameSizeInBytes(encoderFrameSize)]; // encode buffer is large enough to hold the entire non-encoded PCM frame

        // Subscribe only once every field the handler touches is initialised; a shared,
        // already-open input could otherwise raise the event on a half-constructed listener.
        _audioInput.AudioReceived += HandleAudioReceived;
        StartRead(encoderFrameSize);
    }

    /// <summary>
    /// Returns floor(log2(input)) using integer arithmetic, which avoids the rounding error of
    /// dividing two floating-point logarithms. Returns 0 for an input of 0 or 1.
    /// </summary>
    private static uint LogN(uint input)
    {
        uint exponent = 0;
        while (input > 1)
        {
            input >>= 1;
            exponent++;
        }
        return exponent;
    }

    /// <summary>
    /// Asks the audio input to start reading, requesting <paramref name="sampleCount"/> samples.
    /// </summary>
    private void StartRead(int sampleCount)
    {
        _audioInput.StartReading(sampleCount);
    }

    /// <summary>
    /// Measures the peak level of the received buffer, advances the recording state machine and,
    /// while a stream is open, forwards the encoded audio to the server.
    /// </summary>
    private void HandleAudioReceived(object sender, AudioReceivedEventArgs e)
    {
        var peakDb = MeasurePeakDb(Convert16BitToDouble(e.Buffer));

        switch (_state)
        {
            // if above the start level begin recording
            case AudioInputState.Waiting:
                if (peakDb >= _dbLevel)
                {
                    StartRecording();
                }
                break;
            // if the level drops too low set quiet state and timestamp
            case AudioInputState.Recording:
                if (peakDb < _continueDbLevel)
                {
                    GoneQuiet();
                }
                break;
            // if back above the level resume recording, if timed out stop recording
            case AudioInputState.Quite:
                if (peakDb >= _continueDbLevel)
                {
                    _state = AudioInputState.Recording;
                }
                else if (DateTime.UtcNow - _whenWentQuiet > _quietTimeout)
                {
                    StopRecording();
                }
                break;
        }

        var streamOpen = _state == AudioInputState.Recording || _state == AudioInputState.Quite;
        if (streamOpen && DeviceRepository.Server != null)
        {
            SendToServer(
                new[] { (byte)Command.SpeechRecognizerPacket },
                _streamId.ToByteArray(),
                Encode(e.Buffer, 0, e.SampleCount));
        }
    }

    /// <summary>
    /// Runs the FFT over <paramref name="samples"/> and returns the largest bin magnitude, in
    /// decibels, found in the lower half of the spectrum.
    /// </summary>
    private double MeasurePeakDb(double[] samples)
    {
        // The FFT buffers are always the full window size. Shorter input is zero-padded and
        // longer input is truncated, so the bin loop below can never index out of range.
        // NOTE(review): assumes FFT.run works on 2^logN elements as configured via init — confirm.
        var real = new double[_fftSize];
        var imaginary = new double[_fftSize];
        Array.Copy(samples, real, Math.Min(samples.Length, _fftSize));
        _fft.run(real, imaginary);

        // Start below any reachable value so that negative dB readings are reported as-is.
        var peakDb = double.NegativeInfinity;
        var binCount = _fftSize / 2;
        for (var i = 0; i < binCount; i++)
        {
            var re = real[i];
            var im = imaginary[i];
            var magnitude = Math.Sqrt(re * re + im * im);
            // Convert magnitude to decibels; the epsilon keeps Log away from zero.
            var db = DbScale * Math.Log(magnitude + double.Epsilon);
            if (db > peakDb)
            {
                peakDb = db;
            }
        }
        return peakDb;
    }

    /// <summary>
    /// Encodes PCM audio buffer.
    /// </summary>
    /// <param name="srcPcm">Source pcm.</param>
    /// <param name="offset">Offset into <paramref name="srcPcm"/> passed on to the encoder.</param>
    /// <param name="sampleCount">Number of samples passed on to the encoder.</param>
    /// <returns>A new array holding exactly the encoded bytes.</returns>
    private byte[] Encode(byte[] srcPcm, int offset, int sampleCount)
    {
        var encodedLength = _encoder.Encode(srcPcm, offset, _encodeBuffer, 0, sampleCount);
        var encoded = new byte[encodedLength];
        Buffer.BlockCopy(_encodeBuffer, 0, encoded, 0, encodedLength);
        return encoded;
    }

    /// <summary>
    /// Sends a client-originated, non-response message made of <paramref name="words"/> to the
    /// server. Does nothing when no server is available.
    /// </summary>
    private static void SendToServer(params byte[][] words)
    {
        var server = DeviceRepository.Server;
        if (server == null)
        {
            return;
        }

        server.Client.Send(new Message
        {
            IsFromServer = false,
            IsResponse = false,
            Words = words
        }, true);
    }

    /// <summary>
    /// Marks the input as quiet and remembers when that happened.
    /// </summary>
    private void GoneQuiet()
    {
        _state = AudioInputState.Quite;
        // UTC is immune to daylight-saving and time-zone changes while measuring elapsed time.
        _whenWentQuiet = DateTime.UtcNow;
    }

    /// <summary>
    /// Opens a new recognition stream on the server and switches to the recording state.
    /// Does nothing when no server is available.
    /// </summary>
    private void StartRecording()
    {
        if (DeviceRepository.Server == null)
        {
            return;
        }

        _streamId = Guid.NewGuid();
        _state = AudioInputState.Recording;
        Logger.Write(LogLevel.Debug, "Opening speech recognition stream from {0}", _name);
        SendToServer(
            new[] { (byte)Command.SpeechRecognizerStart },
            _streamId.ToByteArray(),
            BitConverter.GetBytes(_samplerate),
            BitConverter.GetBytes((short)_channels),
            BitConverter.GetBytes((short)_bitdepth));
    }

    /// <summary>
    /// Closes the current recognition stream (when a server is available) and returns to the
    /// waiting state.
    /// </summary>
    private void StopRecording()
    {
        Logger.Write(LogLevel.Debug, "Closing speech recognition stream from {0}", _name);
        SendToServer(
            new[] { (byte)Command.SpeechRecognizerEnd },
            _streamId.ToByteArray());
        _streamId = Guid.Empty;
        _state = AudioInputState.Waiting;
    }

    /// <summary>
    /// Converts 16-bit PCM bytes into samples scaled to the range [-1, 1).
    /// </summary>
    /// <param name="input">Raw PCM bytes; a trailing odd byte is ignored.</param>
    /// <returns>One double per 16-bit sample.</returns>
    private static double[] Convert16BitToDouble(byte[] input)
    {
        // Always two bytes per sample: deriving the count from the configured bit depth would
        // read past the end of the buffer (or divide by zero) for any depth other than 16.
        var sampleCount = input.Length / BytesPerSample;
        var output = new double[sampleCount];
        for (var n = 0; n < sampleCount; n++)
        {
            output[n] = BitConverter.ToInt16(input, n * BytesPerSample) / 32768.0;
        }
        return output;
    }
}
/// <summary>
/// Recording state of a <see cref="SpeechRecognitionListener"/>.
/// </summary>
internal enum AudioInputState
{
    /// <summary>No stream is open; this is the listener's initial state.</summary>
    Waiting = 0,

    /// <summary>A stream is open and the input level is high enough to keep recording.</summary>
    Recording = 1,

    /// <summary>
    /// A stream is open but the input has dropped below the level; the stream is closed once
    /// the quiet timeout elapses. ("Quite" is a misspelling of "Quiet", kept so existing
    /// references keep compiling.)
    /// </summary>
    Quite = 2
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment