Skip to content

Instantly share code, notes, and snippets.

@RickStrahl
Last active March 30, 2025 18:46
Show Gist options
  • Save RickStrahl/9b250c8bff67edd26b79e614b16955eb to your computer and use it in GitHub Desktop.
Save RickStrahl/9b250c8bff67edd26b79e614b16955eb to your computer and use it in GitHub Desktop.
Windows Media Speech Recognition/Dictation
using System;
using System.Diagnostics;
using System.Linq;
using System.Threading.Tasks;
using Windows.Media.SpeechRecognition;
using System.Windows.Controls;
using System.Windows.Input;
using Westwind.Utilities;
using MarkdownMonster.Windows;
using Windows.Globalization;
using System.Reflection;
using Windows.Foundation;
using System.Globalization;
using YamlDotNet.Core.Tokens;
using Key = System.Windows.Input.Key;
namespace MarkdownMonster.Utilities
{
/// <summary>
/// Windows.Media Speech Recognition wrapper for speech dictation.
/// </summary>
public class VoiceDictation
{
// Speech recognizer instance created in the constructor and released in Dispose().
private readonly SpeechRecognizer _recognizer;
// Set to true by EnsureCompiledAsync() once the recognizer constraints have been compiled.
private bool _isCompiled;
// Set to true by Dispose() so that cleanup only runs once.
private bool _isDisposed;
/// <summary>
/// True while a dictation session is considered active. Set by StartAsync()
/// after the recognition session has started and reset by Stop().
/// </summary>
public bool IsDictating { get; private set; }
/// <summary>
/// Key that starts dictation from the window-level key handler. Defaults to F4
/// and is replaced by the "StartDictation" key binding when one can be parsed.
/// </summary>
public Key StartDictationHotKey { get; set; } = Key.F4;
/// <summary>
/// Creates the speech recognizer (for the configured dictation language when one
/// is set, otherwise with the recognizer's default), attaches the dictation
/// constraint and the continuous-session events, and registers the window-level
/// key handler used to start and stop dictation.
/// </summary>
public VoiceDictation()
{
    var configuredLanguage = mmApp.Configuration.VoiceDictationLanguage;
    _recognizer = string.IsNullOrEmpty(configuredLanguage)
        ? new SpeechRecognizer()
        : new SpeechRecognizer(new Language(configuredLanguage));

    _recognizer.Constraints.Add(
        new SpeechRecognitionTopicConstraint(SpeechRecognitionScenario.Dictation, "dictation"));

    var session = _recognizer.ContinuousRecognitionSession;
    session.ResultGenerated += ContinuousRecognitionSession_ResultGenerated;
    session.Completed += ContinuousRecognitionSession_Completed;
    session.AutoStopSilenceTimeout = TimeSpan.FromMinutes(1);

    // Window-level key handling for the start hot key and ESC to stop
    Keyboard.AddKeyDownHandler(mmApp.Window, KeydownHandler);
    GetStartDictationHotkey();
}
/// <summary>
/// Looks up the "StartDictation" key binding on the main window and, if its key
/// string can be converted to a <see cref="Key"/>, uses it as the start hot key.
/// When no binding exists or the conversion fails the current hot key is kept.
/// </summary>
private void GetStartDictationHotkey()
{
    var bindingKey = (from binding in mmApp.Window.KeyBindings.KeyBindings
                      where binding.Id == "StartDictation"
                      select binding.Key).FirstOrDefault();

    if (bindingKey == null)
        return;

    try
    {
        var converter = new KeyConverter();
        StartDictationHotKey = (Key)converter.ConvertFromString(bindingKey);
    }
    catch
    {
        // Best effort: an unparsable binding leaves the existing hot key in place.
    }
}
/// <summary>
/// Window-level key handler: the start hot key begins dictation when idle,
/// and ESC stops it while dictation is running.
/// </summary>
private void KeydownHandler(object sender, KeyEventArgs e)
{
    if (IsDictating)
    {
        // While dictating only ESC is of interest
        if (e.Key != Key.Escape)
            return;

        Stop();
        Debug.WriteLine("StopAsync fired.");
        return;
    }

    if (e.Key == StartDictationHotKey)
    {
        StartAsync().FireAndForget();
        Debug.WriteLine("StartAsync fired...");
    }
}
#region Start and Stop Listening
// HRESULT reported by the speech platform when the user has not accepted the
// Windows online speech recognition privacy policy.
private const int PrivacyPolicyDeclinedHResult = unchecked((int)0x80045509);

// True while a start request is in flight (compiling constraints / starting the
// session). IsDictating only flips after those awaits complete, so this flag is
// what blocks re-entrant starts, e.g. from keyboard auto-repeat of the hot key.
private bool _isStarting;

/// <summary>
/// Starts the continuous speech recognition session. Does nothing if dictation
/// is already running or a start request is still in progress.
/// Ensures the recognizer constraints are compiled, starts continuous
/// recognition and shows a status message on the main window.
/// </summary>
/// <param name="listenMode">
/// Requested listen mode. Not currently read by this method.
/// </param>
/// <remarks>
/// If starting fails because the speech privacy policy has not been accepted,
/// the Windows speech privacy settings page is opened instead of throwing.
/// Any other exception propagates to the caller.
/// </remarks>
public async Task StartAsync(DictationListenModes listenMode = DictationListenModes.EscPressed)
{
    if (IsDictating || _isStarting)
        return;

    _isStarting = true;
    try
    {
        await EnsureCompiledAsync();
        var action = _recognizer.ContinuousRecognitionSession.StartAsync(); // Runtime .AsTask() doesn't work here
        await AsTask(action);
        IsDictating = true;
        mmApp.Window.Dispatcher.Invoke(() =>
        {
            mmApp.Window.ShowStatusProgress($"Listening for dictation ({_recognizer.CurrentLanguage.AbbreviatedName}). ESC to stop.");
        });
    }
    // The HRESULT check also works on non-English systems where the message text is localized;
    // the message check is kept for backwards compatibility.
    catch (Exception ex) when (ex.HResult == PrivacyPolicyDeclinedHResult || ex.Message.Contains("privacy"))
    {
        IsDictating = false;
        // Open the settings page for speech recognition
        ShellUtils.GoUrl("ms-settings:privacy-speech");
    }
    finally
    {
        _isStarting = false;
    }
}
/// <summary>
/// Schedules a stop of the speech recognition session on the window dispatcher
/// (after a 220ms delay). The session is only asked to stop when the recognizer
/// is not idle; <see cref="IsDictating"/> is reset in every case.
/// </summary>
public void Stop()
{
    mmApp.Window.Dispatcher.Delay(220, async () =>
    {
        bool sessionActive = _recognizer.State != SpeechRecognizerState.Idle;
        if (sessionActive)
        {
            try
            {
                // .AsTask() doesn't work here - use the reflection based helper
                await AsTask(_recognizer.ContinuousRecognitionSession.StopAsync());
            }
            catch (Exception stopError)
            {
                IsDictating = false;
                mmApp.Window.ShowStatusError($"Couldn't stop dictation engine properly: {stopError.Message}");
                mmApp.Log("couldn't stop dictation engine properly", stopError);
                return;
            }
        }

        // Reached both when the session was stopped and when it was already idle
        IsDictating = false;
        mmApp.Window.ShowStatusSuccess("Stopped listening for dictation.");
    });
}
#endregion
#region Result Handlers and Events
/// <summary>
/// Event handler for when speech is successfully recognized.
/// Inserts the recognized text into the focused TextBox, or - when no WPF
/// element has keyboard focus - into the active editor after fixing up
/// spacing and capitalization. The phrase "stop recording" stops dictation
/// instead of being inserted into the editor.
/// </summary>
/// <param name="sender">The continuous recognition session raising the event.</param>
/// <param name="args">Event data carrying the recognition result.</param>
private async void ContinuousRecognitionSession_ResultGenerated(SpeechContinuousRecognitionSession sender, SpeechContinuousRecognitionResultGeneratedEventArgs args)
{
    var result = args.Result;
    if (result == null || result.Status != SpeechRecognitionResultStatus.Success)
        return;
    if (!mmApp.Configuration.EnableVoiceDictation)
        return;

    var text = result.Text;
    if (string.IsNullOrEmpty(text))
        return;

    try
    {
        // InvokeAsync with an async lambda yields a Task for the lambda body.
        // Await both the dispatch and that inner task so failures are observed
        // here instead of being lost on a discarded task.
        Task insertTask = await mmApp.Window.Dispatcher.InvokeAsync(async () =>
        {
            var ctrl = Keyboard.FocusedElement;
            if (ctrl != null)
            {
                if (ctrl is TextBox tb)
                {
                    int caretIndex = tb.SelectionStart;
                    tb.Text = tb.Text.Insert(caretIndex, text);

                    // Position the caret from the saved index: assigning Text can
                    // reset the TextBox selection, so SelectionStart is no longer
                    // a reliable base at this point.
                    tb.SelectionStart = caretIndex + text.Length;
                    tb.SelectionLength = 0;
                }

                // Some other WPF element has focus: nothing to insert into
                return;
            }

            var editor = mmApp.Model?.ActiveEditor;
            if (editor == null)
                return;

            if (text.Trim().Equals("stop recording", StringComparison.OrdinalIgnoreCase))
            {
                Stop();
                return;
            }

            var fixedText = await FixUpDictatedText(text, editor);
            await editor.SetSelectionAndFocus(fixedText);
        });
        await insertTask;
    }
    catch (Exception ex)
    {
        // async void handler: an escaping exception would be unhandled, so log it instead
        mmApp.Log("Voice dictation: couldn't insert recognized text", ex);
    }
}
/// <summary>
/// Raised when the continuous recognition session completes (for example
/// after a timeout). If dictation is still flagged as running, stops it so
/// that state and status display are reset.
/// </summary>
/// <param name="sender">The session that completed.</param>
/// <param name="args">Completion details (not used).</param>
private void ContinuousRecognitionSession_Completed(SpeechContinuousRecognitionSession sender, SpeechContinuousRecognitionCompletedEventArgs args)
{
    if (!IsDictating)
        return;

    Stop();
}
/// <summary>
/// Adjusts dictated text so it fits the text in front of the editor's caret:
/// handles the spoken "space", "line break" and "return" commands, appends a
/// space after trailing punctuation, fixes the case of the first character
/// and prefixes a separating space when required.
/// </summary>
/// <param name="text">The recognized text.</param>
/// <param name="editor">Editor used to read the current line and selection.</param>
/// <returns>The text to insert at the caret position.</returns>
private async Task<string> FixUpDictatedText(string text, MarkdownDocumentEditor editor)
{
    if (string.IsNullOrEmpty(text))
        return text;

    var lineText = await editor.GetCurrentLine() ?? string.Empty;
    var selPoint = await editor.GetSelectionRange();

    // Only the text in front of the caret matters. Clamp the column so a
    // selection column beyond the returned line text can't make Substring throw.
    int caretColumn = Math.Clamp(selPoint.StartColumn, 0, lineText.Length);
    lineText = lineText.Substring(0, caretColumn);
    string trimLine = lineText.TrimEnd();

    if (text.Length > 1 && (text.EndsWith('.') || text.EndsWith('?') || text.EndsWith('!') || text.EndsWith(',')))
    {
        text += " ";
    }

    // Spoken commands produce exactly their whitespace. Returning here keeps the
    // generic "prefix a space" step below from turning "space" into two spaces
    // or putting a stray space in front of a line break.
    var command = text.Trim();
    if (command.Equals("space", StringComparison.OrdinalIgnoreCase) && !lineText.EndsWith(' '))
        return " ";
    if (command.Equals("line break", StringComparison.OrdinalIgnoreCase) ||
        command.Equals("return", StringComparison.OrdinalIgnoreCase))
        return "\n";

    if (!string.IsNullOrWhiteSpace(trimLine))
    {
        // A lone sentence terminator attaches directly to the preceding text
        if (text == "." || text == "?" || text == "!")
            return text;

        // Upper case the first character after a sentence terminator, lower case it mid-sentence
        bool startsSentence = trimLine.EndsWith('.') || trimLine.EndsWith('!') || trimLine.EndsWith('?');
        var firstChar = text[0].ToString();
        text = (startsSentence ? firstChar.ToUpper() : firstChar.ToLower()) + text.Substring(1);
    }
    // else: beginning of the line - casing is left as recognized

    // Add a space if the last character of the current line is not a space
    if (!string.IsNullOrWhiteSpace(lineText) && !lineText.EndsWith(' '))
    {
        text = " " + text;
    }

    return text;
}
/// <summary>
/// Compiles the recognizer's constraints the first time it is called;
/// subsequent calls return immediately once compilation has completed.
/// </summary>
private async Task EnsureCompiledAsync()
{
    if (_isCompiled)
        return;

    await AsTask<SpeechRecognitionCompilationResult>(_recognizer.CompileConstraintsAsync());
    _isCompiled = true;
}
#endregion
// Reflection cache for the WinRT-to-Task bridge methods on
// System.WindowsRuntimeSystemExtensions (looked up in Microsoft.Windows.SDK.NET).
// _asTaskMethod:        AsTask(IAsyncAction)
// _asTaskMethodGeneric: open generic AsTask<TResult>(...) definition; closed per call
MethodInfo _asTaskMethod = null;
MethodInfo _asTaskMethodGeneric = null;
Type _WindowsRuntimeSystemExtensionsType = null;

/// <summary>
/// Resolves (and caches) the System.WindowsRuntimeSystemExtensions type from the
/// loaded Microsoft.Windows.SDK.NET assembly. Returns null if it can't be found.
/// </summary>
Type ResolveWindowsRuntimeSystemExtensionsType()
{
    if (_WindowsRuntimeSystemExtensionsType != null)
        return _WindowsRuntimeSystemExtensionsType;

    try
    {
        var assembly = AppDomain.CurrentDomain.GetAssemblies()
            .FirstOrDefault(a => a.GetName()?.Name == "Microsoft.Windows.SDK.NET");

        // Direct lookup by name: avoids loading every type of the assembly via GetTypes()
        _WindowsRuntimeSystemExtensionsType = assembly?.GetType("System.WindowsRuntimeSystemExtensions");
    }
    catch (Exception ex)
    {
        Debug.WriteLine("VoiceDictation: couldn't resolve WindowsRuntimeSystemExtensions: " + ex.Message);
    }

    return _WindowsRuntimeSystemExtensionsType;
}

/// <summary>
/// Converts a WinRT IAsyncAction into an awaitable Task by invoking the
/// AsTask() extension method through reflection.
/// </summary>
/// <param name="action">The IAsyncAction instance to convert.</param>
/// <returns>
/// The task for the action, or a 20ms delay task as a best-effort fallback
/// when the AsTask method can't be resolved or doesn't return a Task.
/// </returns>
Task AsTask(object action)
{
    if (_asTaskMethod == null)
    {
        try
        {
            _asTaskMethod = ResolveWindowsRuntimeSystemExtensionsType()
                ?.GetMethod("AsTask", [typeof(IAsyncAction)]);
        }
        catch (Exception ex)
        {
            Debug.WriteLine("VoiceDictation: couldn't resolve AsTask(IAsyncAction): " + ex.Message);
        }
    }

    if (_asTaskMethod == null)
        return Task.Delay(20);

    return _asTaskMethod.Invoke(null, [action]) as Task ?? Task.Delay(20);
}

/// <summary>
/// Converts a WinRT IAsyncOperation&lt;T&gt; into an awaitable Task by invoking the
/// generic AsTask&lt;T&gt;() extension method through reflection.
/// Usage: await AsTask&lt;SpeechRecognitionCompilationResult&gt;(action);
/// </summary>
/// <typeparam name="T">Result type of the async operation.</typeparam>
/// <param name="action">The IAsyncOperation&lt;T&gt; instance to convert.</param>
/// <returns>
/// The task for the operation, or a 100ms delay task as a best-effort fallback
/// when the AsTask method can't be resolved or doesn't return a Task.
/// </returns>
Task AsTask<T>(object action)
{
    // Guard on the generic cache field. Guarding on _asTaskMethod (as before) meant the
    // generic method was never resolved once the non-generic one had been cached.
    if (_asTaskMethodGeneric == null)
    {
        try
        {
            var candidates = ResolveWindowsRuntimeSystemExtensionsType()?.GetMethods()
                .Where(m => m.Name == "AsTask" && m.IsGenericMethod && m.GetParameters().Length == 1)
                .ToList();

            if (candidates != null)
            {
                // Prefer the IAsyncOperation<TResult> overload explicitly; GetMethods() order
                // is not guaranteed and other one-parameter generic AsTask overloads exist.
                _asTaskMethodGeneric =
                    candidates.FirstOrDefault(m =>
                    {
                        var parameterType = m.GetParameters()[0].ParameterType;
                        return m.GetGenericArguments().Length == 1 &&
                               parameterType.IsGenericType &&
                               parameterType.GetGenericTypeDefinition() == typeof(IAsyncOperation<>);
                    })
                    ?? candidates.FirstOrDefault();
            }
        }
        catch (Exception ex)
        {
            Debug.WriteLine("VoiceDictation: couldn't resolve AsTask<T>: " + ex.Message);
        }
    }

    if (_asTaskMethodGeneric == null)
        return Task.Delay(100);

    MethodInfo closedMethod;
    try
    {
        // Close the generic definition per call so each T gets its own instantiation
        closedMethod = _asTaskMethodGeneric.MakeGenericMethod(typeof(T));
    }
    catch (ArgumentException)
    {
        return Task.Delay(100);
    }

    return closedMethod.Invoke(null, [action]) as Task ?? Task.Delay(100);
}
/// <summary>
/// Disposes the resources used by the listener.
/// Removes the window key handler, unhooks both recognition session event
/// handlers and disposes the speech recognizer. Safe to call more than once.
/// </summary>
public void Dispose()
{
    if (_isDisposed)
        return;

    Keyboard.RemoveKeyDownHandler(mmApp.Window, KeydownHandler);

    if (_recognizer != null)
    {
        var session = _recognizer.ContinuousRecognitionSession;
        session.ResultGenerated -= ContinuousRecognitionSession_ResultGenerated;
        // The constructor also subscribes Completed; unhook it too
        session.Completed -= ContinuousRecognitionSession_Completed;
        _recognizer.Dispose();
    }

    _isDisposed = true;
}
}
/// <summary>
/// Listen modes that can be passed to VoiceDictation.StartAsync().
/// NOTE(review): no code in this file reads the value - the meanings noted
/// on the members are inferred from their names and should be confirmed.
/// </summary>
public enum DictationListenModes
{
// Default value of the StartAsync() listenMode parameter.
// Presumably: keep listening until ESC is pressed - TODO confirm.
EscPressed,
// Presumably: open microphone / hands-free listening - TODO confirm.
OpenMic
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment