Last active: March 30, 2025 18:46
Save RickStrahl/9b250c8bff67edd26b79e614b16955eb to your computer and use it in GitHub Desktop.
Windows Media Speech Recognition/Dictation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Diagnostics; | |
using System.Linq; | |
using System.Threading.Tasks; | |
using Windows.Media.SpeechRecognition; | |
using System.Windows.Controls; | |
using System.Windows.Input; | |
using Westwind.Utilities; | |
using MarkdownMonster.Windows; | |
using Windows.Globalization; | |
using System.Reflection; | |
using Windows.Foundation; | |
using System.Globalization; | |
using YamlDotNet.Core.Tokens; | |
using Key = System.Windows.Input.Key; | |
namespace MarkdownMonster.Utilities | |
{ | |
/// <summary> | |
/// Windows.Media Speech Recognition wrapper for speech dictation. | |
/// </summary> | |
public class VoiceDictation | |
{ | |
private readonly SpeechRecognizer _recognizer; | |
private bool _isCompiled; | |
private bool _isDisposed; | |
/// <summary> | |
/// Keep track of recording status | |
/// </summary> | |
public bool IsDictating { get; private set; } | |
public Key StartDictationHotKey { get; set; } = Key.F4; | |
public VoiceDictation() | |
{ | |
if (string.IsNullOrEmpty(mmApp.Configuration.VoiceDictationLanguage)) | |
_recognizer = new SpeechRecognizer(); | |
else | |
_recognizer = new SpeechRecognizer(new Language(mmApp.Configuration.VoiceDictationLanguage)); | |
var dictation = new SpeechRecognitionTopicConstraint(SpeechRecognitionScenario.Dictation, "dictation"); | |
_recognizer.Constraints.Add(dictation); | |
_recognizer.ContinuousRecognitionSession.ResultGenerated += ContinuousRecognitionSession_ResultGenerated; | |
_recognizer.ContinuousRecognitionSession.Completed += ContinuousRecognitionSession_Completed; | |
_recognizer.ContinuousRecognitionSession.AutoStopSilenceTimeout = TimeSpan.FromMinutes(1); | |
// Hook up hot keys for start and stop operations | |
Keyboard.AddKeyDownHandler(mmApp.Window, KeydownHandler); | |
GetStartDictationHotkey(); | |
} | |
private void GetStartDictationHotkey() | |
{ | |
var startKey = mmApp.Window.KeyBindings.KeyBindings | |
.Where(b => b.Id == "StartDictation") | |
.Select(b => b.Key) | |
.FirstOrDefault(); | |
if (startKey == null) return; | |
try | |
{ | |
var k = new KeyConverter(); | |
StartDictationHotKey = (Key)k.ConvertFromString(startKey); | |
} | |
catch { } | |
} | |
private void KeydownHandler(object sender, KeyEventArgs e) | |
{ | |
if (!IsDictating && e.Key == StartDictationHotKey) | |
{ | |
StartAsync().FireAndForget(); | |
Debug.WriteLine("StartAsync fired..."); | |
} | |
else if (IsDictating && e.Key == Key.Escape) | |
{ | |
Stop(); | |
Debug.WriteLine("StopAsync fired."); | |
} | |
} | |
#region Start and Stop Listening | |
/// <summary> | |
/// Starts the speech recognition session. If the recognizer is not idle, it does nothing. | |
/// Ensures the recognizer is compiled and starts continuous recognition. | |
/// </summary> | |
public async Task StartAsync(DictationListenModes listenMode = DictationListenModes.EscPressed) | |
{ | |
if (IsDictating) | |
return; | |
try | |
{ | |
await EnsureCompiledAsync(); | |
var action = _recognizer.ContinuousRecognitionSession.StartAsync(); // Runtime .AsTask() doesn't work here | |
await AsTask(action); | |
IsDictating = true; | |
mmApp.Window.Dispatcher.Invoke(() => | |
{ | |
mmApp.Window.ShowStatusProgress($"Listening for dictation ({_recognizer.CurrentLanguage.AbbreviatedName}). ESC to stop."); | |
}); | |
} | |
catch (Exception ex) when (ex.Message.Contains("privacy")) | |
{ | |
IsDictating = false; | |
// Open the settings page for speech recognition | |
ShellUtils.GoUrl("ms-settings:privacy-speech"); | |
} | |
} | |
/// <summary> | |
/// Stops the speech recognition session if it is currently running. | |
/// </summary> | |
public void Stop() | |
{ | |
mmApp.Window.Dispatcher.Delay(220, async () => | |
{ | |
if (_recognizer.State != SpeechRecognizerState.Idle) | |
{ | |
try | |
{ | |
var action = _recognizer.ContinuousRecognitionSession.StopAsync(); // .AsTask() doesn't work here | |
await AsTask(action); | |
} | |
catch (Exception ex) | |
{ | |
IsDictating = false; | |
mmApp.Window.ShowStatusError($"Couldn't stop dictation engine properly: {ex.Message}"); | |
mmApp.Log("couldn't stop dictation engine properly", ex); | |
return; | |
} | |
} | |
// always | |
IsDictating = false; | |
mmApp.Window.ShowStatusSuccess("Stopped listening for dictation."); | |
}); | |
} | |
#endregion | |
#region Result Handlers and Events | |
/// <summary> | |
/// Event handler for when speech is successfully recognized. | |
/// Simulates keyboard input based on the recognized text. | |
/// </summary> | |
private async void ContinuousRecognitionSession_ResultGenerated(SpeechContinuousRecognitionSession sender, SpeechContinuousRecognitionResultGeneratedEventArgs args) | |
{ | |
if (args.Result.Status != SpeechRecognitionResultStatus.Success) | |
return; | |
if (!mmApp.Configuration.EnableVoiceDictation) | |
return; | |
var text = args.Result?.Text; | |
if (string.IsNullOrEmpty(text)) | |
return; | |
await mmApp.Window.Dispatcher.InvokeAsync(async () => | |
{ | |
var ctrl = Keyboard.FocusedElement; | |
if (ctrl != null) | |
{ | |
if (ctrl is TextBox tb) | |
{ | |
int caretIndex = tb.SelectionStart; | |
tb.Text = tb.Text.Insert(caretIndex, text); | |
tb.SelectionStart = tb.SelectionStart + text.Length; | |
tb.SelectionLength = 0; | |
return; | |
} | |
return; | |
} | |
var editor = mmApp.Model?.ActiveEditor; | |
if (editor == null) | |
return; | |
if (text.Trim().Equals("stop recording", StringComparison.OrdinalIgnoreCase)) | |
{ | |
Stop(); | |
return; | |
} | |
text = await FixUpDictatedText(text, editor); | |
await editor?.SetSelectionAndFocus(text); | |
}); | |
} | |
/// <summary> | |
/// Captures timeouts or other operations where dictation is completed but for | |
/// some reason dictation was not stopped. | |
/// </summary> | |
/// <param name="sender"></param> | |
/// <param name="args"></param> | |
private void ContinuousRecognitionSession_Completed(SpeechContinuousRecognitionSession sender, SpeechContinuousRecognitionCompletedEventArgs args) | |
{ | |
if (IsDictating) | |
{ | |
Stop(); | |
} | |
} | |
private async Task<string> FixUpDictatedText(string text, MarkdownDocumentEditor editor) | |
{ | |
var lineText = await editor.GetCurrentLine() ?? string.Empty; | |
var selPoint = await editor.GetSelectionRange(); | |
lineText = lineText.Substring(0, selPoint.StartColumn); | |
string trimLine = lineText.TrimEnd(); | |
if (text.Length > 1 && (text.EndsWith('.') || text.EndsWith('?') || text.EndsWith('!') || text.EndsWith(','))) | |
{ | |
text += " "; | |
} | |
if (text.Trim().Equals("space", StringComparison.OrdinalIgnoreCase) && !lineText.EndsWith(' ')) | |
text = " "; | |
if (text.Trim().Equals("line break", StringComparison.OrdinalIgnoreCase) || | |
text.Trim().Equals("return", StringComparison.OrdinalIgnoreCase)) | |
text = "\n"; | |
// Capitalize the first character of the text if the last character of the current line is a sentence terminator | |
var firstChar = text[0]; | |
if (string.IsNullOrWhiteSpace(trimLine)) | |
{ | |
/* Beginning of the line: do nothing. */ | |
} | |
else if (text == "." || text == "?" || text == "!") | |
{ | |
return text; | |
} | |
else if (trimLine.EndsWith('.') || trimLine.EndsWith('!') || trimLine.EndsWith('?')) | |
{ | |
text = firstChar.ToString().ToUpper() + text.Substring(1); | |
} | |
else | |
{ | |
text = firstChar.ToString().ToLower() + text.Substring(1); | |
} | |
// Add a space if the last character of the current line is not a space | |
if (!string.IsNullOrWhiteSpace(lineText) && !lineText.EndsWith(' ')) | |
{ | |
text = " " + text; | |
} | |
return text; | |
} | |
/// <summary> | |
/// Ensures that the speech recognizer's constraints are compiled before starting recognition. | |
/// This is necessary to prepare the recognizer for accurate speech recognition. | |
/// </summary> | |
private async Task EnsureCompiledAsync() | |
{ | |
if (!_isCompiled) | |
{ | |
var action = _recognizer.CompileConstraintsAsync(); | |
//await Task.Delay(100); | |
await AsTask<SpeechRecognitionCompilationResult>(action); | |
_isCompiled = true; | |
} | |
} | |
#endregion | |
MethodInfo _asTaskMethod = null; | |
MethodInfo _asTaskMethodGeneric = null; | |
Type _WindowsRuntimeSystemExtensionsType = null; | |
Task AsTask(object action) | |
{ | |
if (_asTaskMethod == null) | |
{ | |
try | |
{ | |
if (_WindowsRuntimeSystemExtensionsType == null) | |
{ | |
var assemblies = AppDomain.CurrentDomain.GetAssemblies(); | |
var assembly = assemblies | |
.FirstOrDefault(a => a.GetName()?.Name == "Microsoft.Windows.SDK.NET"); | |
var types = assembly.GetTypes(); | |
_WindowsRuntimeSystemExtensionsType = types.FirstOrDefault(t => t.FullName == "System.WindowsRuntimeSystemExtensions"); | |
} | |
_asTaskMethod = _WindowsRuntimeSystemExtensionsType.GetMethod("AsTask", [typeof(IAsyncAction)]); | |
} | |
catch { } | |
} | |
if (_asTaskMethod == null) | |
return Task.Delay(20); | |
var t = _asTaskMethod.Invoke(null, [action]) as Task; | |
if (t == null) | |
return Task.Delay(20); | |
return t; | |
} | |
//await AsTask<SpeechRecognitionCompilationResult>(action); | |
Task AsTask<T>(object action) | |
{ | |
if (_asTaskMethod == null) | |
{ | |
try | |
{ | |
if (_WindowsRuntimeSystemExtensionsType == null) | |
{ | |
var assemblies = AppDomain.CurrentDomain.GetAssemblies(); | |
var assembly = assemblies | |
.FirstOrDefault(a => a.GetName()?.Name == "Microsoft.Windows.SDK.NET"); | |
var types = assembly.GetTypes(); | |
_WindowsRuntimeSystemExtensionsType = types.FirstOrDefault(t => t.FullName == "System.WindowsRuntimeSystemExtensions"); | |
} | |
var method = _WindowsRuntimeSystemExtensionsType.GetMethods() | |
.FirstOrDefault(m => m.Name == "AsTask" && m.IsGenericMethod && m.GetParameters().Length == 1); | |
_asTaskMethodGeneric = method.MakeGenericMethod(typeof(T)); | |
} | |
catch { } | |
} | |
if (_asTaskMethodGeneric == null) | |
return Task.Delay(100); | |
var t = _asTaskMethodGeneric.Invoke(null, [action]) as Task; | |
if (t == null) | |
return Task.Delay(100); | |
return t; | |
} | |
/// <summary> | |
/// Disposes the resources used by the listener. | |
/// Unhooks event handlers and disposes the speech recognizer. | |
/// </summary> | |
public void Dispose() | |
{ | |
if (!_isDisposed) | |
{ | |
Keyboard.RemoveKeyDownHandler(mmApp.Window, KeydownHandler); | |
_recognizer.ContinuousRecognitionSession.ResultGenerated -= ContinuousRecognitionSession_ResultGenerated; | |
_recognizer?.Dispose(); | |
_isDisposed = true; | |
} | |
} | |
} | |
public enum DictationListenModes | |
{ | |
EscPressed, | |
OpenMic | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment