Last active
November 11, 2024 00:56
-
-
Save lselden/ab2e04fbac785e0644c4b562bf5e35cd to your computer and use it in GitHub Desktop.
Powershell script to say text using the WinRT speech synthesis API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<# | |
.SYNOPSIS | |
Speak text using SSML | |
.DESCRIPTION | |
Speak text using SSML, using built in MS speech synthesis. Will output metadata about result, including any embedded <mark>s | |
.PARAMETER Text | |
(default from pipeline) | |
The Text to speak. Text will automatically be wrapped in <speak> if necessary. | |
.PARAMETER Path | |
Output to disk instead of speakers. This is output filename. Output is always in WAV format (PCM s16le) | |
.PARAMETER Variable | |
Name of a global variable that receives the audio as a byte array. Output is always in WAV format (PCM s16le)
.PARAMETER listVoices | |
If -listVoices is passed then this function will just output a list of available | |
voices in the format {languageCode, id, name, ssmlGender} | |
.PARAMETER Voice | |
Name of voice to use. To get list of voices use -listVoices option | |
.PARAMETER Rate | |
Speech rate in the range 0.33 - 3, where 1 is normal speed (100%) and 2 is twice as fast. Values below 0.5 are raised to 0.5 before being passed to the synthesizer.
.PARAMETER Volume | |
Volume in range 0-1, 1 (default) is full volume | |
.PARAMETER SampleRate | |
Sample rate of the output WAV file. Default is 24000. NOTE: the script body does not currently read this value.
.PARAMETER Channels | |
Number of channels of the output WAV file. Default is 1. NOTE: the script body does not currently read this value.
.PARAMETER Lang | |
Language to use. Default is the default voice's language. | |
.PARAMETER SpeechMarkTypes | |
Marks to include in the output. Default is "sentence,words,ssml". Set to "" to suppress mark output entirely.
.INPUTS | |
System.String. You can pipe the "Text" parameter into the script | |
.OUTPUTS | |
Will play sound to speakers by default, or write to disk if -Path is specified. | |
Output is the list of speech marks found in the synthesized audio (emitted unless -SpeechMarkTypes is ""):
Array<{type: string (word | sentence | viseme | ssml | unknown), time: int (milliseconds), value: string (cue text / mark name attribute), start?: int, end?: int (position in the input, when the cue reports one)}>
.EXAMPLE | |
PS> ./out-ssml-winrt.ps1 "hello world" | |
#> | |
param(
    # Text (plain or SSML) to speak; accepted from the pipeline.
    [Parameter(ValueFromPipeline = $true)]
    [string] $text,

    # Voice name, matched as a case-insensitive regex against the voice display name.
    [Parameter(Mandatory = $false)]
    [string] $voice,

    # Speaking rate multiplier (1.0 = normal).
    [Parameter(Mandatory = $false)]
    [ValidateRange(0.33, 3.0)]
    [double] $rate = 1.0,

    # Output volume (1.0 = full volume).
    [Parameter(Mandatory = $false)]
    [ValidateRange(0.0, 1.0)]
    [double] $volume = 1.0,

    # When set, the WAV data is written to this file instead of being played.
    [Parameter(Mandatory = $false)]
    [string] $path,

    # When set, the WAV data is stored in a global variable with this name.
    [Parameter(Mandatory = $false)]
    [string] $variable,

    [Parameter(Mandatory = $false)]
    [int] $sampleRate = 24000,

    [Parameter(Mandatory = $false)]
    [int] $channels = 1,

    [Parameter(Mandatory = $false)]
    [string] $lang,

    # Comma separated list of mark types to request; "" disables mark output.
    [Parameter(Mandatory = $false)]
    [string] $speechMarkTypes = "sentence,words,ssml",

    # Only list the available voices instead of speaking.
    [Switch] $listVoices
)
begin { | |
# Bring in the .NET <-> WinRT interop helpers (AsTask, AsBuffer, ...).
Add-Type -AssemblyName System.Runtime.WindowsRuntime

# Touch each WinRT type once so that it is resolved up front; the values themselves are discarded.
$null = [Windows.Foundation.IAsyncOperation`1, Windows.Foundation, ContentType=WindowsRuntime]
$null = [Windows.Foundation.IAsyncOperationWithProgress`2, Windows.Foundation, ContentType=WindowsRuntime]
$null = [Windows.Media.SpeechSynthesis.SpeechSynthesizer, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
$null = [Windows.Media.SpeechSynthesis.VoiceInformation, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
$null = [Windows.Media.SpeechSynthesis.SpeechSynthesisStream, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]
$null = [Windows.Media.Core.SpeechCue, Windows.Media.Core, ContentType=WindowsRuntime]
$null = [Windows.Media.Core.TimedMetadataTrack, Windows.Media.Core, ContentType=WindowsRuntime]

# All single-argument AsTask overloads; the generic definitions used by
# Await / AwaitWithProgress are picked out of this set by parameter type name.
$_taskMethods = [System.WindowsRuntimeSystemExtensions].GetMethods() |
    Where-Object { ($_.Name -eq 'AsTask') -and ($_.GetParameters().Count -eq 1) }

# AsTask<TResult>(IAsyncOperation<TResult>)
$asTaskGeneric = (
    $_taskMethods |
        Where-Object { $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation`1' }
)[0]

# AsTask<TResult, TProgress>(IAsyncOperationWithProgress<TResult, TProgress>)
$asTaskGeneric2 = (
    $_taskMethods |
        Where-Object { $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperationWithProgress`2' }
)[0]
Function Await($WinRtTask, $ResultType) {
    # Blocks until a WinRT IAsyncOperation<ResultType> completes and returns its result.
    # $WinRtTask  - the pending WinRT async operation.
    # $ResultType - the operation's result type, used to close the generic AsTask method.
    $netTask = $asTaskGeneric.MakeGenericMethod($ResultType).Invoke($null, @($WinRtTask))
    # Wait(-1) waits indefinitely; its boolean return value is discarded.
    [void]$netTask.Wait(-1)
    return $netTask.Result
}
Function AwaitWithProgress($WinRtTask, $ResultType1, $ResultType2) {
    # Blocks until a WinRT IAsyncOperationWithProgress<ResultType1, ResultType2> completes.
    # Nothing is written to the pipeline: the operation's result is not returned.
    $netTask = $asTaskGeneric2.MakeGenericMethod($ResultType1, $ResultType2).Invoke($null, @($WinRtTask))
    # Wait(-1) waits indefinitely; its boolean return value is discarded.
    [void]$netTask.Wait(-1)
}
Function ParseMarkers($timedTextTracks) {
    # Flattens the cues of every timed metadata track into a stream of mark objects.
    #
    # $timedTextTracks - collection of tracks; each track exposes Id and Cues, each cue exposes
    #                    StartTime, Text and (optionally) StartPositionInInput / EndPositionInInput.
    #
    # Emits one PSCustomObject per cue that has non-empty text:
    #   type  - mark type derived from the track Id (word, sentence, viseme, ssml, unknown)
    #   time  - cue start time in whole milliseconds
    #   value - cue text
    #   start / end - input positions, only present when the cue reports a start position
    foreach ($track in $timedTextTracks) {
        $markType = switch ($track.Id) {
            "SpeechWord" { "word" }
            "SpeechSentence" { "sentence" }
            "SpeechViseme" { "viseme" }
            "SpeechBookmark" { "ssml" }
            Default { "unknown" }
        }
        foreach ($cue in $track.Cues) {
            # Cues without text carry no usable mark.
            if (-not $cue.Text) {
                continue
            }
            $mark = [ordered]@{
                type  = $markType
                time  = [int]$cue.StartTime.TotalMilliseconds
                value = $cue.Text
            }
            # Compare against $null explicitly: a truthiness test would treat a
            # legitimate position of 0 as "no position" and drop start/end.
            if ($null -ne $cue.StartPositionInInput) {
                $mark.start = $cue.StartPositionInInput
                $mark.end = $cue.EndPositionInInput
            }
            # Emit directly to the pipeline instead of growing an array with +=.
            [PSCustomObject]$mark
        }
    }
}
Function PlayWave([System.Byte[]]$bytes) {
    # Plays WAV data synchronously through System.Media.SoundPlayer.
    # $bytes - complete WAV file contents.
    # The stream and player are released even when playback throws
    # (for example on data that SoundPlayer cannot decode).
    $memstream = [System.IO.MemoryStream]::new($bytes)
    try {
        $player = [System.Media.SoundPlayer]::new($memstream)
        try {
            # PlaySync blocks until playback has finished.
            $player.PlaySync()
        } finally {
            $player.Dispose()
        }
    } finally {
        $memstream.Dispose()
    }
}
Function SaveWave($path, [System.Byte[]]$bytes) {
    # Writes $bytes to $path, replacing any existing file.
    # A path that is not rooted is resolved against the current PowerShell location ($pwd).
    $target = $path
    if (-not [System.IO.Path]::IsPathRooted($path)) {
        $combined = Join-Path $pwd $path
        $target = [System.IO.Path]::GetFullPath($combined)
    }
    [System.IO.File]::WriteAllBytes($target, $bytes)
}
Function WaveToVariable($variable, [System.Byte[]]$bytes) {
    # Stores the WAV byte array in the global variable named by $variable.
    $assignment = @{
        Scope = 'global'
        Name  = $variable
        Value = $bytes
    }
    Set-Variable @assignment
}
# Cache the installed voices once for all pipeline items. Every read and write
# uses the explicit script: scope so the fallback below lands in the same
# variable the process block reads, even if this code runs in a nested scope.
$script:voices = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices
if (-not $script:voices.Id) {
    # No voice exposed an Id: fall back to a one-element list holding the default voice.
    Write-Debug "Unable to get installed voices list. Script will only use default voice"
    $script:voices = @([Windows.Media.SpeechSynthesis.SpeechSynthesizer]::DefaultVoice)
}
} | |
process {
    # Runs once per pipeline item: synthesizes $text as SSML, emits the speech
    # marks, then stores, saves or plays the resulting WAV data.

    # -listVoices short-circuits synthesis and only reports the known voices.
    if ($listVoices) {
        return $script:voices | ForEach-Object {
            [PSCustomObject]@{
                languageCode = $_.Language
                id           = $_.DisplayName
                name         = $_.Description
                ssmlGender   = $_.Gender
            }
        }
    }
    if (-not $text) {
        Write-Error "No text specified"
        return
    }

    $speech = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::new()
    $stream = $null
    try {
        $speech.Options.AppendedSilence = [Windows.Media.SpeechSynthesis.SpeechAppendedSilence]::Min

        # Voice selection: an explicit -voice wins; otherwise -lang picks the
        # first voice whose language tag equals it. The synthesizer keeps its
        # default voice when nothing matches.
        $voiceInfo = $null
        if ($voice) {
            $voiceInfo = $script:voices | Where-Object { $_.DisplayName -imatch $voice } | Select-Object -First 1
            if (-not $voiceInfo) {
                Write-Debug "No voice found matching $voice"
            }
        } elseif ($lang) {
            $voiceInfo = $script:voices | Where-Object { $_.Language -ieq $lang } | Select-Object -First 1
            if (-not $voiceInfo) {
                Write-Debug "No voice found for language $lang"
            }
        }
        if ($voiceInfo) {
            $speech.Voice = $voiceInfo
        }

        if ($speechMarkTypes -match 'sentence') {
            $speech.Options.IncludeSentenceBoundaryMetadata = $true
        }
        # 'word' matches both "word" and the documented "words".
        if ($speechMarkTypes -match 'word') {
            $speech.Options.IncludeWordBoundaryMetadata = $true
        }

        # Min/Max instead of [math]::Clamp, which is not available on .NET Framework.
        if ($rate -ne 1.0) {
            $speech.Options.SpeakingRate = [math]::Min([math]::Max($rate, 0.5), 6.0)
        }
        if ($volume -ne 1.0) {
            # AudioVolume is a member of the synthesizer's Options, not of the synthesizer itself.
            $speech.Options.AudioVolume = [math]::Min([math]::Max($volume, 0.0), 1.0)
        }

        # Wrap plain text in <speak>, then normalize the root element's attributes.
        $ssmlNamespace = 'http://www.w3.org/2001/10/synthesis'
        if (-not $text.Trim().StartsWith('<speak')) {
            $text = [System.Security.SecurityElement]::Escape($text)
            $text = "<speak version=`"1.0`">$text</speak>"
        }
        $dom = [xml]$text
        $dom.speak.SetAttribute('version', '1.0')
        $dom.speak.SetAttribute('xml:lang', $speech.Voice.Language)
        $dom.speak.SetAttribute('xmlns', $ssmlNamespace)
        $text = $dom.speak.OuterXml

        # actually speak - create data stream
        try {
            $stream = Await ($speech.SynthesizeSsmlToStreamAsync($text)) ([Windows.Media.SpeechSynthesis.SpeechSynthesisStream])
        } catch {
            Write-Error "Error creating stream $_"
            # $_ is an ErrorRecord; the underlying failures are reached through
            # the InnerException chain of its Exception.
            $inner = $_.Exception.InnerException
            while ($inner) {
                Write-Error "$($inner.GetType().Name), $($inner.Message)"
                $inner = $inner.InnerException
            }
            return
        }
        if (-not $stream.Size) {
            # error occurred
            Write-Error "Error Creating Synthesis Stream - no results"
            return
        }

        if ($speechMarkTypes -ne '') {
            $markers = ParseMarkers $stream.TimedMetadataTracks
            $markers
        }

        # create destination buffer
        $bytes = [array]::CreateInstance([byte], $stream.Size)
        [Windows.Storage.Streams.IBuffer]$buffer = [System.Runtime.InteropServices.WindowsRuntime.WindowsRuntimeBufferExtensions]::AsBuffer($bytes)
        # wait for buffer copy
        AwaitWithProgress ($stream.ReadAsync($buffer, [uint32]$stream.Size, [Windows.Storage.Streams.InputStreamOptions]::None)) ([Windows.Storage.Streams.IBuffer]) ([UInt32])

        # write out
        if ($variable) {
            WaveToVariable $variable $bytes
        } elseif ($path) {
            SaveWave $path $bytes
        } else {
            PlayWave $bytes
        }
    } finally {
        # Release the per-item resources on every exit path (including the
        # early returns above), then clear the variables so that no later
        # code disposes the same objects a second time.
        if ($stream) {
            $stream.Dispose()
        }
        $speech.Dispose()
        $stream = $null
        $speech = $null
    }
}
end {
    # Dispose whatever stream / synthesizer is still referenced once the pipeline is done.
    foreach ($resource in @($stream, $speech)) {
        if ($resource) {
            $resource.Dispose()
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment