Last active
March 30, 2020 16:18
-
-
Save MattJeanes/75e57b38c44a1260f1e692edd4c5e6f5 to your computer and use it in GitHub Desktop.
Normalize text, ported from https://github.com/mozilla/TTS/blob/master/utils/text/cleaners.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Patterns consumed by Get-NormalizedNumbers.
# Digit run containing comma separators, e.g. "1,234,567".
$commaNumberRegex = [regex]::new('([0-9][0-9\,]+[0-9])')
# Decimal literal with digits on both sides of the point, e.g. "3.14".
$decimalNumberRegex = [regex]::new('([0-9]+\.[0-9]+)')
# Pound amount; group 1 holds the digits (commas allowed, no decimal point).
$poundsRegex = [regex]::new('£([0-9\,]*[0-9]+)')
# Dollar amount; group 1 may contain commas and decimal points.
$dollarsRegex = [regex]::new('\$([0-9\.\,]*[0-9]+)')
# Ordinal such as "21st"; group 1 is the number, group 2 the suffix.
$ordinalRegex = [regex]::new('([0-9]+)(st|nd|rd|th)')
# Any run of digits.
$numberRegex = [regex]::new('[0-9]+')
# ConvertTo-Ascii calls ConvertFrom-Unicode; make sure the PSUnidecode module
# is installed (installing it when no copy is listed) and then load it.
if (@(Get-Module -ListAvailable -Name 'PSUnidecode').Count -eq 0) {
    Install-Module -Name 'PSUnidecode' -Force
}
Import-Module -Name 'PSUnidecode'
# Spells out $n in words using the $units, $tens and $digitGroups tables.
#   $n           - non-negative whole number to spell out.
#   $digit_group - index into $digitGroups naming the three-digit group that
#                  $n occupies (0 for the lowest group).
# Returns the space-separated words with no leading or trailing whitespace.
function Get-StandardNumberToWords($n, $digit_group) {
    $words = @()

    if ($n -ge 1000) {
        # Everything above the lowest three digits belongs to the next group up.
        $higher = [math]::Floor([decimal]($n / 1000))
        $words += (Get-StandardNumberToWords $higher ($digit_group + 1))
        $n = $n % 1000
    }

    if ($n -ge 100) {
        $hundreds = [math]::Floor([decimal]($n / 100))
        $words += "$($units[$hundreds]) hundred"
    }

    $belowHundred = $n % 100
    if ($belowHundred -lt $units.Count) {
        # Values covered directly by the $units table.
        $words += $units[$belowHundred]
    }
    else {
        $words += $tens[[math]::Floor([decimal]($belowHundred / 10))]
        $words += $units[$belowHundred % 10]
    }

    if ($n -gt 0) {
        # Name the group only when it contributed something (so 1000000 is
        # "one million", not "one million thousand").
        $words += $digitGroups[$digit_group]
    }

    $spoken = @($words | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
    return ($spoken -join ' ').Trim()
}
# Converts a non-negative whole number to English words.
#   $n - the number to convert.
# Returns:
#   - the digits as a string when $n is 10^18 or larger (beyond the
#     'quadrillion' entry of $digitGroups),
#   - 'zero' for 0,
#   - "<words> hundred" for multiples of 100 below 3000 that are not multiples
#     of 1000 (e.g. 1100 becomes "eleven hundred"),
#   - otherwise the result of Get-StandardNumberToWords.
function Get-NumberToWords($n) {
    # Handle special cases first, then go to the standard case:
    if ($n -ge 1000000000000000000) {
        # Too large, just return the digits. (The previous `str(n)` was
        # un-ported Python and would have been run as a command named 'str'.)
        return "$n"
    }
    if ($n -eq 0) {
        return 'zero'
    }
    if ($n % 100 -eq 0 -and $n % 1000 -ne 0 -and $n -lt 3000) {
        return (Get-StandardNumberToWords ([math]::Floor([decimal]($n / 100))) 0) + ' hundred'
    }
    return Get-StandardNumberToWords $n 0
}
# Words for 0 through 19; index 0 is the empty string so a zero contributes
# no word.
$units = @('') + ('one two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen' -split ' ')

# Words for multiples of ten, indexed by the tens digit (index 0 is empty).
$tens = @('') + ('ten twenty thirty forty fifty sixty seventy eighty ninety' -split ' ')

# Names of successive three-digit groups; the lowest group has no name.
$digitGroups = @('') + ('thousand million billion trillion quadrillion' -split ' ')
# Irregular ordinal endings: when a spelled-out number ends with Suffix, that
# ending is swapped for Replacement (see $expandOrdinals).
$ordinalSuffixes = @(
    @(
        @('one', 'first'),
        @('two', 'second'),
        @('three', 'third'),
        @('five', 'fifth'),
        @('eight', 'eighth'),
        @('nine', 'ninth'),
        @('twelve', 'twelfth'),
        @('ty', 'tieth')
    ) | ForEach-Object { @{ Suffix = $_[0]; Replacement = $_[1] } }
)
# Match evaluator: returns the matched text with every comma removed.
$removeCommas = {
    param($numberMatch)
    $withCommas = $numberMatch.Value
    return $withCommas.Replace(',', '')
}
# Match evaluator: rewrites each '.' in capture group 1 as ' point '.
$expandDecimalPoints = {
    param($decimalMatch)
    $pieces = $decimalMatch.Groups[1].Value.Split('.')
    return $pieces -join ' point '
}
# Match evaluator for pound amounts. $args[0] is the regex match; capture
# group 1 holds the amount without the pound sign.
# Returns "<n> pound(s)", "<n> penny/pence", both joined by ", ", or
# 'zero pounds' when both parts are zero. The numbers are left as digits.
# NOTE(review): $poundsRegex as defined in this file does not capture '.',
# so with that pattern only the whole-pound paths are reachable.
$expandPounds = {
    $match = $args[0]
    $amount = $match.Groups[1].Value
    $parts = $amount.Split('.')
    if ($parts.Count -gt 2) {
        # Unexpected format: return the captured amount with the unit appended.
        # (The previous `return match + ...` was un-ported Python and failed
        # at runtime as an unknown command.)
        return $amount + ' pounds'
    }
    # [decimal] rather than [int] so amounts above Int32 range do not throw.
    $pounds = if ($parts[0]) { [decimal]$parts[0] } else { 0 }
    $pence = if ($parts.Count -gt 1) { [decimal]$parts[1] } else { 0 }
    if ($pounds -and $pence) {
        $pound_unit = if ($pounds -eq 1) { 'pound' } else { 'pounds' }
        $pence_unit = if ($pence -eq 1) { 'penny' } else { 'pence' }
        return "$pounds $pound_unit, $pence $pence_unit"
    }
    if ($pounds) {
        $pound_unit = if ($pounds -eq 1) { 'pound' } else { 'pounds' }
        return "$pounds $pound_unit"
    }
    if ($pence) {
        $pence_unit = if ($pence -eq 1) { 'penny' } else { 'pence' }
        return "$pence $pence_unit"
    }
    return 'zero pounds'
}
# Match evaluator for dollar amounts. $args[0] is the regex match; capture
# group 1 holds the amount without the dollar sign.
# Returns "<n> dollar(s)", "<n> cent(s)", both joined by ", ", or
# 'zero dollars' when both parts are zero. The numbers are left as digits.
$expandDollars = {
    $match = $args[0]
    $amount = $match.Groups[1].Value
    $parts = $amount.Split('.')
    if ($parts.Count -gt 2) {
        # Unexpected format: return the captured amount with the unit appended.
        # (The previous `return match + ...` was un-ported Python and failed
        # at runtime as an unknown command.)
        return $amount + ' dollars'
    }
    # [decimal] rather than [int] so amounts above Int32 range do not throw.
    $dollars = if ($parts[0]) { [decimal]$parts[0] } else { 0 }
    $cents = if ($parts.Count -gt 1) { [decimal]$parts[1] } else { 0 }
    if ($dollars -and $cents) {
        $dollar_unit = if ($dollars -eq 1) { 'dollar' } else { 'dollars' }
        $cent_unit = if ($cents -eq 1) { 'cent' } else { 'cents' }
        return "$dollars $dollar_unit, $cents $cent_unit"
    }
    if ($dollars) {
        $dollar_unit = if ($dollars -eq 1) { 'dollar' } else { 'dollars' }
        return "$dollars $dollar_unit"
    }
    if ($cents) {
        $cent_unit = if ($cents -eq 1) { 'cent' } else { 'cents' }
        return "$cents $cent_unit"
    }
    return 'zero dollars'
}
# Match evaluator for ordinals such as "21st". $args[0] is the regex match;
# capture group 1 holds the digits. The number is spelled out with
# Get-NumberToWords, then the first entry of $ordinalSuffixes whose Suffix
# ends the words supplies the ordinal ending; with no such entry 'th' is
# appended.
$expandOrdinals = {
    $match = $args[0]
    $digits = $match.Groups[1].Value

    # Parse as decimal: [Convert]::ToInt32 threw on digit runs above Int32.
    $n = [decimal]0
    $parsed = [decimal]::TryParse(
        $digits,
        [System.Globalization.NumberStyles]::None,
        [System.Globalization.CultureInfo]::InvariantCulture,
        [ref]$n)
    if (-not $parsed) {
        # Too many digits even for decimal: leave the text as it was.
        return $match.Value
    }

    $num = Get-NumberToWords $n

    # A foreach statement (not ForEach-Object) so that 'return' really ends
    # the evaluator at the first matching suffix.
    foreach ($entry in $ordinalSuffixes) {
        $suffix = $entry.Suffix
        if ($num.EndsWith($suffix, [System.StringComparison]::Ordinal)) {
            return $num.Substring(0, $num.Length - $suffix.Length) + $entry.Replacement
        }
    }
    return $num + 'th'
}
# Match evaluator for a bare run of digits. $args[0] is the regex match; the
# whole match is parsed and spelled out with Get-NumberToWords.
$expandNumber = {
    $match = $args[0]
    $digits = $match.Groups[0].Value

    # Parse as decimal: [Convert]::ToInt32 threw on digit runs above Int32.
    $n = [decimal]0
    $parsed = [decimal]::TryParse(
        $digits,
        [System.Globalization.NumberStyles]::None,
        [System.Globalization.CultureInfo]::InvariantCulture,
        [ref]$n)
    if (-not $parsed) {
        # Too many digits even for decimal: keep the digits unchanged.
        return $digits
    }
    return Get-NumberToWords $n
}
# Runs every number-related rewrite over $text and returns the result.
# The passes run in a fixed order: commas are removed first, then pound and
# dollar amounts, then decimals, then ordinals, and finally bare digit runs.
function Get-NormalizedNumbers($text) {
    $passes = @(
        @{ Pattern = $commaNumberRegex; Evaluator = $removeCommas },
        @{ Pattern = $poundsRegex; Evaluator = $expandPounds },
        @{ Pattern = $dollarsRegex; Evaluator = $expandDollars },
        @{ Pattern = $decimalNumberRegex; Evaluator = $expandDecimalPoints },
        @{ Pattern = $ordinalRegex; Evaluator = $expandOrdinals },
        @{ Pattern = $numberRegex; Evaluator = $expandNumber }
    )
    foreach ($pass in $passes) {
        $text = $pass.Pattern.Replace($text, $pass.Evaluator)
    }
    return $text
}
# List of (regular expression, replacement) pairs for abbreviations.
# Each abbreviation is compiled into a case-insensitive pattern that matches
# it at a word boundary followed by a literal period.
$abbreviations = @(
    @(
        @('mrs', 'misess'),
        @('mr', 'mister'),
        @('dr', 'doctor'),
        @('st', 'saint'),
        @('co', 'company'),
        @('jr', 'junior'),
        @('maj', 'major'),
        @('gen', 'general'),
        @('drs', 'doctors'),
        @('rev', 'reverend'),
        @('lt', 'lieutenant'),
        @('hon', 'honorable'),
        @('sgt', 'sergeant'),
        @('capt', 'captain'),
        @('esq', 'esquire'),
        @('ltd', 'limited'),
        @('col', 'colonel'),
        @('ft', 'fort')
    ) | ForEach-Object {
        @{ Regex = [regex]"(?i)\b$($_[0])\."; Replacement = $_[1] }
    }
)
# Applies every entry of $abbreviations to $text in table order, replacing
# each match of the entry's Regex with its Replacement, and returns the result.
function Expand-Abbreviations($text) {
    foreach ($entry in $abbreviations) {
        $text = $entry.Regex.Replace($text, $entry.Replacement)
    }
    return $text
}
# Spells out the numbers in $text by delegating to Get-NormalizedNumbers.
#   $text - the string to process.
# The parameter is declared explicitly: previously the function took none and
# only worked when the caller happened to have a variable named $text in
# scope, so a direct call silently ignored its argument.
function Expand-Numbers($text) {
    return Get-NormalizedNumbers $text
}
# Returns $text converted to lowercase.
# Uses the invariant culture so the result does not depend on the session's
# culture (under a Turkish culture ToLower() maps 'I' to dotless 'ı').
function ConvertTo-Lowercase($text) {
    return $text.ToLowerInvariant()
}
# Trims $text and collapses every run of whitespace into a single space.
function Remove-Whitespace($text) {
    $trimmed = $text.Trim()
    return [regex]::Replace($trimmed, "\s+", ' ')
}
# Passes $text through ConvertFrom-Unicode while preserving pound signs:
# each '£' is swapped for a placeholder before the call and restored after.
function ConvertTo-Ascii($text) {
    $placeholder = "<<pound sign>>"
    $shielded = $text.Replace("£", $placeholder)
    $converted = ConvertFrom-Unicode $shielded
    return $converted.Replace($placeholder, "£")
}
# Deletes angle brackets, parentheses, square brackets and double quotes
# from $text and returns the result.
function Remove-AuxSymbols($text) {
    $auxSymbols = [regex]::new('[\<\>\(\)\[\]\"]+')
    return $auxSymbols.Replace($text, "")
}
# Rewrites punctuation in $text: ';' becomes ',', '-' and ':' become a space,
# and '&' becomes 'and'. Returns the rewritten string.
function Set-Symbols($text) {
    return $text.Replace(';', ',').Replace('-', ' ').Replace(':', ' ').Replace('&', 'and')
}
# Basic pipeline: lowercase the text, then collapse whitespace.
# No transliteration is performed.
function Invoke-BasicCleaners($text) {
    return Remove-Whitespace (ConvertTo-Lowercase $text)
}
# Pipeline for non-English text: transliterate to ASCII, lowercase, then
# collapse whitespace.
function Invoke-TransliterationCleaners($text) {
    $ascii = ConvertTo-Ascii $text
    $lowered = ConvertTo-Lowercase $ascii
    return Remove-Whitespace $lowered
}
# Pipeline for English text, including number and abbreviation expansion.
# Stages run in the listed order, each receiving the previous stage's output.
function Invoke-EnglishCleaners($text) {
    $stages = @(
        'ConvertTo-Ascii',
        'ConvertTo-Lowercase',
        'Expand-Numbers',
        'Expand-Abbreviations',
        'Set-Symbols',
        'Remove-AuxSymbols',
        'Remove-Whitespace'
    )
    foreach ($stage in $stages) {
        # $text is reassigned at every step so the in-scope variable always
        # holds the current value for any stage that reads it.
        $text = & $stage $text
    }
    return $text
}
# Pipeline for phonemes mode, including number and abbreviation expansion.
# Unlike Invoke-EnglishCleaners there is no lowercasing stage.
function Invoke-PhonemeCleaners($text) {
    $stages = @(
        'ConvertTo-Ascii',
        'Expand-Numbers',
        'Expand-Abbreviations',
        'Set-Symbols',
        'Remove-AuxSymbols',
        'Remove-Whitespace'
    )
    foreach ($stage in $stages) {
        # $text is reassigned at every step so the in-scope variable always
        # holds the current value for any stage that reads it.
        $text = & $stage $text
    }
    return $text
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment