Skip to content

Instantly share code, notes, and snippets.

@MattJeanes
Last active March 30, 2020 16:18
Show Gist options
  • Save MattJeanes/75e57b38c44a1260f1e692edd4c5e6f5 to your computer and use it in GitHub Desktop.
Save MattJeanes/75e57b38c44a1260f1e692edd4c5e6f5 to your computer and use it in GitHub Desktop.
# Compiled patterns consumed by Get-NormalizedNumbers.

# A run of digits and commas that starts and ends with a digit (three or more characters).
$commaNumberRegex = [regex]::new('([0-9][0-9\,]+[0-9])')

# Digits, a literal '.', then more digits.
$decimalNumberRegex = [regex]::new('([0-9]+\.[0-9]+)')

# A pound sign followed by digits/commas. Unlike $dollarsRegex, '.' is not in the
# character class, so a fractional part is never included in capture group 1.
$poundsRegex = [regex]::new('£([0-9\,]*[0-9]+)')

# A dollar sign followed by digits, commas or dots, ending in a digit.
$dollarsRegex = [regex]::new('\$([0-9\.\,]*[0-9]+)')

# Digits (group 1) immediately followed by an ordinal suffix (group 2).
$ordinalRegex = [regex]::new('([0-9]+)(st|nd|rd|th)')

# Any remaining run of digits.
$numberRegex = [regex]::new('[0-9]+')
# Make sure the PSUnidecode module is installed, then load it.
# NOTE(review): presumably this module supplies ConvertFrom-Unicode, which
# ConvertTo-Ascii calls - confirm.
if (!(Get-Module -Name "PSUnidecode" -ListAvailable)) {
    # Not found in any module path: install it without prompting.
    Install-Module "PSUnidecode" -Force
}

Import-Module "PSUnidecode"
# Spells out a non-negative whole number as space-separated English words.
#   $n           - the number to convert.
#   $digit_group - index into $digitGroups naming $n's lowest three digits
#                  (0 for the units group); each recursion on $n / 1000 passes +1.
# Returns the words joined by single spaces; empty table entries are skipped,
# so 0 yields an empty string.
# Reads the script-level tables $units, $tens and $digitGroups.
function Get-StandardNumberToWords($n, $digit_group) {
    $parts = @()

    if ($n -ge 1000) {
        # Format next higher digit group.
        # Cast to decimal BEFORE dividing: integer / integer falls back to double
        # when the quotient is not whole, and a double cannot represent every
        # integer above 2^53, which corrupted very large inputs.
        $higher = [math]::Floor([decimal]$n / 1000)
        $parts += (Get-StandardNumberToWords $higher ($digit_group + 1))
        $n = $n % 1000
    }

    if ($n -ge 100) {
        $parts += "$($units[[math]::Floor([decimal]$n / 100)]) hundred"
    }

    # $units covers every value below its length directly; anything larger is
    # split into a tens word and a units word.
    $belowHundred = $n % 100
    if ($belowHundred -ge $units.Count) {
        $parts += $tens[[math]::Floor([decimal]$belowHundred / 10)]
        $parts += $units[$belowHundred % 10]
    }
    else {
        $parts += $units[$belowHundred]
    }

    # Only name the group (thousand, million, ...) when it is non-zero.
    if ($n -gt 0) {
        $parts += $digitGroups[$digit_group]
    }

    $words = $parts | Where-Object { -not [string]::IsNullOrWhiteSpace($_) }
    return ($words -join ' ').Trim()
}
# Converts a non-negative whole number to English words.
#   $n - the number to convert.
# Returns:
#   - the digits themselves (as a string) when $n is 10^18 or more;
#   - 'zero' for 0;
#   - "<words> hundred" for multiples of 100 below 3000 that are not multiples
#     of 1000 (e.g. 1100 is worded from 11 plus ' hundred');
#   - otherwise whatever Get-StandardNumberToWords produces for ($n, 0).
function Get-NumberToWords($n) {
    # Handle special cases first, then go to the standard case:
    if ($n -ge 1000000000000000000) {
        # Too large, just return the digits. (Previously a leftover Python
        # call, str(n), which is not a PowerShell command.)
        return "$n"
    }

    if ($n -eq 0) {
        return 'zero'
    }

    if ($n % 100 -eq 0 -and $n % 1000 -ne 0 -and $n -lt 3000) {
        $hundreds = [math]::Floor([decimal]$n / 100)
        return (Get-StandardNumberToWords $hundreds 0) + ' hundred'
    }

    return Get-StandardNumberToWords $n 0
}
# Words for the values 0-19, indexed by value. Index 0 is the empty string.
$units = @(
    '',
    'one',
    'two',
    'three',
    'four',
    'five',
    'six',
    'seven',
    'eight',
    'nine',
    'ten',
    'eleven',
    'twelve',
    'thirteen',
    'fourteen',
    'fifteen',
    'sixteen',
    'seventeen',
    'eighteen',
    'nineteen'
)

# Words for multiples of ten, indexed by the tens digit. Index 0 is the empty string.
$tens = @(
    '', 'ten', 'twenty', 'thirty', 'forty',
    'fifty', 'sixty', 'seventy', 'eighty', 'ninety'
)

# Names of successive three-digit groups; index 0 (the lowest group) has no name.
$digitGroups = @('', 'thousand', 'million', 'billion', 'trillion', 'quadrillion')
# Irregular ordinal endings: when a spelled-out number ends in Suffix, that
# ending is swapped for Replacement (see $expandOrdinals). Entries are tried
# in the order listed here.
$ordinalSuffixes = @(
    @{ Suffix = 'one';    Replacement = 'first' }
    @{ Suffix = 'two';    Replacement = 'second' }
    @{ Suffix = 'three';  Replacement = 'third' }
    @{ Suffix = 'five';   Replacement = 'fifth' }
    @{ Suffix = 'eight';  Replacement = 'eighth' }
    @{ Suffix = 'nine';   Replacement = 'ninth' }
    @{ Suffix = 'twelve'; Replacement = 'twelfth' }
    @{ Suffix = 'ty';     Replacement = 'tieth' }
)
# Match evaluator: yields the matched text with every comma removed.
$removeCommas = {
    param($found)
    $found.Value -replace ',', ''
}
# Match evaluator: takes capture group 1 and puts the word ' point ' wherever
# a '.' appeared.
$expandDecimalPoints = {
    param($found)
    $number = $found.Groups[1].Value
    return ($number -split '\.') -join ' point '
}
# Match evaluator for pound amounts. Capture group 1 holds the amount text,
# which is split on '.' into pounds and pence.
# Returns, with the numbers still as digits:
#   "<p> pound(s), <q> penny/pence"  when both parts are non-zero,
#   "<p> pound(s)" or "<q> penny/pence" when only one is,
#   'zero pounds' when both are zero,
#   "<amount> pounds" when the amount contains more than one '.'.
$expandPounds = {
    $match = $args[0]
    $amount = $match.Groups[1].Value
    $parts = $amount.split('.')
    if ($parts.Count -gt 2) {
        # Unexpected format. (Previously `match + ' pounds'`, a leftover Python
        # expression in which `match` was parsed as a command name.)
        return $amount + ' pounds'
    }

    $pounds = if ($parts[0]) { [int]$parts[0] } else { 0 }
    $pence = if ($parts.Count -gt 1) { [int]$parts[1] } else { 0 }
    $pound_unit = if ($pounds -eq 1) { 'pound' } else { 'pounds' }
    $pence_unit = if ($pence -eq 1) { 'penny' } else { 'pence' }

    if ($pounds -and $pence) {
        return "$pounds $pound_unit, $pence $pence_unit"
    }
    if ($pounds) {
        return "$pounds $pound_unit"
    }
    if ($pence) {
        return "$pence $pence_unit"
    }
    return 'zero pounds'
}
# Match evaluator for dollar amounts. Capture group 1 holds the amount text,
# which is split on '.' into dollars and cents.
# Returns, with the numbers still as digits:
#   "<d> dollar(s), <c> cent(s)"  when both parts are non-zero,
#   "<d> dollar(s)" or "<c> cent(s)" when only one is,
#   'zero dollars' when both are zero,
#   "<amount> dollars" when the amount contains more than one '.'.
$expandDollars = {
    $match = $args[0]
    $amount = $match.Groups[1].Value
    $parts = $amount.split('.')
    if ($parts.Count -gt 2) {
        # Unexpected format. (Previously `match + ' dollars'`, a leftover Python
        # expression in which `match` was parsed as a command name.)
        return $amount + ' dollars'
    }

    $dollars = if ($parts[0]) { [int]$parts[0] } else { 0 }
    $cents = if ($parts.Count -gt 1) { [int]$parts[1] } else { 0 }
    $dollar_unit = if ($dollars -eq 1) { 'dollar' } else { 'dollars' }
    $cent_unit = if ($cents -eq 1) { 'cent' } else { 'cents' }

    if ($dollars -and $cents) {
        return "$dollars $dollar_unit, $cents $cent_unit"
    }
    if ($dollars) {
        return "$dollars $dollar_unit"
    }
    if ($cents) {
        return "$cents $cent_unit"
    }
    return 'zero dollars'
}
# Match evaluator for ordinals such as "21st". Capture group 1 holds the digits.
# The number is spelled out with Get-NumberToWords; if the result ends in one
# of the $ordinalSuffixes entries (checked in table order) that ending is
# replaced, otherwise 'th' is appended.
# Digit runs too long for a 64-bit integer are returned unchanged instead of
# raising an overflow error.
$expandOrdinals = {
    $match = $args[0]
    $digits = $match.Groups[1].Value

    $n = [long]0
    if (-not [long]::TryParse($digits, [ref]$n)) {
        return $match.Value
    }

    $num = Get-NumberToWords $n

    # A foreach statement (rather than ForEach-Object) so that `return` really
    # leaves the evaluator on the first matching suffix.
    foreach ($entry in $ordinalSuffixes) {
        $suffix = $entry.Suffix
        if ($num.EndsWith($suffix)) {
            return $num.Substring(0, $num.Length - $suffix.Length) + $entry.Replacement
        }
    }

    return $num + 'th'
}
# Match evaluator for a bare run of digits: returns the number spelled out by
# Get-NumberToWords.
# The digits are parsed as a 64-bit integer so values above 2,147,483,647 no
# longer raise an overflow error; runs too long even for that are returned
# unchanged.
$expandNumber = {
    $match = $args[0]
    $digits = $match.Groups[0].Value

    $n = [long]0
    if (-not [long]::TryParse($digits, [ref]$n)) {
        return $digits
    }

    return Get-NumberToWords $n
}
# Rewrites the numeric expressions in $text by running six regex passes in a
# fixed order: comma removal, pounds, dollars, decimals, ordinals, and finally
# any remaining bare numbers. Each pass feeds its output to the next.
#   $text - the string to process.
# Returns the rewritten string.
function Get-NormalizedNumbers($text) {
    $passes = @(
        @{ Pattern = $commaNumberRegex;   Evaluator = $removeCommas }
        @{ Pattern = $poundsRegex;        Evaluator = $expandPounds }
        @{ Pattern = $dollarsRegex;       Evaluator = $expandDollars }
        @{ Pattern = $decimalNumberRegex; Evaluator = $expandDecimalPoints }
        @{ Pattern = $ordinalRegex;       Evaluator = $expandOrdinals }
        @{ Pattern = $numberRegex;        Evaluator = $expandNumber }
    )

    foreach ($pass in $passes) {
        # The script block is used as a per-match callback.
        $callback = [System.Text.RegularExpressions.MatchEvaluator]$pass.Evaluator
        $text = $pass.Pattern.Replace($text, $callback)
    }

    return $text
}
# Abbreviation table. Each (abbreviation, expansion) pair below becomes a
# hashtable with two keys:
#   Regex       - a case-insensitive [regex] matching the abbreviation at a
#                 word boundary, followed by a literal '.'
#   Replacement - the text substituted for the whole match
$abbreviations = @(
    @('mrs', 'misess'),
    @('mr', 'mister'),
    @('dr', 'doctor'),
    @('st', 'saint'),
    @('co', 'company'),
    @('jr', 'junior'),
    @('maj', 'major'),
    @('gen', 'general'),
    @('drs', 'doctors'),
    @('rev', 'reverend'),
    @('lt', 'lieutenant'),
    @('hon', 'honorable'),
    @('sgt', 'sergeant'),
    @('capt', 'captain'),
    @('esq', 'esquire'),
    @('ltd', 'limited'),
    @('col', 'colonel'),
    @('ft', 'fort')
) | ForEach-Object {
    @{
        Regex       = [regex]"(?i)\b$($_[0])\."
        Replacement = $_[1]
    }
}
# Applies every entry of $abbreviations to $text in table order, replacing
# each match of the entry's Regex with its Replacement.
#   $text - the string to process.
# Returns the string after all replacements.
function Expand-Abbreviations($text) {
    foreach ($entry in $abbreviations) {
        $text = $entry.Regex.Replace($text, $entry.Replacement)
    }
    return $text
}
# Expands the numeric expressions in $text by delegating to Get-NormalizedNumbers.
#   $text - the string to process.
# Returns whatever Get-NormalizedNumbers returns.
# $text is declared as a parameter; without the declaration the argument was
# ignored and $text was read from the caller's scope instead.
function Expand-Numbers($text) {
    return Get-NormalizedNumbers $text
}
# Lower-cases $text.
#   $text - the string to convert.
# Returns the lower-cased string.
# Uses the invariant culture so the result does not depend on the session's
# locale (under a Turkish culture, ToLower() maps 'I' to a dotless 'ı').
function ConvertTo-Lowercase($text) {
    return $text.ToLowerInvariant()
}
# Trims $text and collapses every run of whitespace into a single space.
#   $text - the string to clean; $null is treated as an empty string.
# Returns the cleaned string.
function Remove-Whitespace($text) {
    # The [string] cast turns $null into '' so Trim() cannot fail on a null value.
    $trimmed = ([string]$text).Trim()
    return [regex]::Replace($trimmed, '\s+', ' ')
}
# Passes $text through ConvertFrom-Unicode while keeping pound signs intact:
# each "£" is swapped for a placeholder before the call and restored afterwards.
#   $text - the string to convert.
# Returns the converted string.
function ConvertTo-Ascii($text) {
    $placeholder = "<<pound sign>>"
    $shielded = $text.Replace("£", $placeholder)
    $converted = ConvertFrom-Unicode $shielded
    return $converted.Replace($placeholder, "£")
}
# Deletes angle brackets, parentheses, square brackets and double quotes from $text.
#   $text - the string to clean.
# Returns the string without those characters.
function Remove-AuxSymbols($text) {
    return [regex]::Replace($text, '[\<\>\(\)\[\]\"]+', "")
}
# Replaces a few punctuation characters in $text:
#   ';' becomes ',', '-' and ':' become a space, '&' becomes 'and'.
#   $text - the string to process.
# Returns the string after all four substitutions.
function Set-Symbols($text) {
    $substitutions = @(
        @(';', ','),
        @('-', ' '),
        @(':', ' '),
        @('&', 'and')
    )

    foreach ($pair in $substitutions) {
        $text = $text.Replace($pair[0], $pair[1])
    }

    return $text
}
# Basic pipeline: lower-cases $text, then collapses whitespace. No transliteration.
#   $text - the string to clean.
# Returns the cleaned string.
function Invoke-BasicCleaners($text) {
    $lowered = ConvertTo-Lowercase $text
    return Remove-Whitespace $lowered
}
# Pipeline for non-English text: transliterates to ASCII, lower-cases, then
# collapses whitespace.
#   $text - the string to clean.
# Returns the cleaned string.
function Invoke-TransliterationCleaners($text) {
    foreach ($stage in 'ConvertTo-Ascii', 'ConvertTo-Lowercase', 'Remove-Whitespace') {
        $text = & $stage $text
    }
    return $text
}
# Pipeline for English text, including number and abbreviation expansion.
# The stages run in the order listed, each receiving the previous stage's output.
#   $text - the string to clean.
# Returns the cleaned string.
function Invoke-EnglishCleaners($text) {
    $stages = @(
        'ConvertTo-Ascii'
        'ConvertTo-Lowercase'
        'Expand-Numbers'
        'Expand-Abbreviations'
        'Set-Symbols'
        'Remove-AuxSymbols'
        'Remove-Whitespace'
    )

    foreach ($stage in $stages) {
        $text = & $stage $text
    }

    return $text
}
# Pipeline for phonemes mode, including number and abbreviation expansion.
# It runs no lower-casing stage. The stages run in the order listed, each
# receiving the previous stage's output.
#   $text - the string to clean.
# Returns the cleaned string.
function Invoke-PhonemeCleaners($text) {
    $stages = @(
        'ConvertTo-Ascii'
        'Expand-Numbers'
        'Expand-Abbreviations'
        'Set-Symbols'
        'Remove-AuxSymbols'
        'Remove-Whitespace'
    )

    foreach ($stage in $stages) {
        $text = & $stage $text
    }

    return $text
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment