MattJeanes · March 30, 2020 16:18
diff --git a/normalize.ps1 b/normalize.ps1
 # Patterns used by the number-normalization pass. Each one is built once here
 # and reused on every call.

 # Digit run containing thousands separators, e.g. "1,000,000".
 $commaNumberRegex = [regex]'([0-9][0-9\,]+[0-9])'
 # Decimal number such as "3.14"; group 1 is the whole number.
 $decimalNumberRegex = [regex]'([0-9]+\.[0-9]+)'
 # Pound amount; group 1 is the amount without the sign. The character class
 # accepts "." (as the dollar pattern does) so that the pence part of "£1.50"
 # reaches $expandPounds, which splits the amount on ".".
 $poundsRegex = [regex]'£([0-9\.\,]*[0-9]+)'
 # Dollar amount; group 1 is the amount without the sign.
 $dollarsRegex = [regex]'\$([0-9\.\,]*[0-9]+)'
 # Ordinal such as "21st"; group 1 is the digits, group 2 the suffix.
 $ordinalRegex = [regex]'([0-9]+)(st|nd|rd|th)'
 # Any remaining run of digits.
 $numberRegex = [regex]'[0-9]+'

 # Install PSUnidecode when no copy is available on this machine, then load it.
 if (!(Get-Module -Name "PSUnidecode" -ListAvailable)) {
     Install-Module -Name "PSUnidecode" -Force
 }

 Import-Module -Name "PSUnidecode"

 # Spells out a number using the $units, $tens and $digitGroups tables.
 #   $n           - number to spell out.
 #   $digit_group - index into $digitGroups naming the scale of the lowest
 #                  three digits of $n (0 = none, 1 = thousand, ...).
 # Returns the words joined by single spaces; an empty string when $n is 0.
 function Get-StandardNumberToWords($n, $digit_group) {
     $words = @()
     if ($n -ge 1000) {
         # Everything above the lowest three digits belongs to the next scale up.
         $higher = [math]::Floor([decimal]($n / 1000))
         $words += (Get-StandardNumberToWords $higher ($digit_group + 1))
         $n = $n % 1000
     }
     if ($n -ge 100) {
         $hundreds = [math]::Floor([decimal]($n / 100))
         $words += ("$($units[$hundreds]) hundred")
     }
     $belowHundred = $n % 100
     if ($belowHundred -lt $units.Count) {
         # Small values have their own entry in $units (index 0 is empty).
         $words += ($units[$belowHundred])
     }
     else {
         $words += ($tens[[math]::Floor([decimal]($belowHundred / 10))])
         $words += ($units[$belowHundred % 10])
     }
     if ($n -gt 0) {
         # Only name the scale when this three-digit group is non-zero.
         $words += ($digitGroups[$digit_group])
     }
     # Drop empty entries and join what is left with single spaces.
     $kept = $words | Where-Object { -not [string]::IsNullOrWhiteSpace($_) }
     return ($kept -join ' ').Trim()
 }


 # Converts a number to English words, handling the special cases before
 # delegating to Get-StandardNumberToWords.
 #   $n - number to convert.
 # Returns 'zero' for 0, the plain digits for values of 10^18 and above, and
 # the spelled-out number otherwise.
 function Get-NumberToWords($n) {
     if ($n -ge 1000000000000000000) {
         # Too large to name: hand the digits back as a string.
         return "$n"
     }
     if ($n -eq 0) {
         return 'zero'
     }
     # Exact hundreds below 3000 that are not exact thousands are read as
     # "N hundred" (1100 -> "eleven hundred").
     if ($n % 100 -eq 0 -and $n % 1000 -ne 0 -and $n -lt 3000) {
         return (Get-StandardNumberToWords ([math]::Floor([decimal]($n / 100))) 0) + ' hundred'
     }
     return Get-StandardNumberToWords $n 0
 }

 # Words for the values 0-19; index 0 is empty so a zero part adds nothing.
 $units = @(
     '',
     'one',
     'two',
     'three',
     'four',
     'five',
     'six',
     'seven',
     'eight',
     'nine',
     'ten',
     'eleven',
     'twelve',
     'thirteen',
     'fourteen',
     'fifteen',
     'sixteen',
     'seventeen',
     'eighteen',
     'nineteen'
 )

 # Words for the multiples of ten, indexed by the tens digit.
 $tens = @('', 'ten', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety')

 # Scale names for successive groups of three digits, lowest group first.
 $digitGroups = @('', 'thousand', 'million', 'billion', 'trillion', 'quadrillion')

 # Cardinal word endings and the ordinal endings that replace them.
 $ordinalSuffixes = @(
     @{ Suffix = 'one'; Replacement = 'first' }
     @{ Suffix = 'two'; Replacement = 'second' }
     @{ Suffix = 'three'; Replacement = 'third' }
     @{ Suffix = 'five'; Replacement = 'fifth' }
     @{ Suffix = 'eight'; Replacement = 'eighth' }
     @{ Suffix = 'nine'; Replacement = 'ninth' }
     @{ Suffix = 'twelve'; Replacement = 'twelfth' }
     @{ Suffix = 'ty'; Replacement = 'tieth' }
 )

 # Match evaluator: returns the matched text with every comma removed.
 $removeCommas = {
     $hit = $args[0]
     $withCommas = $hit.Value
     return $withCommas.Replace(',', '')
 }

 # Match evaluator: rewrites the "." in capture group 1 as the word "point".
 $expandDecimalPoints = {
     $hit = $args[0]
     $number = $hit.Groups[1].Value
     return $number.Replace('.', ' point ')
 }


 # Match evaluator for pound amounts. Capture group 1 holds the amount without
 # the "£" sign; it is rewritten as "<pounds> pound(s), <pence> pence". The
 # numbers are left as digits in the returned text.
 $expandPounds = {
     $match = $args[0]
     $amount = $match.Groups[1].Value
     $parts = $amount.Split('.')
     if ($parts.Count -gt 2) {
         # Unexpected format (more than one "."): keep the amount as written.
         return $amount + ' pounds'
     }
     # An empty part before the "." (as in ".50") counts as zero pounds.
     $pounds = if ($parts[0]) { [int]$parts[0] } else { 0 }
     $pence = if ($parts.Count -gt 1) { [int]$parts[1] } else { 0 }
     $pound_unit = if ($pounds -eq 1) { 'pound' } else { 'pounds' }
     $pence_unit = if ($pence -eq 1) { 'penny' } else { 'pence' }
     if ($pounds -and $pence) {
         return "$pounds $pound_unit, $pence $pence_unit"
     }
     if ($pounds) {
         return "$pounds $pound_unit"
     }
     if ($pence) {
         return "$pence $pence_unit"
     }
     return 'zero pounds'
 }


 # Match evaluator for dollar amounts. Capture group 1 holds the amount without
 # the "$" sign; it is rewritten as "<dollars> dollar(s), <cents> cent(s)". The
 # numbers are left as digits in the returned text.
 $expandDollars = {
     $match = $args[0]
     $amount = $match.Groups[1].Value
     $parts = $amount.Split('.')
     if ($parts.Count -gt 2) {
         # Unexpected format (more than one "."): keep the amount as written.
         return $amount + ' dollars'
     }
     # An empty part before the "." (as in ".50") counts as zero dollars.
     $dollars = if ($parts[0]) { [int]$parts[0] } else { 0 }
     $cents = if ($parts.Count -gt 1) { [int]$parts[1] } else { 0 }
     $dollar_unit = if ($dollars -eq 1) { 'dollar' } else { 'dollars' }
     $cent_unit = if ($cents -eq 1) { 'cent' } else { 'cents' }
     if ($dollars -and $cents) {
         return "$dollars $dollar_unit, $cents $cent_unit"
     }
     if ($dollars) {
         return "$dollars $dollar_unit"
     }
     if ($cents) {
         return "$cents $cent_unit"
     }
     return 'zero dollars'
 }

 # Match evaluator for ordinals such as "21st". Capture group 1 holds the
 # digits; they are spelled out with Get-NumberToWords and the ending of the
 # last word is swapped for its ordinal form using $ordinalSuffixes.
 $expandOrdinals = {
     $match = $args[0]
     $digits = $match.Groups[1].Value
     # Parse as decimal so digit runs beyond the Int32 range do not throw.
     $n = [decimal]0
     if (-not ([decimal]::TryParse($digits, [ref]$n)) -or $n -ge 1000000000000000000) {
         # Not parseable or too large to name: leave the text as written.
         return $match.Value
     }
     $num = Get-NumberToWords $n
     foreach ($entry in $ordinalSuffixes) {
         if ($num.EndsWith($entry.Suffix)) {
             return $num.Substring(0, $num.Length - $entry.Suffix.Length) + $entry.Replacement
         }
     }
     # No special ending matched: the regular ordinal just appends "th".
     return $num + 'th'
 }

 # Match evaluator for plain digit runs: spells the number out with
 # Get-NumberToWords.
 $expandNumber = {
     $match = $args[0]
     $digits = $match.Groups[0].Value
     # Parse as decimal so digit runs beyond the Int32 range do not throw.
     $n = [decimal]0
     if (-not ([decimal]::TryParse($digits, [ref]$n)) -or $n -ge 1000000000000000000) {
         # Not parseable or too large to name: keep the digits.
         return $digits
     }
     return Get-NumberToWords $n
 }


 # Rewrites the numbers in $text by running each number pattern with its match
 # evaluator, one after another.
 #   $text - string to process.
 # Returns the rewritten string.
 function Get-NormalizedNumbers($text) {
     # The order is significant: each pass works on the output of the one
     # before it (commas first, plain digit runs last).
     $passes = @(
         @{ Pattern = $commaNumberRegex; Handler = $removeCommas },
         @{ Pattern = $poundsRegex; Handler = $expandPounds },
         @{ Pattern = $dollarsRegex; Handler = $expandDollars },
         @{ Pattern = $decimalNumberRegex; Handler = $expandDecimalPoints },
         @{ Pattern = $ordinalRegex; Handler = $expandOrdinals },
         @{ Pattern = $numberRegex; Handler = $expandNumber }
     )
     foreach ($pass in $passes) {
         $text = $pass.Pattern.Replace($text, $pass.Handler)
     }
     return $text
 }

 # Abbreviations (written without the trailing period) and their expansions.
 # While the table is built, each Regex string is replaced by a compiled
 # pattern that matches the abbreviation case-insensitively, at a word
 # boundary, followed by a literal period.
 $abbreviations = @(
     @{Regex = 'mrs'; Replacement = 'misess' },
     @{Regex = 'mr'; Replacement = 'mister' },
     @{Regex = 'dr'; Replacement = 'doctor' },
     @{Regex = 'st'; Replacement = 'saint' },
     @{Regex = 'co'; Replacement = 'company' },
     @{Regex = 'jr'; Replacement = 'junior' },
     @{Regex = 'maj'; Replacement = 'major' },
     @{Regex = 'gen'; Replacement = 'general' },
     @{Regex = 'drs'; Replacement = 'doctors' },
     @{Regex = 'rev'; Replacement = 'reverend' },
     @{Regex = 'lt'; Replacement = 'lieutenant' },
     @{Regex = 'hon'; Replacement = 'honorable' },
     @{Regex = 'sgt'; Replacement = 'sergeant' },
     @{Regex = 'capt'; Replacement = 'captain' },
     @{Regex = 'esq'; Replacement = 'esquire' },
     @{Regex = 'ltd'; Replacement = 'limited' },
     @{Regex = 'col'; Replacement = 'colonel' },
     @{Regex = 'ft'; Replacement = 'fort' }
 ) | ForEach-Object {
     $_.Regex = [regex]"(?i)\b$($_.Regex)\."
     # Re-emit the updated entry so it ends up in $abbreviations.
     $_
 }

 # Replaces every abbreviation listed in $abbreviations with its expansion.
 #   $text - string to process.
 # Returns the rewritten string; entries are applied in table order.
 function Expand-Abbreviations($text) {
     foreach ($entry in $abbreviations) {
         $text = $entry.Regex.Replace($text, $entry.Replacement)
     }
     return $text
 }

 # Spells out the numbers in $text by delegating to Get-NormalizedNumbers.
 #   $text - string to process. Declared explicitly so the function works on
 #           the value it is given rather than on whatever $text happens to
 #           be visible from the caller's scope.
 # Returns the rewritten string.
 function Expand-Numbers($text) {
     return Get-NormalizedNumbers $text
 }


 # Returns $text converted to lower case.
 function ConvertTo-Lowercase($text) {
     $lowered = $text.ToLower()
     return $lowered
 }


 # Trims $text and collapses every run of whitespace inside it to one space.
 #   $text - string to process.
 # Returns the cleaned string.
 function Remove-Whitespace($text) {
     $trimmed = $text.Trim()
     # Return the value directly instead of an assignment expression.
     return [regex]::Replace($trimmed, "\s+", ' ')
 }


 # Passes $text through ConvertFrom-Unicode while keeping "£" intact: the sign
 # is swapped for a placeholder before the call and restored afterwards.
 function ConvertTo-Ascii($text) {
     $placeholder = "<<pound sign>>"
     $shielded = $text.Replace("£", $placeholder)
     $converted = ConvertFrom-Unicode $shielded
     return $converted.Replace($placeholder, "£")
 }


 # Deletes angle brackets, parentheses, square brackets and double quotes
 # from $text and returns the result.
 function Remove-AuxSymbols($text) {
     return [regex]::Replace($text, '[\<\>\(\)\[\]\"]+', "")
 }


 # Rewrites punctuation in $text: ";" becomes ",", "-" and ":" become a space,
 # and "&" becomes "and". Returns the rewritten string.
 function Set-Symbols($text) {
     return $text.replace(';', ',').replace('-', ' ').replace(':', ' ').replace('&', 'and')
 }

 # Basic pipeline: lower-cases $text, then collapses its whitespace. No
 # transliteration is performed.
 function Invoke-BasicCleaners($text) {
     return Remove-Whitespace (ConvertTo-Lowercase $text)
 }

 # Pipeline for non-English text: transliterates $text to ASCII, lower-cases
 # it, then collapses its whitespace.
 function Invoke-TransliterationCleaners($text) {
     $ascii = ConvertTo-Ascii $text
     $lowered = ConvertTo-Lowercase $ascii
     return Remove-Whitespace $lowered
 }


 # Pipeline for English text, including number and abbreviation expansion.
 # Each step receives the output of the previous one.
 function Invoke-EnglishCleaners($text) {
     $steps = @(
         'ConvertTo-Ascii',
         'ConvertTo-Lowercase',
         'Expand-Numbers',
         'Expand-Abbreviations',
         'Set-Symbols',
         'Remove-AuxSymbols',
         'Remove-Whitespace'
     )
     foreach ($step in $steps) {
         $text = & $step $text
     }
     return $text
 }


 # Pipeline for phonemes mode, including number and abbreviation expansion.
 # Same steps as the English pipeline except that the text is not lower-cased.
 function Invoke-PhonemeCleaners($text) {
     $steps = @(
         'ConvertTo-Ascii',
         'Expand-Numbers',
         'Expand-Abbreviations',
         'Set-Symbols',
         'Remove-AuxSymbols',
         'Remove-Whitespace'
     )
     foreach ($step in $steps) {
         $text = & $step $text
     }
     return $text
 }
	# Patterns used by the number-normalization pass. Each one is built once here
	# and reused on every call.

	# Digit run containing thousands separators, e.g. "1,000,000".
	$commaNumberRegex = [regex]'([0-9][0-9\,]+[0-9])'
	# Decimal number such as "3.14"; group 1 is the whole number.
	$decimalNumberRegex = [regex]'([0-9]+\.[0-9]+)'
	# Pound amount; group 1 is the amount without the sign. The character class
	# accepts "." (as the dollar pattern does) so that the pence part of "£1.50"
	# reaches $expandPounds, which splits the amount on ".".
	$poundsRegex = [regex]'£([0-9\.\,]*[0-9]+)'
	# Dollar amount; group 1 is the amount without the sign.
	$dollarsRegex = [regex]'\$([0-9\.\,]*[0-9]+)'
	# Ordinal such as "21st"; group 1 is the digits, group 2 the suffix. The
	# alternation uses plain "|" so each suffix is a separate alternative.
	$ordinalRegex = [regex]'([0-9]+)(st|nd|rd|th)'
	# Any remaining run of digits.
	$numberRegex = [regex]'[0-9]+'

	# Install PSUnidecode when no copy is available on this machine, then load it.
	if (!(Get-Module -Name "PSUnidecode" -ListAvailable)) {
	    Install-Module -Name "PSUnidecode" -Force
	}

	Import-Module -Name "PSUnidecode"

	# Spells out a number using the $units, $tens and $digitGroups tables.
	#   $n           - number to spell out.
	#   $digit_group - index into $digitGroups naming the scale of the lowest
	#                  three digits of $n (0 = none, 1 = thousand, ...).
	# Returns the words joined by single spaces; an empty string when $n is 0.
	function Get-StandardNumberToWords($n, $digit_group) {
	    $parts = @()
	    if ($n -ge 1000) {
	        # Format next higher digit group.
	        $parts += (Get-StandardNumberToWords ([math]::Floor([decimal]($n / 1000))) ($digit_group + 1))
	        $n = $n % 1000
	    }
	    if ($n -ge 100) {
	        $parts += ("$($units[[math]::Floor([decimal]($n / 100))]) hundred")
	    }
	    if (($n % 100) -ge $units.Count) {
	        $parts += ($tens[[math]::Floor([decimal](($n % 100) / 10))])
	        $parts += ($units[($n % 100) % 10])
	    }
	    else {
	        # Small values have their own entry in $units (index 0 is empty).
	        $parts += ($units[$n % 100])
	    }
	    if ($n -gt 0) {
	        # Only name the scale when this three-digit group is non-zero.
	        $parts += ($digitGroups[$digit_group])
	    }
	    # Drop empty entries and join what is left with single spaces.
	    $kept = $parts | Where-Object { -not [string]::IsNullOrWhiteSpace($_) }
	    return ($kept -join ' ').Trim()
	}


	# Converts a number to English words, handling the special cases before
	# delegating to Get-StandardNumberToWords.
	#   $n - number to convert.
	# Returns 'zero' for 0, the plain digits for values of 10^18 and above, and
	# the spelled-out number otherwise.
	function Get-NumberToWords($n) {
	    if ($n -ge 1000000000000000000) {
	        # Too large to name: hand the digits back as a string.
	        return "$n"
	    }
	    if ($n -eq 0) {
	        return 'zero'
	    }
	    # Exact hundreds below 3000 that are not exact thousands are read as
	    # "N hundred" (1100 -> "eleven hundred").
	    if ($n % 100 -eq 0 -and $n % 1000 -ne 0 -and $n -lt 3000) {
	        return (Get-StandardNumberToWords ([math]::Floor([decimal]($n / 100))) 0) + ' hundred'
	    }
	    return Get-StandardNumberToWords $n 0
	}

	# Words for the values 0-19; index 0 is empty so a zero part adds nothing.
	$units = @(
	    '',
	    'one',
	    'two',
	    'three',
	    'four',
	    'five',
	    'six',
	    'seven',
	    'eight',
	    'nine',
	    'ten',
	    'eleven',
	    'twelve',
	    'thirteen',
	    'fourteen',
	    'fifteen',
	    'sixteen',
	    'seventeen',
	    'eighteen',
	    'nineteen'
	)

	# Words for the multiples of ten, indexed by the tens digit.
	$tens = @('', 'ten', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety')

	# Scale names for successive groups of three digits, lowest group first.
	$digitGroups = @('', 'thousand', 'million', 'billion', 'trillion', 'quadrillion')

	# Cardinal word endings and the ordinal endings that replace them.
	$ordinalSuffixes = @(
	    @{ Suffix = 'one'; Replacement = 'first' }
	    @{ Suffix = 'two'; Replacement = 'second' }
	    @{ Suffix = 'three'; Replacement = 'third' }
	    @{ Suffix = 'five'; Replacement = 'fifth' }
	    @{ Suffix = 'eight'; Replacement = 'eighth' }
	    @{ Suffix = 'nine'; Replacement = 'ninth' }
	    @{ Suffix = 'twelve'; Replacement = 'twelfth' }
	    @{ Suffix = 'ty'; Replacement = 'tieth' }
	)

	# Match evaluator: returns the matched text with every comma removed.
	$removeCommas = {
	    $hit = $args[0]
	    $withCommas = $hit.Value
	    return $withCommas.Replace(',', '')
	}

	# Match evaluator: rewrites the "." in capture group 1 as the word "point".
	$expandDecimalPoints = {
	    $hit = $args[0]
	    $number = $hit.Groups[1].Value
	    return $number.Replace('.', ' point ')
	}


	# Match evaluator for pound amounts. Capture group 1 holds the amount without
	# the "£" sign; it is rewritten as "<pounds> pound(s), <pence> pence". The
	# numbers are left as digits in the returned text.
	$expandPounds = {
	    $match = $args[0]
	    $amount = $match.Groups[1].Value
	    $parts = $amount.Split('.')
	    if ($parts.Count -gt 2) {
	        # Unexpected format (more than one "."): keep the amount as written.
	        return $amount + ' pounds'
	    }
	    # An empty part before the "." (as in ".50") counts as zero pounds.
	    $pounds = if ($parts[0]) { [int]$parts[0] } else { 0 }
	    $pence = if ($parts.Count -gt 1) { [int]$parts[1] } else { 0 }
	    $pound_unit = if ($pounds -eq 1) { 'pound' } else { 'pounds' }
	    $pence_unit = if ($pence -eq 1) { 'penny' } else { 'pence' }
	    if ($pounds -and $pence) {
	        return "$pounds $pound_unit, $pence $pence_unit"
	    }
	    if ($pounds) {
	        return "$pounds $pound_unit"
	    }
	    if ($pence) {
	        return "$pence $pence_unit"
	    }
	    return 'zero pounds'
	}


	# Match evaluator for dollar amounts. Capture group 1 holds the amount without
	# the "$" sign; it is rewritten as "<dollars> dollar(s), <cents> cent(s)". The
	# numbers are left as digits in the returned text.
	$expandDollars = {
	    $match = $args[0]
	    $amount = $match.Groups[1].Value
	    $parts = $amount.Split('.')
	    if ($parts.Count -gt 2) {
	        # Unexpected format (more than one "."): keep the amount as written.
	        return $amount + ' dollars'
	    }
	    # An empty part before the "." (as in ".50") counts as zero dollars.
	    $dollars = if ($parts[0]) { [int]$parts[0] } else { 0 }
	    $cents = if ($parts.Count -gt 1) { [int]$parts[1] } else { 0 }
	    $dollar_unit = if ($dollars -eq 1) { 'dollar' } else { 'dollars' }
	    $cent_unit = if ($cents -eq 1) { 'cent' } else { 'cents' }
	    if ($dollars -and $cents) {
	        return "$dollars $dollar_unit, $cents $cent_unit"
	    }
	    if ($dollars) {
	        return "$dollars $dollar_unit"
	    }
	    if ($cents) {
	        return "$cents $cent_unit"
	    }
	    return 'zero dollars'
	}

	# Match evaluator for ordinals such as "21st". Capture group 1 holds the
	# digits; they are spelled out with Get-NumberToWords and the ending of the
	# last word is swapped for its ordinal form using $ordinalSuffixes.
	$expandOrdinals = {
	    $match = $args[0]
	    $digits = $match.Groups[1].Value
	    # Parse as decimal so digit runs beyond the Int32 range do not throw.
	    $n = [decimal]0
	    if (-not ([decimal]::TryParse($digits, [ref]$n)) -or $n -ge 1000000000000000000) {
	        # Not parseable or too large to name: leave the text as written.
	        return $match.Value
	    }
	    $num = Get-NumberToWords $n
	    foreach ($entry in $ordinalSuffixes) {
	        if ($num.EndsWith($entry.Suffix)) {
	            return $num.Substring(0, $num.Length - $entry.Suffix.Length) + $entry.Replacement
	        }
	    }
	    # No special ending matched: the regular ordinal just appends "th".
	    return $num + 'th'
	}

	# Match evaluator for plain digit runs: spells the number out with
	# Get-NumberToWords.
	$expandNumber = {
	    $match = $args[0]
	    $digits = $match.Groups[0].Value
	    # Parse as decimal so digit runs beyond the Int32 range do not throw.
	    $n = [decimal]0
	    if (-not ([decimal]::TryParse($digits, [ref]$n)) -or $n -ge 1000000000000000000) {
	        # Not parseable or too large to name: keep the digits.
	        return $digits
	    }
	    return Get-NumberToWords $n
	}


	# Rewrites the numbers in $text by running each number pattern with its match
	# evaluator, one after another.
	#   $text - string to process.
	# Returns the rewritten string.
	function Get-NormalizedNumbers($text) {
	    # The order is significant: each pass works on the output of the one
	    # before it (commas first, plain digit runs last).
	    $passes = @(
	        @{ Pattern = $commaNumberRegex; Handler = $removeCommas },
	        @{ Pattern = $poundsRegex; Handler = $expandPounds },
	        @{ Pattern = $dollarsRegex; Handler = $expandDollars },
	        @{ Pattern = $decimalNumberRegex; Handler = $expandDecimalPoints },
	        @{ Pattern = $ordinalRegex; Handler = $expandOrdinals },
	        @{ Pattern = $numberRegex; Handler = $expandNumber }
	    )
	    foreach ($pass in $passes) {
	        $text = $pass.Pattern.Replace($text, $pass.Handler)
	    }
	    return $text
	}

	# Abbreviations (written without the trailing period) and their expansions.
	# While the table is built, each Regex string is replaced by a compiled
	# pattern that matches the abbreviation case-insensitively, at a word
	# boundary, followed by a literal period.
	$abbreviations = @(
	    @{Regex = 'mrs'; Replacement = 'misess' },
	    @{Regex = 'mr'; Replacement = 'mister' },
	    @{Regex = 'dr'; Replacement = 'doctor' },
	    @{Regex = 'st'; Replacement = 'saint' },
	    @{Regex = 'co'; Replacement = 'company' },
	    @{Regex = 'jr'; Replacement = 'junior' },
	    @{Regex = 'maj'; Replacement = 'major' },
	    @{Regex = 'gen'; Replacement = 'general' },
	    @{Regex = 'drs'; Replacement = 'doctors' },
	    @{Regex = 'rev'; Replacement = 'reverend' },
	    @{Regex = 'lt'; Replacement = 'lieutenant' },
	    @{Regex = 'hon'; Replacement = 'honorable' },
	    @{Regex = 'sgt'; Replacement = 'sergeant' },
	    @{Regex = 'capt'; Replacement = 'captain' },
	    @{Regex = 'esq'; Replacement = 'esquire' },
	    @{Regex = 'ltd'; Replacement = 'limited' },
	    @{Regex = 'col'; Replacement = 'colonel' },
	    @{Regex = 'ft'; Replacement = 'fort' }
	) | ForEach-Object {
	    $_.Regex = [regex]"(?i)\b$($_.Regex)\."
	    # Re-emit the updated entry so it ends up in $abbreviations.
	    $_
	}

	# Replaces every abbreviation listed in $abbreviations with its expansion.
	#   $text - string to process.
	# Returns the rewritten string; entries are applied in table order.
	function Expand-Abbreviations($text) {
	    foreach ($entry in $abbreviations) {
	        $text = $entry.Regex.Replace($text, $entry.Replacement)
	    }
	    return $text
	}

	# Spells out the numbers in $text by delegating to Get-NormalizedNumbers.
	#   $text - string to process. Declared explicitly so the function works on
	#           the value it is given rather than on whatever $text happens to
	#           be visible from the caller's scope.
	# Returns the rewritten string.
	function Expand-Numbers($text) {
	    return Get-NormalizedNumbers $text
	}


	# Returns $text converted to lower case.
	function ConvertTo-Lowercase($text) {
	    $lowered = $text.ToLower()
	    return $lowered
	}


	# Trims $text and collapses every run of whitespace inside it to one space.
	#   $text - string to process.
	# Returns the cleaned string.
	function Remove-Whitespace($text) {
	    $trimmed = $text.Trim()
	    # Return the value directly instead of an assignment expression.
	    return [regex]::Replace($trimmed, "\s+", ' ')
	}


	# Passes $text through ConvertFrom-Unicode while keeping "£" intact: the sign
	# is swapped for a placeholder before the call and restored afterwards.
	function ConvertTo-Ascii($text) {
	    $placeholder = "<<pound sign>>"
	    $shielded = $text.Replace("£", $placeholder)
	    $converted = ConvertFrom-Unicode $shielded
	    return $converted.Replace($placeholder, "£")
	}


	# Deletes angle brackets, parentheses, square brackets and double quotes
	# from $text and returns the result.
	function Remove-AuxSymbols($text) {
	    return [regex]::Replace($text, '[\<\>\(\)\[\]\"]+', "")
	}


	# Rewrites punctuation in $text: ";" becomes ",", "-" and ":" become a space,
	# and "&" becomes "and". Returns the rewritten string.
	function Set-Symbols($text) {
	    return $text.replace(';', ',').replace('-', ' ').replace(':', ' ').replace('&', 'and')
	}

	# Basic pipeline: lower-cases $text, then collapses its whitespace. No
	# transliteration is performed.
	function Invoke-BasicCleaners($text) {
	    return Remove-Whitespace (ConvertTo-Lowercase $text)
	}

	# Pipeline for non-English text: transliterates $text to ASCII, lower-cases
	# it, then collapses its whitespace.
	function Invoke-TransliterationCleaners($text) {
	    $ascii = ConvertTo-Ascii $text
	    $lowered = ConvertTo-Lowercase $ascii
	    return Remove-Whitespace $lowered
	}


	# Pipeline for English text, including number and abbreviation expansion.
	# Each step receives the output of the previous one.
	function Invoke-EnglishCleaners($text) {
	    $steps = @(
	        'ConvertTo-Ascii',
	        'ConvertTo-Lowercase',
	        'Expand-Numbers',
	        'Expand-Abbreviations',
	        'Set-Symbols',
	        'Remove-AuxSymbols',
	        'Remove-Whitespace'
	    )
	    foreach ($step in $steps) {
	        $text = & $step $text
	    }
	    return $text
	}


	# Pipeline for phonemes mode, including number and abbreviation expansion.
	# Same steps as the English pipeline except that the text is not lower-cased.
	function Invoke-PhonemeCleaners($text) {
	    $steps = @(
	        'ConvertTo-Ascii',
	        'Expand-Numbers',
	        'Expand-Abbreviations',
	        'Set-Symbols',
	        'Remove-AuxSymbols',
	        'Remove-Whitespace'
	    )
	    foreach ($step in $steps) {
	        $text = & $step $text
	    }
	    return $text
	}