PowerShell scripts for creating and reading test files with the standard Unicode character encoding schemes and default encodings.
<#
.SYNOPSIS
Creates test text-based files with various character encodings.

.DESCRIPTION
Creates text-based test files using

* all 5 byte order-marked Unicode character encoding schemes,
  both with and without BOM (Unicode signature)
* default encodings:
  the platform's default encoding, [System.Text.Encoding]::Default,
  Set-Content's default encoding,
  and Out-File's default encoding.

The following file types are created for each of these 8 encodings:

  .txt    ... plain-text file
  .clixml ... a CLI XML file serializing a string
  .csv    ... a CSV file with a single column, 'Value'
  .psd1   ... a PS data file with a single hashtable entry, 'Value'

The base names of the files created with explicit encodings reflect the
encoding form, the endianness, and the presence of a BOM:

  utf-8, utf-16, utf-32 ... encoding form
  le, be                ... little-endian, big-endian
  B, N                  ... BOM present, no BOM

E.g., 'utf16leB' is a UTF-16 LE-encoded file with a BOM.

The base names of the files created with default encodings:

  default    ... files created with [System.Text.Encoding]::Default
  sc-default ... files created with Set-Content's default encoding
  of-default ... files created with Out-File's default encoding

.PARAMETER Text
The *single-line* text to save to the test files.
The default uses 1 ASCII and 1 non-ASCII letter:
'oö', Unicode codepoints U+006F and U+00F6 (the latter's UTF-8 encoding: 0xc3 0xb6)

.PARAMETER LiteralPath
The output directory in which to save the test files; default is ./enc-test/
If the output dir. doesn't exist yet, you are prompted to confirm its creation.
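
.EXAMPLE
./New-EncodingTestFiles.ps1 -LiteralPath ./enc-test

Illustrative invocation only (it assumes this script is saved under the name
New-EncodingTestFiles.ps1, the name the companion test script refers to):
creates the .txt, .csv, .clixml, and .psd1 test files with the default text
in directory ./enc-test.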
#>

param(
  # IMPORTANT: AVOID NON-ASCII STRING LITERALS, BECAUSE
  # WE CAN'T BE SURE THAT THE ENCLOSING FILE WILL HAVE A UTF-8 BOM,
  # E.G., WHEN DOWNLOADED FROM A Gist (GitHub).
  # POWERSHELL ITSELF DEFAULTS TO "ANSI" ENCODING WHEN READING A
  # FILE WITHOUT A BOM.
  # THE FOLLOWING IS THE EQUIVALENT OF 'oö': lowercase 'o'
  # (LATIN SMALL LETTER O), followed by
  # lowercase umlaut-o (LATIN SMALL LETTER O WITH DIAERESIS).
  [string] $Text = [char[]] (0x6f, 0xf6) -join '',

  [Alias('Path')]
  [string] $LiteralPath = './enc-test'
)

$ErrorActionPreference = 'Stop'
$VerbosePreference = 'Continue'

# Instantiate encoders and store them in a hashtable whose keys reflect
# the encoding scheme; the keys are used as the filenames' base names.
# NOTE: UTF-7 is not included, because the encoder doesn't offer creating a BOM
#       and PowerShell doesn't expect one when reading with -Encoding UTF7
#       (it returns U+FEFF as a literal char.):
#   "utf7"  = New-Object System.Text.UTF7Encoding $False
#   "utf7o" = New-Object System.Text.UTF7Encoding $True # allow optional direct chars.
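#       (For reference, a UTF-7-encoded U+FEFF signature would appear on disk as the
#       ASCII bytes 0x2B 0x2F 0x76 followed by one of 0x38, 0x39, 0x2B, or 0x2F.)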
# Key legend:
#   B ... *with* BOM
#   N ... *no* BOM
#   le, be ... little-endian, big-endian
$htEncs = [ordered] @{
  "utf8B"      = New-Object System.Text.UTF8Encoding $True             # BOM-or-not
  "utf8N"      = New-Object System.Text.UTF8Encoding $False            # BOM-or-not
  "utf16leB"   = New-Object System.Text.UnicodeEncoding $False, $True  # big-or-little-endian, BOM-or-not
  "utf16leN"   = New-Object System.Text.UnicodeEncoding $False, $False # big-or-little-endian, BOM-or-not
  "utf16beB"   = New-Object System.Text.UnicodeEncoding $True, $True   # big-or-little-endian, BOM-or-not
  "utf16beN"   = New-Object System.Text.UnicodeEncoding $True, $False  # big-or-little-endian, BOM-or-not
  "utf32leB"   = New-Object System.Text.UTF32Encoding $False, $True    # big-or-little-endian, BOM-or-not
  "utf32leN"   = New-Object System.Text.UTF32Encoding $False, $False   # big-or-little-endian, BOM-or-not
  "utf32beB"   = New-Object System.Text.UTF32Encoding $True, $True     # big-or-little-endian, BOM-or-not
  "utf32beN"   = New-Object System.Text.UTF32Encoding $True, $False    # big-or-little-endian, BOM-or-not
  "default"    = [System.Text.Encoding]::Default
  "sc-default" = $null # Use Set-Content - note that [System.Text.Encoding]::Default is not the same in PS Core (UTF-8) as in Windows PowerShell (ANSI).
  "of-default" = $null # Use Out-File (whose default in Windows PowerShell is UTF-16LE)
}
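
# For reference, the BOM (signature) bytes these encoders emit are:
#   UTF-8: EF BB BF; UTF-16 LE: FF FE; UTF-16 BE: FE FF;
#   UTF-32 LE: FF FE 00 00; UTF-32 BE: 00 00 FE FF.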
# Determine the file contents for the various file types, by filename extension.
$htTexts = [ordered] @{
  '.txt' = $Text
  # Note: Import-Csv requires field-internal " chars. to be escaped as "", in line with RFC 4180.
  '.csv' = @"
Value
"$($Text -replace '"', '""')"
"@
  '.clixml' = @"
<Objs Version="1.1.0.1" xmlns="http://schemas.microsoft.com/powershell/2004/04">
<S><![CDATA[$Text]]></S>
</Objs>
"@
  '.psd1' = @"
@{
Value = '$($Text -replace "'", "''")'
}
"@
}
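
# For example, with the default text ('o' followed by o-umlaut), the utf8B variant of
# the .txt file should end up as these 6 bytes on disk: EF BB BF 6F C3 B6
# (UTF-8 BOM + 'o' + the UTF-8 encoding of U+00F6), with no trailing newline.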
# Determine the output path and create the output dir. on demand.
if (-not (Test-Path $LiteralPath)) { # output dir. doesn't exist, create it
  Write-Host -ForegroundColor Yellow "OK to create output dir.?"
  if (-not (New-Item -ItemType Directory $LiteralPath -Confirm:$true)) { exit 1 }
}

# Make sure that the .NET framework uses the same working dir. as PS.
[IO.Directory]::SetCurrentDirectory($PWD.ProviderPath)

# Create the files.
foreach ($ext in $htTexts.Keys) {
  foreach ($name in $htEncs.Keys) {
    $enc = $htEncs.$name
    $txt = $htTexts.$ext
    $fpath = Join-Path $LiteralPath ($name + $ext)
    Write-Verbose "Writing to: $fpath"
    if ($name -eq 'sc-default') { # Use Set-Content
      Set-Content -Value $txt -NoNewline -LiteralPath $fpath
    } elseif ($name -eq 'of-default') { # Use Out-File
      Out-File -InputObject $txt -NoNewline -LiteralPath $fpath
    } else {
      [IO.File]::WriteAllText($fpath, $txt, $enc)
    }
  }
}
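
# To spot-check the raw bytes of any generated file, a hex dump can be produced with
# Format-Hex (available in PowerShell 5+); e.g. (illustrative):
#   Format-Hex -Path ./enc-test/utf16leB.txt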
<#
.SYNOPSIS
Tests whether the encoding-test files created by New-EncodingTestFiles are
read correctly by various cmdlets.

.DESCRIPTION
The files created by New-EncodingTestFiles are read with the appropriate
cmdlets, both with and without the appropriate -Encoding arguments, to see
whether the files are interpreted correctly in terms of character encoding.

Each cmdlet is tested in 2 passes:

* Without use of -Encoding, to test default detection.
* If supported by the cmdlet, with the -Encoding value matching the input file.
  The tests in this pass should always succeed, except if a specific encoding
  isn't supported.

.PARAMETER ReferenceText
The text to compare the text read from the test files to.
The default value matches New-EncodingTestFiles's default.

.PARAMETER LiteralPath
The path to the directory in which the files created by New-EncodingTestFiles
are located.
The default value matches New-EncodingTestFiles's default.
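
.EXAMPLE
./Test-EncodingTestFiles.ps1 -Verbose | Format-Table -AutoSize

Illustrative invocation only (the filename Test-EncodingTestFiles.ps1 is an
assumption): reads every test file with each applicable cmdlet and emits one
result object per cmdlet / file / pass combination.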
#>

[CmdletBinding()]
param(
  # IMPORTANT: AVOID NON-ASCII STRING LITERALS, BECAUSE
  # WE CAN'T BE SURE THAT THE ENCLOSING FILE WILL HAVE A UTF-8 BOM,
  # E.G., WHEN DOWNLOADED FROM A Gist (GitHub).
  # POWERSHELL ITSELF DEFAULTS TO "ANSI" ENCODING WHEN READING A
  # FILE WITHOUT A BOM.
  # THE FOLLOWING IS THE EQUIVALENT OF 'oö': lowercase 'o'
  # (LATIN SMALL LETTER O), followed by
  # lowercase umlaut-o (LATIN SMALL LETTER O WITH DIAERESIS).
  [string] $ReferenceText = [char[]] (0x6f, 0xf6) -join '', # Matches New-EncodingTestFiles's default

  # The directory containing the test files.
  [string] $LiteralPath = './enc-test' # Matches New-EncodingTestFiles's default
)

# ENSURE THAT THIS FILE IS UTF-8-ENCODED *WITH* A BOM - otherwise PowerShell
# will not interpret it correctly.

function Get-CodePointList([string] $Text) {
  <#
  .SYNOPSIS
  Converts a string to a single-line list (string) of the characters' Unicode
  codepoints in hex. format; e.g., 'oö' -> '0x6f 0xf6'
  .EXAMPLE
  > Get-CodePointList 'oö'
  0x6f 0xf6
  #>
  switch ($Text) {
    $null   { '(null)'; break }
    ''      { '(empty)'; break }
    Default {
      [string] ([int[]] $Text.ToCharArray() | ForEach-Object { '0x{0:x2}' -f $_ })
    }
  }
}

# Note: 'Stop' is also needed so that the try/catch handlers below catch
# non-terminating errors.
$ErrorActionPreference = 'Stop'

# The filename extensions and the cmdlet(s) to load each file type with.
# Note that Import-PowerShellDataFile and Import-Clixml do not support the
# -Encoding parameter.
$htExts = [ordered] @{
  '.txt'    = @{ cmdletName = 'Get-Content' },
              @{ cmdletName = 'Select-String'; fixedParams = @{ Pattern = $ReferenceText; SimpleMatch = $true } }
  '.csv'    = @{ cmdletName = 'Import-Csv' }
  '.psd1'   = @{ cmdletName = 'Import-PowerShellDataFile' }
  '.clixml' = @{ cmdletName = 'Import-Clixml' }
}

# Map the filename roots to the corresponding -Encoding parameter values.
$htEncodingNames = @{
  utf8         = 'utf8'
  utf16le      = 'Unicode'
  utf16be      = 'BigEndianUnicode'
  utf32le      = 'UTF32'
  utf32be      = 'BigEndianUTF32'
  'default'    = 'Default'
  'sc-default' = 'Default'
  'of-default' = 'Unicode'
}
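
# Note: Which of these -Encoding argument values are accepted differs between Windows
# PowerShell and PowerShell Core (e.g., 'BigEndianUTF32' may be rejected, depending on
# the version); such cases surface as "NOT SUPPORTED" in the results below.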
# Loop over all file types.
foreach ($ext in $htExts.Keys) {
  $cmdDefs = $htExts.$ext
  $files = Get-Item -Path (Join-Path $LiteralPath "*$ext")
  Write-Verbose "============= $ext"
  # Read with and without the -Encoding parameter.
  foreach ($cmdDef in $cmdDefs) {
    $cmd = $cmdDef.cmdletName
    $htParams = $cmdDef.fixedParams
    if (-not $htParams) { $htParams = @{} }
    for ($pass = 1; $pass -le 2; ++$pass) {
      # 2nd pass: See if the cmdlet even supports -Encoding and skip it, if not.
      if ($pass -eq 2) {
        if (-not (Get-Command $cmd).Parameters.ContainsKey('Encoding')) {
          Write-Verbose "== ($cmd doesn't have an -Encoding parameter)"
          break
        }
      }
      Write-Verbose "== Using $cmd $(if ($pass -eq 1) { 'WITHOUT' } else { 'WITH appropriate' }) -Encoding parameter:"
      $htEncodingParamIfAny = @{}
      foreach ($file in $files) {
        $encName = '(default)'
        if ($pass -eq 2) {
          # Derive the encoding name from the file's base name, stripping the B/N (BOM) suffix.
          $encName = $htEncodingNames.$($file.BaseName -replace '[NB]$')
          $htParams.Encoding = $encName # Set the -Encoding argument.
        }
        $exceptionText = ''
        try {
          $content = & $cmd $file.FullName @htParams
        }
        catch {
          $exceptionText = "$_"
          if ($_.Exception.ParameterName -eq 'Encoding') {
            $result = "NOT SUPPORTED: $encName"
          } else {
            $result = "ERROR"
          }
        }
        if ($exceptionText) { # reading failed
          Write-Verbose "${encName}, ${cmd}: exception occurred: $exceptionText"
        } else { # reading succeeded, but the encoding may not have been interpreted correctly
          # Extract the string value to test from the *object* that some of the
          # cmdlets return.
          if ($content -isnot [string]) {
            if ($cmd -eq 'Select-String') { # The matched line is in the .Line property.
              $content = $content.Line
            } else { # all others have a .Value property
              $content = $content.Value
            }
          }
          Write-Verbose "${encName}, ${cmd}: value: [$content]; codepoints: $(Get-CodePointList $content)"
          if ($null -eq $content) { # value could not be read
            $result = "NOTHING READ"
          } else { # make sure that what was read matches the reference text, codepoint by codepoint
            $codePoints = [int[]] $content.ToCharArray()
            $result = $codePoints.Count -eq $ReferenceText.Length -and $content -eq $ReferenceText
            $result = ('INCORRECTLY DECODED', 'ok')[$result]
          }
        }
        # Emit a result object for this cmdlet / file / pass combination.
        [pscustomobject] @{ Cmdlet = $cmd; Method = ('-Encoding', 'Auto')[$pass -eq 1]; FileName = $file.Name; Result = $result } # | Out-Default
      }
    } # foreach $pass
  } # foreach $cmd
} # foreach $ext
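
# Each emitted object has Cmdlet, Method ('Auto' or '-Encoding'), FileName, and Result
# properties, where Result is 'ok', 'INCORRECTLY DECODED', 'NOTHING READ', 'ERROR',
# or 'NOT SUPPORTED: <name>'.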