<#
.SYNOPSIS
Converts files to the given encoding.
Matches the include pattern recursively under the given path.
.EXAMPLE
Convert-FileEncoding -Include *.js -Path scripts -Encoding UTF8
#>
function Convert-FileEncoding([string]$Include, [string]$Path, [string]$Encoding='UTF8') {
    $count = 0
    Get-ChildItem -Include $Include -Recurse -Path $Path `
        | select FullName, @{n='Encoding';e={Get-FileEncoding $_.FullName}} `
        | where {$_.Encoding -ne $Encoding} `
        | % { (Get-Content $_.FullName) `
        | Out-File $_.FullName -Encoding $Encoding; $count++; }
    Write-Host "$count $Include file(s) converted to $Encoding in $Path."
}
# http://franckrichard.blogspot.com/2010/08/powershell-get-encoding-file-type.html
<#
.SYNOPSIS
Gets file encoding.
.DESCRIPTION
The Get-FileEncoding function determines encoding by looking at the Byte Order Mark (BOM).
Based on a port of the C# code from http://www.west-wind.com/Weblog/posts/197245.aspx
.EXAMPLE
Get-ChildItem *.ps1 | select FullName, @{n='Encoding';e={Get-FileEncoding $_.FullName}} | where {$_.Encoding -ne 'ASCII'}
This command gets the ps1 files in the current directory whose encoding is not ASCII.
.EXAMPLE
Get-ChildItem *.ps1 | select FullName, @{n='Encoding';e={Get-FileEncoding $_.FullName}} | where {$_.Encoding -ne 'ASCII'} | foreach {(get-content $_.FullName) | set-content $_.FullName -Encoding ASCII}
Same as the previous example, but fixes the encoding using Set-Content.
# Modified by F.RICHARD August 2010
# add comment + more BOM
# http://unicode.org/faq/utf_bom.html
# http://en.wikipedia.org/wiki/Byte_order_mark
#
# Run the next line first, or add the function to your Profile.ps1:
# Import-Module .\Get-FileEncoding.ps1
#>
function Get-FileEncoding
{
    [CmdletBinding()]
    Param (
        [Parameter(Mandatory = $True, ValueFromPipelineByPropertyName = $True)]
        [string]$Path
    )

    [byte[]]$byte = Get-Content -Encoding Byte -ReadCount 4 -TotalCount 4 -Path $Path
    #Write-Host Bytes: $byte[0] $byte[1] $byte[2] $byte[3]
    # EF BB BF (UTF8)
    if ( $byte[0] -eq 0xef -and $byte[1] -eq 0xbb -and $byte[2] -eq 0xbf )
    { Write-Output 'UTF8' }
    # 00 00 FE FF (UTF32 Big-Endian)
    elseif ($byte[0] -eq 0 -and $byte[1] -eq 0 -and $byte[2] -eq 0xfe -and $byte[3] -eq 0xff)
    { Write-Output 'UTF32 Big-Endian' }
    # FF FE 00 00 (UTF32 Little-Endian)
    # Checked before the two-byte UTF-16 Little-Endian mark (FF FE), which is its prefix.
    elseif ($byte[0] -eq 0xff -and $byte[1] -eq 0xfe -and $byte[2] -eq 0 -and $byte[3] -eq 0)
    { Write-Output 'UTF32 Little-Endian' }
    # FE FF (UTF-16 Big-Endian)
    elseif ($byte[0] -eq 0xfe -and $byte[1] -eq 0xff)
    { Write-Output 'Unicode UTF-16 Big-Endian' }
    # FF FE (UTF-16 Little-Endian)
    elseif ($byte[0] -eq 0xff -and $byte[1] -eq 0xfe)
    { Write-Output 'Unicode UTF-16 Little-Endian' }
    # 2B 2F 76 (38 | 39 | 2B | 2F) (UTF-7)
    elseif ($byte[0] -eq 0x2b -and $byte[1] -eq 0x2f -and $byte[2] -eq 0x76 -and ($byte[3] -eq 0x38 -or $byte[3] -eq 0x39 -or $byte[3] -eq 0x2b -or $byte[3] -eq 0x2f) )
    { Write-Output 'UTF7' }
    # F7 64 4C (UTF-1)
    elseif ( $byte[0] -eq 0xf7 -and $byte[1] -eq 0x64 -and $byte[2] -eq 0x4c )
    { Write-Output 'UTF-1' }
    # DD 73 66 73 (UTF-EBCDIC)
    elseif ($byte[0] -eq 0xdd -and $byte[1] -eq 0x73 -and $byte[2] -eq 0x66 -and $byte[3] -eq 0x73)
    { Write-Output 'UTF-EBCDIC' }
    # 0E FE FF (SCSU)
    elseif ( $byte[0] -eq 0x0e -and $byte[1] -eq 0xfe -and $byte[2] -eq 0xff )
    { Write-Output 'SCSU' }
    # FB EE 28 (BOCU-1)
    elseif ( $byte[0] -eq 0xfb -and $byte[1] -eq 0xee -and $byte[2] -eq 0x28 )
    { Write-Output 'BOCU-1' }
    # 84 31 95 33 (GB-18030)
    elseif ($byte[0] -eq 0x84 -and $byte[1] -eq 0x31 -and $byte[2] -eq 0x95 -and $byte[3] -eq 0x33)
    { Write-Output 'GB-18030' }
    else
    { Write-Output 'ASCII' }
}
Here is an alternative. The value of $Encoding may be 'utf-8'; the hyphen is removed (if there is one) so that it can be passed to, for instance, Out-File.
$File = Get-Item .\src\app\app.module.ts
$Encoding = New-Object -TypeName System.IO.StreamReader -ArgumentList $File.FullName -OutVariable Stream | `
Select-Object -ExpandProperty CurrentEncoding | `
Select-Object -ExpandProperty BodyName
$Stream.Dispose()
if ($Encoding.Contains('-')) {
$Encoding = $Encoding.Replace('-', '')
}
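Note that StreamReader only autodetects the encoding on the first read, so CurrentEncoding can still report the constructor default if nothing has been read yet. A minimal variation of the snippet above that forces detection first (calling Peek() is enough to trigger it):

$File = Get-Item .\src\app\app.module.ts
$Reader = New-Object -TypeName System.IO.StreamReader -ArgumentList $File.FullName
$null = $Reader.Peek()                        # force BOM/encoding detection before querying
$Encoding = $Reader.CurrentEncoding.BodyName  # e.g. 'utf-8'
$Reader.Dispose()
$Encoding = $Encoding.Replace('-', '')        # e.g. 'utf8', suitable for Out-File -Encoding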
In PowerShell Core, -Encoding Byte is no longer a valid option for Get-Content; it has been replaced by a new parameter named -AsByteStream:
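So on PowerShell 7 the byte-reading line in Get-FileEncoding would look something like the following (a sketch of the equivalent call; the [byte[]] cast collects the streamed bytes as before):

[byte[]]$byte = Get-Content -AsByteStream -TotalCount 4 -Path $Path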
I like your script for identifying the encoding. It would be nice if it could identify utf8NoBOM and "ANSI", whatever that is. Notepad and Notepad++ seem to be able to recognize both. In PowerShell 6, Set-Content can save as utf8NoBOM. Strangely, the docs say the default for Set-Content in PowerShell 5 is ASCII, but it's more like "ANSI" (or utf8NoBOM in PowerShell 6). Note that Out-File and tee in PowerShell 5 save as "Unicode". Reliable ways to detect ASCII and UTF-8 (no BOM): https://unicodebook.readthedocs.io/guess_encoding.html
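For what it's worth, a common way to detect BOM-less UTF-8 (in the spirit of the linked article) is to attempt a strict UTF-8 decode and see whether it throws. The sketch below is only an illustration, not part of the gist; the Test-Utf8NoBom name is made up, and a pure-ASCII file will also pass since ASCII is a subset of UTF-8.

# Returns $true if the file's bytes are valid UTF-8 (covers BOM-less UTF-8 and plain ASCII).
function Test-Utf8NoBom {
    param([Parameter(Mandatory = $true)][string]$Path)
    $bytes = [System.IO.File]::ReadAllBytes($Path)
    # Strict decoder: emit no BOM, throw on invalid byte sequences.
    $strictUtf8 = New-Object System.Text.UTF8Encoding($false, $true)
    try {
        $null = $strictUtf8.GetString($bytes)
        return $true
    }
    catch {
        # Invalid UTF-8; likely a single-byte "ANSI" code page.
        return $false
    }
}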
There is one error in the 11th line: "$Pattern" should be "$Include". But still, thanks for your help.
You should look into the UTF8 spec and other detection algorithms. Relying on a BOM is not accurate.