Last active
November 1, 2022 19:56
-
-
Save indented-automation/8e603144167c7acca4dd8f653d47441e to your computer and use it in GitHub Desktop.
Signature-based encoding detection
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using namespace System.Collections.Generic; using namespace System.Linq | |
function Get-FileEncoding {
    <#
    .SYNOPSIS
        Attempt to determine a file type based on a BOM or file header.

    .DESCRIPTION
        This function attempts to determine file types based on a byte sequence at the beginning of the file.

        If an identifiable byte sequence is not present the file type cannot be determined using this method
        and the Encoding property of the output object is left empty.

        The order signatures appear in is critical where signatures overlap. For example, UTF32-LE must be
        evaluated before UTF16-LE because the UTF16-LE signature is a prefix of the UTF32-LE signature.

        Only the bytes actually read from the file are compared. A signature that is longer than the file
        is never considered a match.

    .EXAMPLE
        Get-FileEncoding -Path .\script.ps1

        Reports the encoding of script.ps1 if it starts with a recognised byte order mark.

    .EXAMPLE
        Get-ChildItem -File | Get-FileEncoding -IncludeBinary

        Tests every file in the current directory, including the binary file signatures.

    .OUTPUTS
        EncodingInfo. A custom object with the properties Name, Extension, Encoding, and Path.

    .LINK
        https://en.wikipedia.org/wiki/Byte_order_mark#cite_note-b-15
        https://filesignatures.net
    #>

    [CmdletBinding()]
    [OutputType('EncodingInfo')]
    param (
        # The path to a file to analyze.
        [Parameter(Mandatory, Position = 1, ValueFromPipeline, ValueFromPipelineByPropertyName)]
        # -LiteralPath: the value names one specific file, so characters such as [ and ] must not be
        # interpreted as wildcards.
        [ValidateScript( { Test-Path -LiteralPath $_ -PathType Leaf } )]
        [Alias('FullName')]
        [String]$Path,

        # Test the file against a small set of signature definitions for binary file types.
        #
        # Identification should be treated as tentative. Several file formats cannot be identified using the sequence at the start alone.
        [Switch]$IncludeBinary
    )

    begin {
        # Signatures are evaluated in the order written. Longer signatures must come before any
        # shorter signature they start with.
        $signatures = [Ordered]@{
            'UTF32-LE'   = 'FF-FE-00-00'
            'UTF32-BE'   = '00-00-FE-FF'
            'UTF8'       = 'EF-BB-BF'
            'UTF16-LE'   = 'FF-FE'
            'UTF16-BE'   = 'FE-FF'
            'UTF7'       = '2B-2F-76-38', '2B-2F-76-39', '2B-2F-76-2B', '2B-2F-76-2F'
            'UTF1'       = 'F7-64-4C'
            'UTF-EBCDIC' = 'DD-73-66-73'
            'SCSU'       = '0E-FE-FF'
            'BOCU-1'     = 'FB-EE-28'
            'GB-18030'   = '84-31-95-33'
        }

        if ($IncludeBinary) {
            $signatures += [Ordered]@{
                'LNK'      = '4C-00-00-00-01-14-02-00'
                'MSEXCEL'  = '50-4B-03-04-14-00-06-00'
                'PNG'      = '89-50-4E-47-0D-0A-1A-0A'
                'MSOFFICE' = 'D0-CF-11-E0-A1-B1-1A-E1'
                '7ZIP'     = '37-7A-BC-AF-27-1C'
                'RTF'      = '7B-5C-72-74-66-31'
                'GIF'      = '47-49-46-38'
                'REGPOL'   = '50-52-65-67'
                'GZIP'     = '1F-8B'
                'JPEG'     = 'FF-D8'
                'MSEXE'    = '4D-5A'
                'ZIP'      = '50-4B'
            }
        }

        # Convert sequence strings to byte arrays. Intended to simplify signature maintenance.
        # The length of the longest signature is tracked so the read buffer is always large enough.
        $maxSignatureLength = 0

        # Copy the keys first: the dictionary values are replaced inside the loop.
        [String[]]$keys = $signatures.Keys
        foreach ($name in $keys) {
            $sequences = [List[Byte[]]]::new()

            foreach ($value in $signatures[$name]) {
                [Byte[]]$signatureBytes = foreach ($byte in $value.Split('-')) {
                    [Convert]::ToByte($byte, 16)
                }
                $sequences.Add($signatureBytes)

                if ($signatureBytes.Length -gt $maxSignatureLength) {
                    $maxSignatureLength = $signatureBytes.Length
                }
            }

            $signatures[$name] = $sequences
        }
    }

    process {
        try {
            $Path = $pscmdlet.GetUnresolvedProviderPathFromPSPath($Path)

            $buffer = [Byte[]]::new($maxSignatureLength)
            $bytesRead = 0

            $stream = [System.IO.File]::OpenRead($Path)
            try {
                # Stream.Read may return fewer bytes than requested; keep reading until the buffer
                # is full or the end of the file is reached.
                while ($bytesRead -lt $buffer.Length) {
                    $count = $stream.Read($buffer, $bytesRead, $buffer.Length - $bytesRead)
                    if ($count -le 0) {
                        break
                    }
                    $bytesRead += $count
                }
            } finally {
                # Release the file handle even if reading fails.
                $stream.Dispose()
            }

            $encoding = $null
            :search foreach ($name in $signatures.Keys) {
                foreach ($sequence in $signatures[$name]) {
                    # A signature longer than the data read cannot match. Without this check the
                    # unused, zero-filled tail of the buffer would take part in the comparison and
                    # a file holding only FF-FE would be reported as UTF32-LE.
                    if ($sequence.Length -gt $bytesRead) {
                        continue
                    }

                    $isMatch = $true
                    for ($i = 0; $i -lt $sequence.Length; $i++) {
                        if ($buffer[$i] -ne $sequence[$i]) {
                            $isMatch = $false
                            break
                        }
                    }

                    if ($isMatch) {
                        # First match wins; stop searching all remaining signatures.
                        $encoding = $name
                        break search
                    }
                }
            }

            [PSCustomObject]@{
                Name       = Split-Path $Path -Leaf
                Extension  = [System.IO.Path]::GetExtension($Path)
                Encoding   = $encoding
                Path       = $Path
                PSTypeName = 'EncodingInfo'
            }
        } catch {
            $pscmdlet.WriteError($_)
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment