Last active
September 19, 2023 15:27
-
-
Save sean-m/7dd2d6f044f935c84791147cc244f9df to your computer and use it in GitHub Desktop.
PowerShell module with functions to assist with reading and writing GZip-compressed files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Requires -Version 4

# Load Types
# Compiles the CompressHelper C# helper, skipping compilation when a type with
# that name is already loaded in the session. The work happens inside an
# invoked script block so the temporaries below stay out of the module scope.
{
    $helperType = ([System.Management.Automation.PSTypeName]'CompressHelper').Type
    if ($helperType) { return }

    $helperSource = @"
using System;
using System.IO;
using System.Threading.Tasks;
public static class CompressHelper {
    public static void CopyStream (Stream src, Stream dest) {
        CopyStreamAsync(src, dest).Wait(); // Faster than just using CopyTo()
    }
    public static Task CopyStreamAsync (Stream src, Stream dest) {
        return src.CopyToAsync(dest);
    }
}
"@

    Add-Type -Language CSharp -TypeDefinition $helperSource
}.Invoke()
<#
.SYNOPSIS
    Copies the remaining content of one stream into another.
.DESCRIPTION
    Validates that the source can be read and the destination can be written,
    then hands both streams to [CompressHelper]::CopyStream, which performs the
    copy with CopyToAsync and blocks until it has finished.
.PARAMETER SrcStream
    Stream the data is read from.
.PARAMETER DestStream
    Stream the data is written to.
#>
function Copy-Stream {
    param (
        [IO.Stream]$SrcStream,
        [IO.Stream]$DestStream
    )

    # Guard clauses: refuse to start a copy that cannot succeed.
    $sourceReadable = $SrcStream.CanRead
    if (-not $sourceReadable) {
        throw (New-Object System.IO.IOException "Cannot read from source stream.")
    }

    $destinationWritable = $DestStream.CanWrite
    if (-not $destinationWritable) {
        throw (New-Object System.IO.IOException "Cannot write to destination stream.")
    }

    Write-Debug "Copying streams"
    [CompressHelper]::CopyStream($SrcStream, $DestStream)
}
Export-ModuleMember -Function Copy-Stream
<#
.SYNOPSIS
    Writes data to a GZip compressed file; a GZip semi-analog of Out-File,
    without encoding options.
.DESCRIPTION
    Compresses InputData into FilePath using CompressionLevel Optimal (slowest
    compression, best space savings). InputData may be a Stream, which is copied
    with Copy-Stream, or any other object or collection of objects, whose
    .ToString() values are written one per line.

    The destination is opened, written and closed within a single invocation.
    Pipeline input by value is not supported.
.PARAMETER InputData
    A Stream, or an object / collection of objects whose .ToString() values are
    written line by line. Stream input is the faster path because it is a
    buffered copy rather than a line-at-a-time write.
.PARAMETER FilePath
    Path of the GZip file to write. Relative paths are resolved against the
    current PowerShell location.
.PARAMETER Force
    Overwrite the destination file if it already exists.
.PARAMETER Verify
    Compute a SHA256 hash of the data as it is written, then decompress the
    finished file and compare hashes.
.OUTPUTS
    Nothing by default. With -Verify, the string
    "Verified: <FilePath> - True" or "Verified: <FilePath> - False".
.EXAMPLE
    Out-GzipFile -InputData ([System.IO.File]::OpenRead("Example.txt")) -FilePath "Example.txt.gz" -Force
.EXAMPLE
    Out-GzipFile -InputData @(Get-Content "Example.txt") -FilePath "Example.txt.gz" -Force
.EXAMPLE
    $request = [Net.WebRequest]::Create($serverUri)
    $request.Method = [Net.WebRequestMethods+Ftp]::DownloadFile
    $response = $request.GetResponse()
    Out-GzipFile -InputData ($response.GetResponseStream()) -FilePath "ftp_data.txt.gz"
#>
function Out-GzipFile {
    [CmdletBinding()]
    [Alias()]
    [OutputType([string])]
    param (
        # Data to compress
        [Parameter(Mandatory=$true,
                   ValueFromPipeline=$false,
                   ValueFromPipelineByPropertyName=$true,
                   Position=0)]
        $InputData,

        # File to write to
        [Parameter(Mandatory=$true,
                   ValueFromPipeline=$false,
                   ValueFromPipelineByPropertyName=$true,
                   Position=1)]
        [string]$FilePath,

        # Overwrite destination if it exists
        [Parameter(Mandatory=$false,
                   ValueFromPipeline=$false,
                   ValueFromPipelineByPropertyName=$true,
                   Position=2)]
        [switch]$Force,

        # Verify written data with a SHA256 hash
        [Parameter(Mandatory=$false,
                   ValueFromPipeline=$false,
                   ValueFromPipelineByPropertyName=$true,
                   Position=4)]
        [switch]$Verify
    )

    process {
        # Resolve against PowerShell's location. Handing a relative path
        # straight to .NET would resolve it against the process working
        # directory, which can differ from the current PowerShell location.
        $destPath = $PSCmdlet.GetUnresolvedProviderPathFromPSPath($FilePath)

        # Choose the file mode: never overwrite silently.
        $mode = 'CreateNew'
        if (Test-Path -LiteralPath $destPath) {
            if (-not $Force) {
                throw [System.IO.IOException] "File exists and `$Force flag not passed. Quiting.."
            }
            $mode = 'Truncate'
        }

        $fileStream   = $null
        $gzip         = $null
        $sink         = $null   # stream the payload is written to (gzip, or hash wrapper around it)
        $writer       = $null
        $sourceHasher = $null
        $sourceHash   = ''

        try {
            $fileStream = New-Object System.IO.FileStream $destPath, $mode, 'Write', 'None'
            $gzip = New-Object System.IO.Compression.GZipStream $fileStream, ([System.IO.Compression.CompressionLevel]::Optimal)
            $sink = $gzip

            if ($Verify) {
                # Route every byte through a hashing CryptoStream on its way
                # into the compressor so all input kinds get a running hash.
                $sourceHasher = [System.Security.Cryptography.SHA256]::Create()
                $sink = New-Object System.Security.Cryptography.CryptoStream $gzip, $sourceHasher, 'Write'
            }

            if ($InputData -is [System.IO.Stream]) {
                Copy-Stream $InputData $sink
            }
            else {
                $writer = New-Object System.IO.StreamWriter $sink
                foreach ($item in $InputData) {
                    if ($null -eq $item) { $writer.WriteLine() }
                    else { $writer.WriteLine($item.ToString()) }
                }
                # Push buffered text into the compressor before it is closed;
                # without this short inputs never reach the file.
                $writer.Flush()
            }

            if ($Verify) {
                $sink.FlushFinalBlock()
                $sourceHash = [System.BitConverter]::ToString($sourceHasher.Hash)
            }
        }
        catch { throw }
        finally {
            # Dispose outermost first; disposing an already closed inner
            # stream is harmless. Closing here writes the gzip footer and
            # releases the exclusive file handle before any read-back.
            foreach ($resource in @($writer, $sink, $gzip, $fileStream, $sourceHasher)) {
                if ($resource) { $resource.Dispose() }
            }
        }

        if ($Verify) {
            $readBack   = $null
            $destHasher = $null
            $destHash   = ''
            try {
                # Decompress the finished file and hash what comes out.
                $readBack   = Get-GzipContent -ReturnStream -FilePath $destPath
                $destHasher = [System.Security.Cryptography.SHA256]::Create()
                $destHash   = [System.BitConverter]::ToString($destHasher.ComputeHash($readBack))
            }
            catch { throw }
            finally {
                if ($readBack)   { $readBack.Dispose() }
                if ($destHasher) { $destHasher.Dispose() }
            }

            return "Verified: $FilePath - $($sourceHash -eq $destHash)"
        }
    }
}
Export-ModuleMember -Function Out-GzipFile
<#
.SYNOPSIS
    Expands a GZip compressed file into an uncompressed file.
.DESCRIPTION
    Opens FilePath through Get-GzipContent -ReturnStream and copies the
    decompressed data into OutFile with Copy-Stream. Both streams are closed
    when the copy finishes or fails.
.PARAMETER FilePath
    Path of the GZip compressed source file.
.PARAMETER OutFile
    Path of the file to create. Relative paths are resolved against the
    current PowerShell location.
.PARAMETER Force
    Overwrite OutFile if it already exists.
.EXAMPLE
    Decompress-GzipFile -FilePath .\Example.txt.gz -OutFile .\Example.txt -Force
#>
function Decompress-GzipFile {
    [CmdletBinding()]
    [Alias()]
    [OutputType([void])]
    Param
    (
        [Parameter(Mandatory=$true,
                   Position=0)]
        [string]$FilePath,

        [Parameter(Mandatory=$true,
                   Position=1)]
        [string]$OutFile,

        [switch]$Force
    )

    process {
        # Resolve every relative form (".\x", "x", "sub\x") against the
        # PowerShell location; .NET would use the process working directory.
        $targetPath = $PSCmdlet.GetUnresolvedProviderPathFromPSPath($OutFile)

        # Check the destination before opening anything.
        if ((Test-Path -LiteralPath $targetPath) -and -not $Force) {
            throw "File exists and -Force switch not set."
        }

        $src = $null
        $out = $null
        try {
            $src = Get-GzipContent -FilePath $FilePath -ReturnStream
            $out = [System.IO.File]::Create($targetPath)
            Write-Host "Decompressing: $FilePath -> $targetPath"
            Copy-Stream -SrcStream $src -DestStream $out
        }
        catch { throw }
        finally {
            if ($out) { $out.Close() }
            if ($src) { $src.Close() }
        }
    }
}
Export-ModuleMember -Function Decompress-GzipFile
<#
.Synopsis
    Read in data from a GZip compressed file.
.DESCRIPTION
    Opens the file, wraps it in a decompressing GZipStream and either emits
    the decompressed text line by line or, with -ReturnStream, hands back the
    GZipStream itself.
.PARAMETER FilePath
    Path of the GZip compressed file; it is passed through Resolve-Path.
.PARAMETER ReturnStream
    Return the decompressing stream instead of lines of text. The caller is
    responsible for closing it; it is not closed or disposed here.
.EXAMPLE
    Get-GzipContent -FilePath ".\NYSE-2000-2001.tsv.gz" | ConvertFrom-Csv -Delimiter "`t"
.EXAMPLE
    Get-GzipContent -FilePath ".\NYSE-2000-2001.tsv.gz" -ReturnStream
#>
function Get-GzipContent
{
    [CmdletBinding()]
    [Alias()]
    [OutputType([int])]
    Param
    (
        [Parameter(Mandatory=$true,
                   Position=0)]
        $FilePath,

        [switch]$ReturnStream
    )

    Process
    {
        $resolved   = Resolve-Path $FilePath
        $compressed = [IO.File]::OpenRead($resolved.Path)
        $inflated   = New-Object IO.Compression.GZipStream $compressed, ([System.IO.Compression.CompressionMode]::Decompress)

        if (-not $ReturnStream) {
            # Text mode: emit each line, then dispose the reader when done.
            $reader = New-Object IO.StreamReader $inflated
            try {
                while ($null -ne ($line = $reader.ReadLine())) {
                    $line
                }
            }
            catch { throw $_ }
            finally { $reader.Dispose() }
        }
        else {
            Write-Debug "Returned stream does not close or dispose automatically."
            return $inflated
        }
    }
}
Export-ModuleMember -Function Get-GzipContent
<#
.Synopsis
    Copy an existing file into a GZip compressed file.
.DESCRIPTION
    Copies an existing file (or an open Stream) into a GZip compressed file via
    Out-GzipFile, with the option to overwrite an existing destination and/or
    remove the source file. Good for archiving existing files.

    The source file is removed only when -Verify was requested and the status
    reported by Out-GzipFile ends in "- True".
.PARAMETER Source
    Source Stream, FileInfo object, or file path. Relative paths are resolved
    against the current PowerShell location. A Stream passed here is closed
    when processing finishes.
.PARAMETER Destination
    Destination GZip file. Defaults to "<source path>.gz" when Source is a
    file; required when Source is a Stream.
.PARAMETER RemoveSource
    Delete the source file once it has been compressed and verified. Ignored
    (with a warning) unless -Verify is also given.
.PARAMETER Force
    Overwrite destination if it exists.
.PARAMETER Verify
    Compute SHA256 checksum of incoming data and verify against destination.
.OUTPUTS
    With -Verify, an object with Source, Destination and VerifySuccess
    properties; VerifySuccess carries the status string from Out-GzipFile.
.EXAMPLE
    Get-ChildItem *.txt | Copy-ToGzipFile -RemoveSource -Verify
.EXAMPLE
    '.\BigCsvFile.csv' | Copy-ToGzipFile -Destination .\LittleCsvFile.csv.gz
#>
function Copy-ToGzipFile
{
    [CmdletBinding()]
    [Alias()]
    [OutputType([psobject])]
    Param
    (
        # Source file stream or file path
        [Parameter(Mandatory=$true,
                   ValueFromPipeline=$true,
                   ValueFromPipelineByPropertyName=$true,
                   Position=0)]
        $Source,

        # Destination GZip file
        [Parameter(Mandatory=$false,
                   ValueFromPipelineByPropertyName=$true,
                   Position=1)]
        [string]$Destination,

        # Check source and destination data with SHA256 checksum
        [switch]$Verify,

        # Delete source file if completed
        [switch]$RemoveSource,

        # Overwrite destination if exists
        [switch]$Force
    )

    Begin {
        if ($RemoveSource -and -not $Verify) {
            Write-Warning "RemoveSource must be used with data validation or it may result in data loss."
            $RemoveSource = $false
        }
    }

    Process
    {
        # -is follows the whole inheritance chain, so Stream subclasses at any
        # depth are recognised.
        $sourceIsStream = $Source -is [System.IO.Stream]
        $srcPath = ""

        if ($sourceIsStream) {
            if ([String]::IsNullOrEmpty($Destination)) {
                throw "Source is Stream and destination not specified, cannot infer destination path."
            }
        }
        else {
            if ($Source -is [System.IO.FileSystemInfo]) {
                $srcPath = $Source.FullName
            }
            else {
                # Resolve against the PowerShell location so the path checked
                # here is the same path .NET opens below.
                $srcPath = $PSCmdlet.GetUnresolvedProviderPathFromPSPath([string]$Source)
            }
            if (-not (Test-Path -LiteralPath $srcPath)) {
                throw [System.IO.FileNotFoundException] "Cannot find `$Source: $srcPath"
            }
        }

        $destPath = $Destination
        if ([String]::IsNullOrEmpty($destPath)) {
            $destPath = $srcPath + '.gz'
        }

        Write-Verbose "Source file: $srcPath"
        Write-Verbose "Creating compressed file: $destPath"

        $result   = $null
        $verified = $false
        $inStream = $null
        try {
            if ($sourceIsStream) { $inStream = $Source }
            else { $inStream = [System.IO.File]::OpenRead($srcPath) }

            $result = Out-GzipFile -InputData $inStream -FilePath $destPath -Force:$Force -Verify:$Verify

            if ($Verify -and ($null -ne $result)) {
                # Out-GzipFile reports "Verified: <path> - True|False". The
                # string is always truthy, so inspect its ending explicitly.
                $verified = ([string]$result).EndsWith(' - True')
                New-Object PSObject -Property @{"Source"=$srcPath;"Destination"=$destPath;"VerifySuccess"=$result}
            }
        }
        catch { throw }
        finally {
            if ($inStream) { $inStream.Close() }
        }

        if ($RemoveSource -and $sourceIsStream) {
            Write-Warning "Input type is Stream and `$RemoveSource flag used, cannot infer source file from Stream type."
        }
        elseif ($RemoveSource -and $verified) {
            Write-Verbose 'Checksum verified, removing source file.'
            Remove-Item -LiteralPath $srcPath -Force:$Force
        }
        elseif ($RemoveSource) {
            Write-Warning "Checksum verification failed, source file was not removed: $srcPath"
        }

        Write-Verbose "Complete"
    }
}
Export-ModuleMember -Function Copy-ToGzipFile
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment