Last active
February 8, 2023 12:46
-
-
Save Wimpje/a796ba134d61552587a7 to your computer and use it in GitHub Desktop.
Powershell, split large XML files on node name, with offset support
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<#
.SYNOPSIS
    Splits a large XML file into smaller files, grouping sibling nodes by name.

.DESCRIPTION
    Streams the input with System.Xml.XmlReader, so the whole document is never
    loaded into memory. Each output file is written next to the input as
    "<name>.<n><ext>" and wraps up to $matchesPerSplit matching nodes in a
    <split cnt="..."> root element, where cnt is the absolute chunk index
    (i.e. including skipped chunks).

.PARAMETER file
    Path of the XML file to split (required).

.PARAMETER matchesPerSplit
    Maximum number of matching nodes written to each output file. Must be >= 1.

.PARAMETER maxFiles
    Maximum number of output files to generate.

.PARAMETER splitOnNode
    Qualified name of the element to split on (required).

.PARAMETER offset
    Number of leading chunks (of $matchesPerSplit nodes each) to skip before
    the first output file is written. Output file numbering still starts at 0.
#>
param(
    [string]$file = $(throw "file is required"),
    $matchesPerSplit = 50,
    $maxFiles = [Int32]::MaxValue,
    $splitOnNode = $(throw "splitOnNode is required"),
    $offset = 0
)
# with a little help of https://gist.github.com/awayken/5861923
$ErrorActionPreference = "Stop";

trap {
    $ErrorActionPreference = "Continue"
    # PowerShell escapes use the backtick; "\r\n" would be printed literally.
    write-error "Script failed: $_ `r`n $($_.ScriptStackTrace)"
    exit (1);
}

# Positions the reader on the next sibling element named $nodeName.
# Returns $true when the reader is on such an element, $false when there is none left.
function Move-ToNextMatch($xmlReader, $nodeName) {
    # ReadOuterXml() and Skip() leave the reader on the node that FOLLOWS the element.
    # If siblings are not separated by whitespace, that node already is the next match;
    # calling ReadToNextSibling() from there would jump over it and silently drop it.
    if ($xmlReader.NodeType -eq [System.Xml.XmlNodeType]::Element -and $xmlReader.Name -ceq $nodeName) {
        return $true
    }
    # Already on the parent's end tag: no siblings remain. Do not read past it.
    if ($xmlReader.NodeType -eq [System.Xml.XmlNodeType]::EndElement) {
        return $false
    }
    return $xmlReader.ReadToNextSibling($nodeName)
}

# A chunk size below 1 could never make progress (the original looped forever).
if ($matchesPerSplit -lt 1) {
    throw "matchesPerSplit must be at least 1"
}

$file = (resolve-path $file).path
$fileNameExt = [IO.Path]::GetExtension($file)
$fileNameWithoutExt = [IO.Path]::GetFileNameWithoutExtension($file)
$fileNameDirectory = [IO.Path]::GetDirectoryName($file)

$reader = [System.Xml.XmlReader]::Create($file)
$idx = 0

try {
    "Splitting $file on node name='$splitOnNode', with a max of $matchesPerSplit matches per file. Max of $maxFiles files will be generated."

    $hasMatch = $reader.ReadToFollowing($splitOnNode)

    # Honour $offset: consume whole chunks without writing them, so the first
    # file produced really starts at chunk number $offset.
    while ($hasMatch -and ($idx -lt $offset)) {
        $skipped = 0
        do {
            $reader.Skip()
            $skipped++
            $hasMatch = Move-ToNextMatch $reader $splitOnNode
        } while ($hasMatch -and ($skipped -lt $matchesPerSplit))
        $idx++
    }

    while ($hasMatch -and ($idx -lt $maxFiles + $offset)) {
        # File names are numbered relative to the offset, starting at 0.
        $to = [IO.Path]::Combine($fileNameDirectory, "$fileNameWithoutExt.$($idx -$offset)$fileNameExt")
        "Writing to $to"

        $toXml = New-Object System.Xml.XmlTextWriter($to, $null)
        $toXml.Formatting = 'Indented'
        $toXml.Indentation = 2

        try {
            $toXml.WriteStartElement("split")
            $toXml.WriteAttributeString("cnt", $null, "$idx")

            $matchesCount = 0
            do {
                # Copies the current element verbatim and moves the reader past it.
                $toXml.WriteRaw($reader.ReadOuterXml())
                $matchesCount++
                $hasMatch = Move-ToNextMatch $reader $splitOnNode
            } while ($hasMatch -and ($matchesCount -lt $matchesPerSplit))

            $toXml.WriteEndElement();
        }
        finally {
            $toXml.Flush()
            $toXml.Close()
        }

        $idx++
    }
}
finally {
    $reader.Close()
}
Hi, many thanks for this code, brilliant!
Could you help me with a line of code that extracts the id of the node being written, so that I can use that id in the file name instead of the incremental $idx, please?
Many thanks!
I don't have time to test right now, but the node should be in here:
$reader.ReadOuterXml())
Not sure if you can extract it directly or if you need to load the content into another object first. It needs to be done without moving the reader's position.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I think the script only works correctly on XML with "CR"?
Please compare output of sample1 & sample2