Created
June 4, 2024 19:28
-
-
Save klinkby/b3b9d2265153c2bfdd2e280bb9acd838 to your computer and use it in GitHub Desktop.
Generate an extract of an XML document, reducing size by skipping elements with given name after a given number has been seen.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.IO; | |
using System.Text; | |
using System.Text.RegularExpressions; | |
using System.Threading; | |
using System.Threading.Tasks; | |
using System.Xml; | |
using System.Diagnostics.CodeAnalysis; | |
/// <summary>
/// Generate an extract of an XML document, reducing size by skipping elements with given name
/// after a given number has been seen. Original schema will be retained.
/// </summary>
/// <remarks>
/// Elements, attributes, text and CDATA sections are copied from the reader to the writer.
/// Once the element-name pattern has matched more than the allowed number of times, each
/// further matching element is dropped together with its whole subtree; copying resumes
/// with the node that follows the dropped element.
/// </remarks>
public sealed class XmlReducer : IAsyncDisposable, IDisposable
{
    private const string Xmlns = "xmlns";

    /// <summary>Value of <see cref="_skipDepth"/> while no subtree is being skipped.</summary>
    private const int NotSkipping = -1;

    private const RegexOptions TriggerRegexOptions = RegexOptions.Compiled
        | RegexOptions.CultureInvariant
        | RegexOptions.ExplicitCapture
        | RegexOptions.NonBacktracking;

    private static readonly XmlReaderSettings ReaderSettings = new()
    {
        Async = true,
        CloseInput = false,
        IgnoreComments = true,
        IgnoreProcessingInstructions = true,
        IgnoreWhitespace = true,
        ConformanceLevel = ConformanceLevel.Document,
        ValidationType = ValidationType.None
    };

    private static readonly XmlWriterSettings WriterSettings = new()
    {
        Async = true,
        Encoding = new UTF8Encoding(false),
        CheckCharacters = false,
        CloseOutput = false,
        ConformanceLevel = ConformanceLevel.Document,
        WriteEndDocumentOnClose = false
    };

    private static readonly TimeSpan RegexMatchTimeout = TimeSpan.FromSeconds(1);

    private readonly XmlReader _reader;
    private readonly XmlWriter _writer;
    private readonly Regex _triggerRegex;

    /// <summary>Depth of the element currently being dropped, or <see cref="NotSkipping"/>.</summary>
    private int _skipDepth = NotSkipping;

    /// <summary>How many more matching elements may still be copied before skipping starts.</summary>
    private int _remainingTriggers;

    /// <summary>
    /// Create a parser context for reducing an XML document.
    /// </summary>
    /// <param name="largeXmlDocument">Stream to read from</param>
    /// <param name="reducedDocument">Stream to write to</param>
    /// <param name="triggerRegexPattern">Element name pattern to look for</param>
    /// <param name="maxTriggerCount">Max occurrences of the element before skipping</param>
    /// <exception cref="ArgumentNullException">Thrown if a stream or the pattern is null</exception>
    /// <exception cref="ArgumentException">
    /// Thrown if both streams are the same instance, the input is not readable,
    /// the output is not writable, or the pattern is not a valid regular expression
    /// </exception>
    public XmlReducer(
        Stream largeXmlDocument,
        Stream reducedDocument,
        [StringSyntax(StringSyntaxAttribute.Regex)]
        string triggerRegexPattern,
        int maxTriggerCount = 1)
        : this(
            CreateReader(largeXmlDocument, reducedDocument),
            XmlWriter.Create(reducedDocument, WriterSettings),
            new Regex(triggerRegexPattern, TriggerRegexOptions, RegexMatchTimeout),
            maxTriggerCount)
    {
    }

    /// <summary>
    /// Create a parser context for reducing an XML document.
    /// </summary>
    /// <param name="reader">Read large document</param>
    /// <param name="writer">Written reduced document</param>
    /// <param name="triggerRegex">Element name pattern to look for</param>
    /// <param name="maxTriggerCount">Max occurrences of the element before skipping</param>
    /// <exception cref="ArgumentNullException">Thrown if any reference argument is null</exception>
    /// <exception cref="ArgumentException">Thrown if the reader has already been advanced</exception>
    public XmlReducer(XmlReader reader, XmlWriter writer, Regex triggerRegex, int maxTriggerCount = 1)
    {
        ArgumentNullException.ThrowIfNull(reader);
        ArgumentNullException.ThrowIfNull(writer);
        ArgumentNullException.ThrowIfNull(triggerRegex);
        if (reader.ReadState != ReadState.Initial)
        {
            throw new ArgumentException("Reader must be in initial state", nameof(reader));
        }

        _reader = reader;
        _writer = writer;
        _triggerRegex = triggerRegex;
        _remainingTriggers = maxTriggerCount;
    }

    /// <summary>
    /// Read all of input Xml document, copying part of it to the output
    /// </summary>
    /// <remarks>
    /// When cancellation is requested the loop stops before the next node is read, the
    /// writer is flushed and the method returns normally, so the output may be incomplete.
    /// </remarks>
    /// <param name="cancellationToken">A <see cref="CancellationToken" /> used to cancel the operation</param>
    /// <exception cref="XmlException">Thrown if input is not XML, e.g. if elements are not closed</exception>
    public async Task ParseAsync(CancellationToken cancellationToken = default)
    {
        while (!cancellationToken.IsCancellationRequested
               && await _reader.ReadAsync().ConfigureAwait(false))
        {
            if (_skipDepth != NotSkipping)
            {
                // Everything up to and including the end tag of the dropped element is
                // discarded; that end tag is the EndElement reported at the recorded depth.
                if (_reader.NodeType == XmlNodeType.EndElement && _reader.Depth == _skipDepth)
                {
                    _skipDepth = NotSkipping;
                }

                continue;
            }

            switch (_reader.NodeType)
            {
                case XmlNodeType.Element:
                    await WriteElementAsync().ConfigureAwait(false);
                    break;
                case XmlNodeType.Text:
                    await _writer.WriteStringAsync(_reader.Value).ConfigureAwait(false);
                    break;
                case XmlNodeType.CDATA:
                    await _writer.WriteCDataAsync(_reader.Value).ConfigureAwait(false);
                    break;
                case XmlNodeType.EndElement:
                    await _writer.WriteEndElementAsync().ConfigureAwait(false);
                    break;
                default:
                    // Declarations, whitespace and other node types are not copied.
                    break;
            }
        }

        await _writer.FlushAsync().ConfigureAwait(false);
    }

    /// <summary>Dispose the writer, then the reader.</summary>
    public void Dispose()
    {
        _writer.Dispose();
        _reader.Dispose();
    }

    /// <summary>Asynchronously dispose the writer, then dispose the reader.</summary>
    public async ValueTask DisposeAsync()
    {
        await _writer.DisposeAsync().ConfigureAwait(false);
        _reader.Dispose();
    }

    /// <summary>
    /// Validate both streams before anything is created from them, then open the reader.
    /// </summary>
    private static XmlReader CreateReader(Stream largeXmlDocument, Stream reducedDocument)
    {
        ArgumentNullException.ThrowIfNull(largeXmlDocument);
        ArgumentNullException.ThrowIfNull(reducedDocument);
        if (ReferenceEquals(largeXmlDocument, reducedDocument))
        {
            throw new ArgumentException("Streams must be different", nameof(reducedDocument));
        }

        if (!largeXmlDocument.CanRead)
        {
            throw new ArgumentException("Stream must be readable", nameof(largeXmlDocument));
        }

        if (!reducedDocument.CanWrite)
        {
            throw new ArgumentException("Stream must be writable", nameof(reducedDocument));
        }

        return XmlReader.Create(largeXmlDocument, ReaderSettings);
    }

    /// <summary>
    /// Copy the start tag the reader is positioned on (with its attributes), or start
    /// skipping it when the trigger quota is used up.
    /// </summary>
    private async Task WriteElementAsync()
    {
        // Captured up front, before attribute iteration moves the reader off the element node.
        // An empty element (<a/>) produces no EndElement node, so it has to be closed here.
        bool isEmpty = _reader.IsEmptyElement;

        if (IsSkipTrigger())
        {
            // An empty element has no subtree and no end tag to wait for.
            if (!isEmpty)
            {
                _skipDepth = _reader.Depth;
            }

            return;
        }

        await _writer
            .WriteStartElementAsync(_reader.Prefix, _reader.LocalName, _reader.NamespaceURI)
            .ConfigureAwait(false);

        if (_reader.HasAttributes)
        {
            for (var index = 0; index < _reader.AttributeCount; index++)
            {
                _reader.MoveToAttribute(index);
                if (Xmlns == _reader.Prefix)
                {
                    // Prefixed namespace declaration, e.g. xmlns:foo="...".
                    await _writer
                        .WriteAttributeStringAsync(Xmlns, _reader.LocalName, null, _reader.Value)
                        .ConfigureAwait(false);
                }
                else if (Xmlns != _reader.Name)
                {
                    // Ordinary attribute, written under its qualified name. A bare
                    // xmlns="..." attribute is not copied here.
                    await _writer
                        .WriteAttributeStringAsync(null, _reader.Name, null, _reader.Value)
                        .ConfigureAwait(false);
                }
            }

            _reader.MoveToElement();
        }

        if (isEmpty)
        {
            await _writer.WriteEndElementAsync().ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Test the current element name against the trigger pattern and consume one unit of
    /// the quota on a match.
    /// </summary>
    /// <returns>True when the name matches and the quota was already exhausted.</returns>
    private bool IsSkipTrigger()
    {
        if (!_triggerRegex.IsMatch(_reader.LocalName))
        {
            return false;
        }

        if (_remainingTriggers > 0)
        {
            _remainingTriggers--;
            return false;
        }

        // The counter is left untouched once exhausted, so it can never wrap around.
        return true;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment