Last active
January 31, 2018 22:58
-
-
Save random82/b9233701d78b046d7c24a603854bc3a4 to your computer and use it in GitHub Desktop.
U-SQL UDO Zip Extractor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Microsoft.Analytics.Interfaces; | |
using System.Collections.Generic; | |
using System.IO; | |
namespace DocExtraction.Udo | |
{ | |
/// <summary>
/// U-SQL user-defined extractor that unpacks a zip archive and emits one
/// output row per archive entry.
/// </summary>
/// <remarks>
/// The extractor is declared with <c>AtomicFileProcessing = true</c>; a zip
/// archive has to be read as a whole, so the input must not be split.
/// Each row carries three columns, addressed by ordinal: the entry name,
/// the entry content, and a flag telling whether the content is text.
/// </remarks>
[SqlUserDefinedExtractor(AtomicFileProcessing = true)]
public class ZipExtractor : IExtractor
{
    private readonly bool _textMode;

    /// <summary>
    /// Creates the extractor.
    /// </summary>
    /// <param name="textMode">
    /// Forwarded unchanged to <c>ZipExtractorImpl</c>, which decides how
    /// entry content is represented.
    /// </param>
    public ZipExtractor(bool textMode = false)
    {
        _textMode = textMode;
    }

    /// <summary>
    /// Buffers the whole input in memory, then yields one row for every
    /// result produced by <c>ZipExtractorImpl.Extract</c>.
    /// </summary>
    /// <param name="input">Reader whose <c>BaseStream</c> holds the zip archive.</param>
    /// <param name="output">Row that is filled in and returned (read-only) for each entry.</param>
    /// <returns>A lazily evaluated sequence of rows, one per extracted entry.</returns>
    public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
    {
        // Ordinals of the output columns.
        const int fileNameColumn = 0;
        const int contentColumn = 1;
        const int isTextColumn = 2;

        using (var archiveBytes = new MemoryStream())
        {
            // Copy the input into a seekable in-memory stream before handing it on.
            input.BaseStream.CopyTo(archiveBytes);

            var unzipper = new ZipExtractorImpl(_textMode);
            foreach (var item in unzipper.Extract(archiveBytes))
            {
                output.Set(fileNameColumn, item.FileName);
                output.Set(contentColumn, item.Content);
                output.Set(isTextColumn, item.Text);

                yield return output.AsReadOnly();
            }
        }
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.IO.Compression; | |
namespace DocExtraction.Udo | |
{ | |
/// <summary>
/// Reads a zip archive from a stream and returns each entry as a
/// <see cref="ZipExtractorResult"/>.
/// </summary>
public class ZipExtractorImpl
{
    // Set once in the constructor; selects text or Base64 output.
    private readonly bool _textMode;

    /// <summary>
    /// Creates the extractor implementation.
    /// </summary>
    /// <param name="textMode">
    /// <c>true</c> to return entry content as decoded text;
    /// <c>false</c> (default) to return it Base64-encoded.
    /// </param>
    public ZipExtractorImpl(bool textMode = false)
    {
        _textMode = textMode;
    }

    /// <summary>
    /// Enumerates the entries of the zip archive contained in <paramref name="stream"/>.
    /// </summary>
    /// <param name="stream">Stream holding the zip archive. It is disposed together with the archive once enumeration finishes.</param>
    /// <returns>
    /// A lazily evaluated sequence with one result per entry that has compressed data.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
    public IEnumerable<ZipExtractorResult> Extract(Stream stream)
    {
        // Validate eagerly: inside an iterator the check would only run on
        // the first MoveNext, far away from the faulty call site.
        if (stream == null)
        {
            throw new ArgumentNullException("stream");
        }

        return ExtractEntries(stream);
    }

    /// <summary>Iterator that opens the archive and converts each entry.</summary>
    private IEnumerable<ZipExtractorResult> ExtractEntries(Stream stream)
    {
        using (var package = new ZipArchive(stream, ZipArchiveMode.Read))
        {
            foreach (var entry in package.Entries)
            {
                // Entries without compressed data (such as directory entries) are skipped.
                if (entry.CompressedLength == 0)
                {
                    continue;
                }

                using (var entryStream = entry.Open())
                {
                    yield return new ZipExtractorResult
                    {
                        FileName = entry.FullName,
                        Content = _textMode ? ReadText(entryStream) : ReadBase64(entryStream),
                        Text = _textMode
                    };
                }
            }
        }
    }

    /// <summary>
    /// Reads <paramref name="input"/> to the end and decodes it with the
    /// default <see cref="StreamReader"/> settings.
    /// </summary>
    private static string ReadText(Stream input)
    {
        using (var ms = new MemoryStream())
        {
            // Buffer the whole entry first so decoding runs over a complete, seekable copy.
            input.CopyTo(ms);
            ms.Position = 0;

            using (var sr = new StreamReader(ms))
            {
                return sr.ReadToEnd();
            }
        }
    }

    /// <summary>
    /// Reads <paramref name="input"/> to the end and returns its bytes as a Base64 string.
    /// </summary>
    private static string ReadBase64(Stream input)
    {
        using (var ms = new MemoryStream())
        {
            input.CopyTo(ms);
            return Convert.ToBase64String(ms.ToArray());
        }
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace DocExtraction.Udo | |
{ | |
/// <summary>
/// Carries the data extracted from a single zip archive entry.
/// </summary>
public class ZipExtractorResult
{
    /// <summary>Name of the entry; filled from the entry's full name inside the archive.</summary>
    public string FileName { get; set; }

    /// <summary>Entry content, either as decoded text or as a Base64 string (see <see cref="Text"/>).</summary>
    public string Content { get; set; }

    /// <summary>
    /// <c>true</c> when <see cref="Content"/> holds decoded text,
    /// <c>false</c> when it holds Base64-encoded bytes.
    /// </summary>
    public bool Text { get; set; }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment