Last active
June 17, 2022 19:06
-
-
Save kgriffs/50162244300d855aecb355067a9b667d to your computer and use it in GitHub Desktop.
Python gzip and lz4 streaming compression example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gzip | |
import io | |
import lz4.frame | |
_DATA_CHUNKS = [ | |
''' | |
<div class="section" id="simple-usage"> | |
<h2>Simple usage<a class="headerlink" href="#simple-usage" title="Permalink to this headline">¶</a></h2> | |
<p>The recommended binding to use is the LZ4 frame format binding, since this | |
provides interoperability with other implementations and language bindings.</p> | |
<p>The simplest way to use the frame bindings is via the <a class="reference internal" href="lz4.frame.html#lz4.frame.compress" title="lz4.frame.compress"><code class="xref py py-func docutils literal notranslate"><span class="pre">compress()</span></code></a> and | |
<a class="reference internal" href="lz4.frame.html#lz4.frame.decompress" title="lz4.frame.decompress"><code class="xref py py-func docutils literal notranslate"><span class="pre">decompress()</span></code></a> functions:</p> | |
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="kn">import</span> <span class="nn">os</span> | |
<span class="gp">>>> </span><span class="kn">import</span> <span class="nn">lz4.frame</span> | |
<span class="gp">>>> </span><span class="n">input_data</span> <span class="o">=</span> <span class="mi">20</span> <span class="o">*</span> <span class="mi">128</span> <span class="o">*</span> <span class="n">os</span><span class="o">.</span><span class="n">urandom</span><span class="p">(</span><span class="mi">1024</span><span class="p">)</span> <span class="c1"># Read 20 * 128kb</span> | |
<span class="gp">>>> </span><span class="n">compressed</span> <span class="o">=</span> <span class="n">lz4</span><span class="o">.</span><span class="n">frame</span><span class="o">.</span><span class="n">compress</span><span class="p">(</span><span class="n">input_data</span><span class="p">)</span> | |
<span class="gp">>>> </span><span class="n">decompressed</span> <span class="o">=</span> <span class="n">lz4</span><span class="o">.</span><span class="n">frame</span><span class="o">.</span><span class="n">decompress</span><span class="p">(</span><span class="n">compressed</span><span class="p">)</span> | |
<span class="gp">>>> </span><span class="n">decompressed</span> <span class="o">==</span> <span class="n">input_data</span> | |
<span class="go">True</span> | |
</pre></div> | |
</div> | |
''', | |
''' | |
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="kn">import</span> <span class="nn">lz4.frame</span> | |
<span class="gp">>>> </span><span class="kn">import</span> <span class="nn">os</span> | |
<span class="gp">>>> </span><span class="n">input_data</span> <span class="o">=</span> <span class="mi">20</span> <span class="o">*</span> <span class="mi">128</span> <span class="o">*</span> <span class="n">os</span><span class="o">.</span><span class="n">urandom</span><span class="p">(</span><span class="mi">1024</span><span class="p">)</span> | |
<span class="gp">>>> </span><span class="n">c_context</span> <span class="o">=</span> <span class="n">lz4</span><span class="o">.</span><span class="n">frame</span><span class="o">.</span><span class="n">create_compression_context</span><span class="p">()</span> | |
<span class="gp">>>> </span><span class="n">compressed</span> <span class="o">=</span> <span class="n">lz4</span><span class="o">.</span><span class="n">frame</span><span class="o">.</span><span class="n">compress_begin</span><span class="p">(</span><span class="n">c_context</span><span class="p">)</span> | |
<span class="gp">>>> </span><span class="n">compressed</span> <span class="o">+=</span> <span class="n">lz4</span><span class="o">.</span><span class="n">frame</span><span class="o">.</span><span class="n">compress_chunk</span><span class="p">(</span><span class="n">c_context</span><span class="p">,</span> <span class="n">input_data</span><span class="p">[:</span><span class="mi">10</span> <span class="o">*</span> <span class="mi">128</span> <span class="o">*</span> <span class="mi">1024</span><span class="p">])</span> | |
<span class="gp">>>> </span><span class="n">compressed</span> <span class="o">+=</span> <span class="n">lz4</span><span class="o">.</span><span class="n">frame</span><span class="o">.</span><span class="n">compress_chunk</span><span class="p">(</span><span class="n">c_context</span><span class="p">,</span> <span class="n">input_data</span><span class="p">[</span><span class="mi">10</span> <span class="o">*</span> <span class="mi">128</span> <span class="o">*</span> <span class="mi">1024</span><span class="p">:])</span> | |
<span class="gp">>>> </span><span class="n">compressed</span> <span class="o">+=</span> <span class="n">lz4</span><span class="o">.</span><span class="n">frame</span><span class="o">.</span><span class="n">compress_flush</span><span class="p">(</span><span class="n">c_context</span><span class="p">)</span> | |
</pre></div> | |
</div> | |
''', | |
''' | |
p>Here a compression context is first created which is used to maintain state | |
across calls to the LZ4 library. This is an opaque PyCapsule object. | |
<a class="reference internal" href="lz4.frame.html#lz4.frame.compress_begin" title="lz4.frame.compress_begin"><code class="xref py py-func docutils literal notranslate"><span class="pre">compress_begin()</span></code></a> starts a new frame and returns the frame header. | |
<a class="reference internal" href="lz4.frame.html#lz4.frame.compress_chunk" title="lz4.frame.compress_chunk"><code class="xref py py-func docutils literal notranslate"><span class="pre">compress_chunk()</span></code></a> compresses input data and returns the compressed data. | |
<a class="reference internal" href="lz4.frame.html#lz4.frame.compress_flush" title="lz4.frame.compress_flush"><code class="xref py py-func docutils literal notranslate"><span class="pre">compress_flush()</span></code></a> ends the frame and returns the frame end marker. The | |
data returned from these functions is catenated to form the compressed frame.</p> | |
<p><a class="reference internal" href="lz4.frame.html#lz4.frame.compress_flush" title="lz4.frame.compress_flush"><code class="xref py py-func docutils literal notranslate"><span class="pre">compress_flush()</span></code></a> also flushes any buffered data; by default, | |
<a class="reference internal" href="lz4.frame.html#lz4.frame.compress_chunk" title="lz4.frame.compress_chunk"><code class="xref py py-func docutils literal notranslate"><span class="pre">compress_chunk()</span></code></a> may buffer data until a block is full. This buffering | |
can be disabled by specifying <code class="docutils literal notranslate"><span class="pre">auto_flush=True</span></code> when calling | |
<a class="reference internal" href="lz4.frame.html#lz4.frame.compress_begin" title="lz4.frame.compress_begin"><code class="xref py py-func docutils literal notranslate"><span class="pre">compress_begin()</span></code></a>. Alternatively, the LZ4 buffers can be flushed at any | |
time without ending the frame by calling <a class="reference internal" href="lz4.frame.html#lz4.frame.compress_flush" title="lz4.frame.compress_flush"><code class="xref py py-func docutils literal notranslate"><span class="pre">compress_flush()</span></code></a> with | |
<code class="docutils literal notranslate"><span class="pre">end_frame=False</span></code>.</p> | |
<p>Decompressing data can also be done in a chunked fashion:</p> | |
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">d_context</span> <span class="o">=</span> <span class="n">lz4</span><span class="o">.</span><span class="n">frame</span><span class="o">.</span><span class="n">create_decompression_context</span><span class="p">()</span> | |
<span class="gp">>>> </span><span class="n">d1</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">e</span> <span class="o">=</span> <span class="n">lz4</span><span class="o">.</span><span class="n">frame</span><span class="o">.</span><span class="n">decompress_chunk</span><span class="p">(</span><span class="n">d_context</span><span class="p">,</span> <span class="n">compressed</span><span class="p">[:</span><span class="nb">len</span><span class="p">(</span><span class="n">compressed</span><span class="p">)</span><span class="o">//</span><span class="mi">2</span><span class="p">])</span> | |
<span class="gp">>>> </span><span class="n">d2</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">e</span> <span class="o">=</span> <span class="n">lz4</span><span class="o">.</span><span class="n">frame</span><span class="o">.</span><span class="n">decompress_chunk</span><span class="p">(</span><span class="n">d_context</span><span class="p">,</span> <span class="n">compressed</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="n">compressed</span><span class="p">)</span><span class="o">//</span><span class="mi">2</span><span class="p">:])</span> | |
<span class="gp">>>> </span><span class="n">d1</span> <span class="o">+</span> <span class="n">d2</span> <span class="o">==</span> <span class="n">input_data</span> | |
<span class="go">True</span> | |
</pre></div> | |
</div> | |
''', | |
''' | |
<div class="section" id="controlling-the-compression"> | |
<h2>Controlling the compression<a class="headerlink" href="#controlling-the-compression" title="Permalink to this headline">¶</a></h2> | |
<p>Beyond the basic usage described above, there are a number of keyword arguments | |
to tune and control the compression. A few of the key ones are listed below, | |
please see the documentation for full details of options.</p> | |
<div class="section" id="controlling-the-compression-level"> | |
<h3>Controlling the compression level<a class="headerlink" href="#controlling-the-compression-level" title="Permalink to this headline">¶</a></h3> | |
<p>The <code class="docutils literal notranslate"><span class="pre">compression_level</span></code> argument specifies the level of compression used with | |
0 (default) being the lowest compression (0-2 are the same value), and 16 the | |
highest compression. Values below 0 will enable “fast acceleration”, | |
proportional to the value. Values above 16 will be treated as 16. The following | |
module constants are provided as a convenience:</p> | |
<ul class="simple"> | |
<li><a class="reference internal" href="lz4.frame.html#lz4.frame.COMPRESSIONLEVEL_MIN" title="lz4.frame.COMPRESSIONLEVEL_MIN"><code class="xref py py-obj docutils literal notranslate"><span class="pre">lz4.frame.COMPRESSIONLEVEL_MIN</span></code></a>: Minimum compression (0, default)</li> | |
<li><a class="reference internal" href="lz4.frame.html#lz4.frame.COMPRESSIONLEVEL_MINHC" title="lz4.frame.COMPRESSIONLEVEL_MINHC"><code class="xref py py-obj docutils literal notranslate"><span class="pre">lz4.frame.COMPRESSIONLEVEL_MINHC</span></code></a>: Minimum high-compression mode (3)</li> | |
<li><a class="reference internal" href="lz4.frame.html#lz4.frame.COMPRESSIONLEVEL_MAX" title="lz4.frame.COMPRESSIONLEVEL_MAX"><code class="xref py py-obj docutils literal notranslate"><span class="pre">lz4.frame.COMPRESSIONLEVEL_MAX</span></code></a>: Maximum compression (16)</li> | |
</ul> | |
<p>Availability: <a class="reference internal" href="lz4.frame.html#lz4.frame.compress" title="lz4.frame.compress"><code class="xref py py-func docutils literal notranslate"><span class="pre">lz4.frame.compress()</span></code></a>, | |
<a class="reference internal" href="lz4.frame.html#lz4.frame.compress_begin" title="lz4.frame.compress_begin"><code class="xref py py-func docutils literal notranslate"><span class="pre">lz4.frame.compress_begin()</span></code></a>, <a class="reference internal" href="lz4.frame.html#lz4.frame.open" title="lz4.frame.open"><code class="xref py py-func docutils literal notranslate"><span class="pre">lz4.frame.open()</span></code></a>, | |
<a class="reference internal" href="lz4.frame.html#lz4.frame.LZ4FrameCompressor" title="lz4.frame.LZ4FrameCompressor"><code class="xref py py-class docutils literal notranslate"><span class="pre">lz4.frame.LZ4FrameCompressor</span></code></a>, <a class="reference internal" href="lz4.frame.html#lz4.frame.LZ4FrameFile" title="lz4.frame.LZ4FrameFile"><code class="xref py py-class docutils literal notranslate"><span class="pre">lz4.frame.LZ4FrameFile</span></code></a>.</p> | |
</div> | |
<div class="section" id="controlling-the-block-size"> | |
<h3>Controlling the block size<a class="headerlink" href="#controlling-the-block-size" title="Permalink to this headline">¶</a></h3> | |
<p>The <code class="docutils literal notranslate"><span class="pre">block_size</span></code> argument specifies the maximum block size to use for the | |
blocks in a frame. Options:</p> | |
<ul class="simple"> | |
<li><a class="reference internal" href="lz4.frame.html#lz4.frame.BLOCKSIZE_DEFAULT" title="lz4.frame.BLOCKSIZE_DEFAULT"><code class="xref py py-obj docutils literal notranslate"><span class="pre">lz4.frame.BLOCKSIZE_DEFAULT</span></code></a> or 0: the lz4 library default</li> | |
<li><a class="reference internal" href="lz4.frame.html#lz4.frame.BLOCKSIZE_MAX64KB" title="lz4.frame.BLOCKSIZE_MAX64KB"><code class="xref py py-obj docutils literal notranslate"><span class="pre">lz4.frame.BLOCKSIZE_MAX64KB</span></code></a> or 4: 64 kB</li> | |
<li><a class="reference internal" href="lz4.frame.html#lz4.frame.BLOCKSIZE_MAX256KB" title="lz4.frame.BLOCKSIZE_MAX256KB"><code class="xref py py-obj docutils literal notranslate"><span class="pre">lz4.frame.BLOCKSIZE_MAX256KB</span></code></a> or 5: 256 kB</li> | |
<li><a class="reference internal" href="lz4.frame.html#lz4.frame.BLOCKSIZE_MAX1MB" title="lz4.frame.BLOCKSIZE_MAX1MB"><code class="xref py py-obj docutils literal notranslate"><span class="pre">lz4.frame.BLOCKSIZE_MAX1MB</span></code></a> or 6: 1 MB</li> | |
<li><a class="reference internal" href="lz4.frame.html#lz4.frame.BLOCKSIZE_MAX4MB" title="lz4.frame.BLOCKSIZE_MAX4MB"><code class="xref py py-obj docutils literal notranslate"><span class="pre">lz4.frame.BLOCKSIZE_MAX4MB</span></code></a> or 7: 4 MB</li> | |
</ul> | |
''', | |
] | |
def test_gzip():
    """Gzip-compress the sample chunks into a single in-memory stream.

    Each entry of ``_DATA_CHUNKS`` is encoded to bytes and written, one
    chunk per ``write`` call, to a ``GzipFile`` backed by a ``BytesIO``.

    Returns:
        The full contents of the in-memory buffer as ``bytes``.
    """
    sink = io.BytesIO()
    encoded_chunks = (text.encode() for text in _DATA_CHUNKS)

    with gzip.GzipFile(filename='foo', mode='wb', fileobj=sink) as writer:
        for payload in encoded_chunks:
            writer.write(payload)

    # The buffer is read back only after the ``with`` block has exited.
    return sink.getvalue()
def test_lz4():
    """LZ4-compress the sample chunks into a single frame, piece by piece.

    The output is assembled in call order from the value returned by
    ``begin()``, one ``compress()`` result per entry of ``_DATA_CHUNKS``,
    and finally the value returned by ``flush()``.

    Returns:
        The concatenation of all pieces as ``bytes``.
    """
    pieces = []

    with lz4.frame.LZ4FrameCompressor() as lz4_compressor:
        pieces.append(lz4_compressor.begin())
        # One compress() call per chunk, in the order the chunks are listed.
        pieces.extend(
            lz4_compressor.compress(text.encode()) for text in _DATA_CHUNKS
        )
        pieces.append(lz4_compressor.flush())

    return b"".join(pieces)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment