Skip to content

Instantly share code, notes, and snippets.

@jleedev
Last active April 22, 2026 10:56
Show Gist options
  • Select an option

  • Save jleedev/6f89ee8371e08cfb17e71537fbb3f1ab to your computer and use it in GitHub Desktop.

Select an option

Save jleedev/6f89ee8371e08cfb17e71537fbb3f1ab to your computer and use it in GitHub Desktop.
from collections.abc import Buffer
import io
import os
from pathlib import Path
import subprocess
import textwrap
import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.dataset as ds
import matplotlib
import matplotlib.pyplot as plt
def iter_readinto(fh: io.BufferedReader, buf: Buffer):
    """Yield successive chunks of *fh*, reading each one into *buf*.

    Every iteration issues one ``fh.readinto1(buf)`` call; iteration ends
    at the first call that reports a falsy byte count.  A read that fills
    the buffer yields *buf* itself, while a shorter read yields a
    ``memoryview`` limited to the bytes just read.

    The same storage is overwritten by the next read, so a yielded chunk
    is only meaningful until the generator is advanced again.
    """
    while True:
        nread = fh.readinto1(buf)
        if not nread:
            return
        if nread < len(buf):
            # Short read: expose only the freshly written prefix.
            yield memoryview(buf)[:nread]
        else:
            yield buf
def iter_frames(p):
    """Yield the contents of file *p* as polars DataFrames of 16-bit values.

    The file is read in chunks of up to 4 MiB into one reusable buffer.
    Each chunk is wrapped as an Arrow ``uint16`` array (two bytes per
    value) and yielded as a single-column DataFrame named ``'data'``.

    Only the bytes actually returned by the current read are wrapped, so a
    short read (in particular the last one) never exposes stale or
    zero-filled bytes left in the buffer.  When a chunk has an odd number
    of bytes its final byte has no partner and is not included.

    NOTE(review): the frame is built directly on the shared read buffer,
    which is overwritten by the next read -- consume each frame before
    advancing the generator (presumably zero-copy; confirm against the
    pyarrow/polars versions in use).
    """
    buf = bytearray(4 * 2**20)
    with open(p, 'rb') as fh:
        for chunk in iter_readinto(fh, buf):
            # Wrap the chunk, not the whole buffer: bytes past the end of
            # this read belong to an earlier read (or are initial zeros).
            pb = pa.py_buffer(chunk)
            # Array length must be an integer number of 2-byte values.
            n_values = pb.size // 2
            arr = pa.Array.from_buffers(pa.uint16(), n_values, [None, pb])
            rb = pa.record_batch({'data': arr})
            yield pl.from_arrow(rb)
def get_total_bytepairs(p):
    """Count how often each 16-bit value occurs in file *p*.

    Frames from ``iter_frames(p)`` are tallied one at a time and folded
    into a running total, so only one chunk's counts are held alongside
    the accumulator.

    Returns a DataFrame with columns ``'data'`` (UInt16) and ``'count'``
    (UInt64).  Values that never occur have no row; an empty file yields
    an empty frame.
    """
    total = pl.DataFrame(schema=pl.Schema({'data': pl.UInt16, 'count': pl.UInt64}))
    for frame in iter_frames(p):
        # value_counts() picks its own count dtype; cast it to the
        # accumulator's UInt64 so the vertical concat sees matching schemas
        # and the running sum stays in 64 bits.
        counts = (
            frame.to_series()
            .value_counts()
            .with_columns(pl.col('count').cast(pl.UInt64))
        )
        total = pl.concat([total, counts]).group_by('data').agg(pl.sum('count'))
    return total
def shape_grid(totals: pl.DataFrame):
    """Lay the per-value counts out as a 256x256 numpy grid.

    *totals* needs a ``'data'`` column holding 16-bit values and a
    ``'count'`` column.  Every value 0..65535 gets one cell: value ``v``
    lands at row ``v // 256``, column ``v % 256``.  Values absent from
    *totals* come out of the left join as nulls (presumably NaN after
    ``to_numpy()`` -- the scalers in this file are NaN-aware).
    """
    # Match the key dtype of *totals* so the join does not depend on
    # implicit coercion between Int64 and the stored key type.
    key_dtype = totals.schema['data']
    return (
        pl.select(data=pl.arange(65536).cast(key_dtype))
        .join(totals, 'data', how='left')
        # The reshape below relies on rows being in key order; sort
        # explicitly rather than trusting the join's output order.
        .sort('data')
        .select('count')
        .to_numpy()
        .reshape((256, -1))
    )
def standard_scaler(data):
    """Centre *data* on its mean and scale it by its standard deviation.

    Both statistics ignore NaNs, and NaN entries stay NaN in the result.
    When the standard deviation is zero (all non-NaN values equal) the
    divisor falls back to 1, mirroring the zero-IQR guard in
    ``robust_scaler``, so constant input maps to zeros instead of NaN.
    """
    std = np.nanstd(data)
    if std == 0:
        # Constant input: avoid 0/0 and just return the centred zeros.
        std = 1
    return (data - np.nanmean(data)) / std
def robust_scaler(data):
    """Centre *data* on its median and scale it by 1.5x the interquartile range.

    The 25th, 50th and 75th percentiles are computed ignoring NaNs; NaN
    entries stay NaN in the result.  If the interquartile range is zero,
    1 is used in its place so the division is always defined.
    """
    lower, median, upper = np.nanpercentile(data, [25, 50, 75])
    spread = upper - lower
    divisor = 1.5 * (spread if spread != 0 else 1)
    return (data - median) / divisor
def byteplot(
    p: os.PathLike | str,
    transform=lambda d: d,
    scaler=robust_scaler,
    **kwargs,
) -> matplotlib.image.AxesImage:
    """Plot the 16-bit value histogram of file *p* as a 256x256 image.

    Parameters:
        p: path of the file to plot.  A plain string is converted to a
            ``Path`` with ``~`` expanded; ``os.PathLike`` objects are used
            as given.
        transform: callable applied to the raw count grid before scaling
            (identity by default).
        scaler: callable applied after *transform* (``robust_scaler`` by
            default).
        **kwargs: forwarded to ``plt.imshow`` and override the defaults
            ``cmap='turbo'``, ``vmin=-1``, ``vmax=1``.

    Returns the ``AxesImage`` created by ``plt.imshow``.

    The figure title is the output of ``file -b`` for the path, wrapped at
    40 columns.  It is omitted when that output is just ``data`` or when
    the ``file`` utility cannot be run.
    """
    if not isinstance(p, os.PathLike):
        p = Path(p).expanduser()
    try:
        # '--' stops a path that begins with '-' being parsed as an option.
        # errors='replace' keeps odd bytes in the description from raising.
        tit = subprocess.check_output(['file', '-b', '--', p]).decode(errors='replace')
    except (OSError, subprocess.CalledProcessError):
        # The description only feeds the title, so a missing or failing
        # `file` utility should not prevent the plot itself.
        tit = ''
    total = get_total_bytepairs(p)
    grid = shape_grid(total)
    kw = dict(cmap='turbo', vmin=-1, vmax=1)
    kw.update(kwargs)
    image = plt.imshow(scaler(transform(grid)), **kw)
    if tit and tit != 'data\n':
        plt.suptitle(textwrap.fill(tit, 40))
    return image
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment