Last active
April 22, 2026 10:56
-
-
Save jleedev/6f89ee8371e08cfb17e71537fbb3f1ab to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from collections.abc import Buffer | |
| import io | |
| import os | |
| from pathlib import Path | |
| import subprocess | |
| import textwrap | |
| import numpy as np | |
| import pandas as pd | |
| import polars as pl | |
| import pyarrow as pa | |
| import pyarrow.dataset as ds | |
| import matplotlib | |
| import matplotlib.pyplot as plt | |
| def iter_readinto(fh: io.BufferedReader, buf: Buffer): | |
| while (n := fh.readinto1(buf)): | |
| yield memoryview(buf)[:n] if n < len(buf) else buf | |
def iter_frames(p):
    """Yield the contents of file *p* as polars frames of 16-bit words.

    The file is read in binary mode through ``iter_readinto`` into a single
    4 MiB scratch buffer.  Each chunk is wrapped as an Arrow ``uint16`` array
    and yielded as a one-column frame named ``'data'``.

    A trailing odd byte in a chunk cannot form a 16-bit word and is dropped.

    NOTE(review): the Arrow buffer wraps the scratch buffer (presumably
    without copying), so a yielded frame should be consumed before the
    generator is advanced -- confirm if frames ever need to be retained.
    """
    buf = bytearray(4 * 2**20)  # scratch space reused for every read
    with open(p, 'rb') as fh:
        for chunk in iter_readinto(fh, buf):
            # Wrap the chunk, not the whole scratch buffer: after a short
            # read the tail of buf still holds bytes from the previous read.
            pb = pa.py_buffer(chunk)
            n_words = pb.size // 2  # integer length; whole words only
            if n_words == 0:
                continue
            arr = pa.Array.from_buffers(pa.uint16(), n_words, [None, pb])
            rb = pa.record_batch({'data': arr})
            yield pl.from_arrow(rb)
def get_total_bytepairs(p):
    """Count how often each 16-bit value occurs in file *p*.

    Frames from ``iter_frames(p)`` are tallied one at a time and folded into
    a running total, so only one chunk's counts are held alongside the
    accumulated result.

    Returns a polars DataFrame with columns ``'data'`` (UInt16) and
    ``'count'`` (UInt64); values that never occur have no row.
    """
    total = pl.DataFrame(schema=pl.Schema({'data': pl.UInt16, 'count': pl.UInt64}))
    for s in iter_frames(p):
        # value_counts() reports counts in polars' index dtype (u32 on
        # default builds); cast so the strict vertical concat below sees the
        # same schema as the UInt64 accumulator.
        count = (s.to_series()
                 .value_counts()
                 .with_columns(pl.col('count').cast(pl.UInt64)))
        total = pl.concat([total, count]).group_by('data').agg(pl.sum('count'))
    return total
def shape_grid(totals: pl.DataFrame):
    """Lay the per-value counts out as a 256-row NumPy grid.

    *totals* needs a ``'data'`` column holding values in 0..65535 and a
    ``'count'`` column.  Every value in that range gets one cell, in
    ascending order, so value ``v`` lands at row ``v // 256``, column
    ``v % 256``.  Values missing from *totals* have no match in the left
    join and therefore carry a null count.
    """
    keys = pl.select(data=pl.arange(65536))
    # Join on a common key dtype instead of relying on implicit casting
    # between the generated range and the caller's 'data' column.
    counts = totals.with_columns(pl.col('data').cast(keys.schema['data']))
    return (keys
            .join(counts, 'data', how='left')
            # The reshape below is only meaningful if rows are in key order,
            # which a join does not promise unless asked; sort explicitly.
            .sort('data')
            .select('count')
            .to_numpy()
            .reshape((256, -1)))
def standard_scaler(data):
    """Centre *data* on its mean and scale it by its standard deviation.

    Both statistics ignore NaN entries (``np.nanmean`` / ``np.nanstd``); NaN
    entries stay NaN in the result.  When the standard deviation is zero the
    data is only centred, mirroring the zero-spread guard in
    ``robust_scaler``, rather than dividing by zero.
    """
    std = np.nanstd(data)
    if std == 0:
        std = 1
    return (data - np.nanmean(data)) / std
def robust_scaler(data):
    """Centre *data* on its median and scale it by 1.5x the interquartile range.

    The quartiles and median are computed with ``np.nanpercentile``, so NaN
    entries are ignored for the statistics and remain NaN in the result.  A
    zero interquartile range is replaced by 1 so the division stays finite.
    """
    lower, median, upper = np.nanpercentile(data, [25, 50, 75])
    spread = upper - lower
    scale = 1.5 * (spread if spread != 0 else 1)
    return (data - median) / scale
def byteplot(
    p: os.PathLike | str,
    transform=lambda d: d,
    scaler=robust_scaler,
    **kwargs,
) -> matplotlib.image.AxesImage:
    """Draw the byte-pair frequency grid of file *p* with ``plt.imshow``.

    Args:
        p: File to analyse.  A plain string is turned into a ``Path`` with
            ``~`` expanded; ``os.PathLike`` objects are used as given.
        transform: Callable applied to the raw count grid before scaling.
        scaler: Callable applied to the transformed grid; its result is what
            gets displayed.
        **kwargs: Passed to ``plt.imshow``, overriding the defaults
            ``cmap='turbo'``, ``vmin=-1`` and ``vmax=1``.

    Returns:
        The image object returned by ``plt.imshow``.

    The figure title is the output of ``file -b`` for *p*, wrapped at 40
    columns.  It is omitted when ``file`` reports plain ``data`` or cannot
    be run at all.
    """
    if not isinstance(p, os.PathLike):
        p = Path(p).expanduser()

    try:
        # '--' ends option parsing so a path starting with '-' is not read
        # as a flag by file(1).
        description = subprocess.check_output(
            ['file', '-b', '--', p]
        ).decode(errors='replace')
    except (OSError, subprocess.CalledProcessError):
        # The description only feeds the title; still draw the plot when the
        # file utility is missing or fails.
        description = None

    total = get_total_bytepairs(p)
    grid = shape_grid(total)

    kw = {'cmap': 'turbo', 'vmin': -1, 'vmax': 1, **kwargs}
    image = plt.imshow(scaler(transform(grid)), **kw)

    if description is not None and description != 'data\n':
        plt.suptitle(textwrap.fill(description, 40))
    return image
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment




















































































































