Skip to content

Instantly share code, notes, and snippets.

@rygorous
rygorous / result
Last active February 24, 2026 03:16
Generate all 3-input ternary logic ops on bits via ARM instructions. (SHA3 ext ops optional)
0x00 0 cost=10
0x01 BCAX(A, 0xf5, 0x0c) cost=30
0x02 BIC(0x0a, B) cost=20
0x03 EOR(A, 0xf3) cost=20
0x04 BIC(B, 0xfa) cost=20
0x05 EOR(A, 0xf5) cost=20
0x06 BCAX(0x0a, B, A) cost=20
0x07 EOR(A, 0xf7) cost=30
0x08 BCAX(B, B, 0x0a) cost=20
0x09 EOR3(A, 0x0a, 0xf3) cost=30
@rygorous
rygorous / bdd_adders.py
Last active February 16, 2026 00:50
BDD implementation of some very basic circuit verification proving various 64-bit adder architectures equivalent
import functools
from typing import Optional
class BddContext:
def __init__(self):
# 0/1 leaf nodes, it's convenient to just refer to them as 0 and 1
# a node is a (var id, lo_node, hi_node) tuple
self.nodes = []
self.vars = []
self.node_dict = {}
@rygorous
rygorous / more_serious_fft.cpp
Created December 10, 2025 06:11
Core for a 2x unrolled radix-2 (not really radix-4) FFT kernel
// The FFT alg used here was designed to be very FMA-friendly, but because we can't assume FMAs are present on
// all target HW and want consistent results everywhere, we're using FMA-less algorithms for this application.
// Notation throughout this file:
//
// Let z = a + bi. Then conj(z) = a - bi.
//
// We can swap the real and imaginary parts of z to yield s(z) = b + ai ("swap").
// Now because
//
@rygorous
rygorous / gist:9aac91598af0a94ab4693210291e1f94
Created December 10, 2025 05:37
Reference r2 FFT code
static size_t const kMaxN = 2048; // This is the largest straight FFT we support
struct complexf
{
float re;
float im;
complexf() {}
complexf(float r) : re(r), im(0.0f) {}
complexf(float r, float i) : re(r), im(i) {}
@rygorous
rygorous / morton3d_fun.cpp
Created July 4, 2025 05:49
Vectorized 3D Morton encoding for no reason
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
typedef uint32_t uint32;
typedef __m128i Vec128_U32;
// "Insert" two 0 bits after each of the 11 low bits of x
static uint32 Part1By2(uint32 x)
@rygorous
rygorous / gist:6f96cc21292cc704f53ef77e5b4be519
Created May 22, 2025 02:25
Oodle Texture BC7RD "preserve extremes" constraint validation
//===================================================================
// constraint validation for preserve extremes mode
// Cold part of are_endpoints_permitted: the actual constraint validation
static RADNOINLINE bool are_endpoints_permitted_cold(const BC7BlockState & st, const bc7rd_blockinfo& info)
{
const BC7Flags flags = info.flags;
if (st.mode <= 3)
{
@rygorous
rygorous / gist:52cc2a23a73813d645581046dced27fd
Created May 22, 2025 00:15
Oodle Texture "preserve extremes" rules
{
BC7Flags flags = in_flags;
// Preserve extremes mode.
//
// This mode preserves values of 0 and 255 in the alpha channel exactly. In general,
// we can do this in any one channel, but restricting this to alpha makes the interface
// simpler, is consistent with what we do for BC3, and doesn't seem like a significant
// limitation for the user.
//
uint EvenBitMask = 0x55555555u;
uint HighIndexBit = PackedIndices >> 1;
float NumWeight1 = float(countbits(EvenBitMask & ~HighIndexBit & PackedIndices));
float NumWeight2 = float(countbits(EvenBitMask & HighIndexBit & ~PackedIndices));
float NumWeight3 = float(countbits(EvenBitMask & HighIndexBit & PackedIndices));
// with NV LOP3: 1 shift, 3 LOP3, 3 pop count, 3 int->float = 10 insns total for 16 pixels
// without: 1 shift, 2 NOT, 5 AND, 3 pop count, 3 int->float = 14 insns total for 16 pixels
@rygorous
rygorous / rr_dds.h
Created March 7, 2025 01:45
rr_dds loader/writer
//===================================================
// Oodle2 DDS Tool
// (C) Copyright 1994-2022 Epic Games Tools LLC
//===================================================
#ifndef RR_DDS_INCLUDED
#define RR_DDS_INCLUDED
#include <stdint.h>
#include <stddef.h>
@rygorous
rygorous / main.rs
Created March 2, 2025 04:42
Base64 fixed point test
use bit_set::BitSet;
use std::mem;
// Vanilla RFC 4648
const ALPHABET: &str = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
// URL-safe RFC 4648
//const ALPHABET: &str = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
const COUNT: usize = 1usize << 24; // 3 bytes worth suffices for this test
fn lookup(index: u32) -> u32 {