Skip to content

Instantly share code, notes, and snippets.

@liangfu
liangfu / Makefile
Created March 18, 2025 18:44
Rasterization (a.k.a. rendering triangles into a framebuffer) in C
# Compiler settings
CC = gcc
CFLAGS = -Wall -Wextra -O2
LDFLAGS = -lm
# Project files
SRC = rasterizer.c
OBJ = $(SRC:.c=.o)
TARGET = rasterizer
@liangfu
liangfu / test_mixed_eager_aot.py
Created March 13, 2025 16:34
Evaluate consistency when mixing eager execution with torch.compile()
import torch
import os
import torch_xla.core.xla_model as xm
def write_to_kv_cache(
key: torch.Tensor,
value: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
slot_mapping: torch.Tensor,
@liangfu
liangfu / test_split_neuronx.py
Created March 11, 2025 21:26
Evaluate torch.split and slice operator support on openxla backend (with torch_neuronx)
import os
import pytest
import torch
import torch_neuronx
import torch_xla.core.xla_model as xm
@pytest.mark.parametrize("batch_size,seq_len,q_size,kv_size,use_torch_compile,disable_functionalization", [
(2, 128, 32, 32, False, True),
(2, 128, 32, 32, True, True),
(2, 128, 32, 32, False, False),
@liangfu
liangfu / test_split.py
Last active March 10, 2025 23:45
Evaluate torch.split and slice operator support on openxla backend
import pytest
import torch
import torch_xla.core.xla_model as xm
@pytest.mark.parametrize("batch_size,seq_len,q_size,kv_size", [
(2, 128, 32, 32),
(4, 256, 64, 64),
])
def test_split_consistency(batch_size, seq_len, q_size, kv_size):
# Get XLA device
@liangfu
liangfu / test_sr.py
Created January 17, 2025 23:52
Evaluate stochastic rounding
import os
import time
import torch
import torch_xla.core.xla_model as xm
N = 16
def main():
# os.environ["XLA_USE_BF16"] = "1"
os.environ["NEURON_RT_STOCHASTIC_ROUNDING_EN"] = "1"
import torch
import os
import depyf
import torch_xla.core.xla_model as xm
os.environ["NEURON_CC_FLAGS"]= " --model-type=transformer -O1 "
os.environ["NEURON_COMPILE_CACHE_URL"] = os.path.join(os.getcwd(), "_compile_cache")
@torch.compiler.allow_in_graph
def write_to_kv_cache(
@liangfu
liangfu / benchmark_xla_scatter.py
Created January 9, 2025 04:34
Benchmark xla scatter with torch-xla
import time
import torch
import torch_xla.core.xla_model as xm
N = 128
n_iters = 100
def main():
device = xm.xla_device()
src = torch.arange(1, 2*N+1).reshape((2, N)).to(device=device)
@liangfu
liangfu / depyf_openxla.py
Created December 2, 2024 23:12
Demonstrate the feasibility of combining depyf with openxla backend
import torch
import torch_xla.core.xla_model as xm
@torch.compile(backend="openxla")
def toy_example(a, b):
    """Return ``a / (|a| + 1)`` multiplied by ``b``, negating ``b`` first if its sum is negative.

    The function is compiled with the ``openxla`` backend.  The branch below
    depends on the runtime value of ``b.sum()``, so which path is taken is
    only known once the data is available.

    Args:
        a: Tensor that is divided element-wise by ``abs(a) + 1``.
        b: Tensor multiplier; it is multiplied by ``-1`` before use when
            ``b.sum() < 0``.

    Returns:
        The element-wise product of the scaled ``a`` and the (possibly
        negated) ``b``.
    """
    scaled = a / (torch.abs(a) + 1)
    # Data-dependent branch: negate the multiplier when its total is negative.
    if b.sum() < 0:
        flipped = b * -1
        return scaled * flipped
    return scaled * b
from typing import Any, Dict, List, Optional, Tuple, Type
import torch
import torch_xla.core.xla_model as xm
import torch_xla.experimental.custom_kernel # Required to register custom ops.
class PallasAttentionBackend:
@torch.compile(backend="openxla")
@staticmethod
def copy_blocks(
@liangfu
liangfu / bedrock.py
Created October 14, 2024 19:17
Ask Bedrock long questions with a Python script
import boto3, json
question = """
How are you today?
"""
def main():
session = boto3.Session()
bedrock = session.client(service_name='bedrock-runtime', region_name="us-west-2")