| dtype | SOTA | 2.2.2+eager | 2.3.0+eager | 2.3.0+compile | trunk + compile |
|---|---|---|---|---|---|
| bfloat16 (M1) | 111 tokens/sec | 110 tokens/sec | 80 tokens/sec | | |
| float32 (M1) | 687 tokens/sec | 165 tokens/sec | 176 tokens/sec | | |
| float16 (M1) | 1106 tokens/sec | 50 tokens/sec | 187 tokens/sec | | |
| float16 (LinX86) | 40 tokens/sec | 43 tokens/sec | 173 tokens/sec | | |
| float32 (LinX86) | 38 tokens/sec | 40 tokens/sec | 179 tokens/sec | | |
| bfloat16 (LinX86) | 73 tokens/sec | 78 tokens/sec | 180 tokens/sec | | |
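Tokens/sec numbers like those above are typically collected with a greedy decode loop; a minimal sketch of such a measurement (here `model` and `input_ids` are hypothetical stand-ins for an actual decoder-only LLM and a tokenized prompt):

```python
import time
import torch

def tokens_per_sec(model, input_ids, new_tokens=128):
    # Greedy-decode `new_tokens` tokens and report decode throughput.
    ids = input_ids
    start = time.perf_counter()
    for _ in range(new_tokens):
        logits = model(ids)                                   # [B, T, vocab]
        next_id = logits[:, -1].argmax(dim=-1, keepdim=True)  # greedy pick
        ids = torch.cat([ids, next_id], dim=-1)
    return new_tokens / (time.perf_counter() - start)

# The "+compile" columns wrap the model first, e.g.:
# model = torch.compile(model)
```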
```c
#include <stdio.h>
#include <cuda_runtime.h>

// __CUDA_ARCH__ is only defined while NVCC compiles the device pass,
// so the same function returns different values on host and device.
__host__ __device__ int return_two() {
#if defined(__CUDA_ARCH__)
    return 3;
#else
    return 2;
#endif
}
```
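One way to observe this behavior from Python is to JIT-compile the snippet with `torch.utils.cpp_extension.load_inline`; a sketch (the module name and the `host_value`/`device_value` wrappers are made up for illustration):

```python
import torch
from torch.utils.cpp_extension import load_inline

# Same function as above, plus host/device probes (names are illustrative)
cuda_src = """
__host__ __device__ int return_two() {
#if defined(__CUDA_ARCH__)
    return 3;
#else
    return 2;
#endif
}

__global__ void probe_kernel(int *out) { *out = return_two(); }

int host_value() { return return_two(); }

int device_value() {
    int *dptr = nullptr, result = -1;
    cudaMalloc(&dptr, sizeof(int));
    probe_kernel<<<1, 1>>>(dptr);
    cudaMemcpy(&result, dptr, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dptr);
    return result;
}
"""

mod = load_inline(
    name="cuda_arch_probe",
    cpp_sources="int host_value(); int device_value();",
    cuda_sources=cuda_src,
    functions=["host_value", "device_value"],
)
print(mod.host_value(), mod.device_value())  # expected: 2 3
```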
```python
import torch
import torch._inductor.config

torch.set_default_device("cuda")

import os

from triton import autotune, cdiv, Config, heuristics, jit  # @manual
import triton.language as tl

@autotune(
    configs=[
        Config({"BLOCK_M": 32, "BLOCK_N": 32}),
```
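The decorator above is cut off; for reference, a self-contained sketch of the same `@autotune` pattern applied to a trivial copy kernel (the configs and key are illustrative, not the ones from the original file):

```python
import torch
import triton
import triton.language as tl

@triton.autotune(
    configs=[
        triton.Config({"BLOCK": 128}),
        triton.Config({"BLOCK": 1024}),
    ],
    key=["n"],  # re-run autotuning whenever `n` changes
)
@triton.jit
def copy_kernel(src, dst, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    tl.store(dst + offs, tl.load(src + offs, mask=mask), mask=mask)

x = torch.randn(4096, device="cuda")
y = torch.empty_like(x)
copy_kernel[lambda meta: (triton.cdiv(4096, meta["BLOCK"]),)](x, y, 4096)
```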
```python
from ctypes import cdll, c_char_p, c_uint32

# Query the macOS dynamic loader (dyld) for the set of loaded images
libdyld = cdll.LoadLibrary("libSystem.dylib")
libdyld._dyld_image_count.restype = c_uint32
libdyld._dyld_get_image_name.restype = c_char_p
libdyld._dyld_get_image_name.argtypes = [c_uint32]

before_torch = {libdyld._dyld_get_image_name(i).decode("ascii") for i in range(libdyld._dyld_image_count())}
import torch
after_torch = {libdyld._dyld_get_image_name(i).decode("ascii") for i in range(libdyld._dyld_image_count())}

# Everything that `import torch` dragged in (assumed completion):
print("\n".join(sorted(after_torch - before_torch)))
```
```swift
import Metal
import MetalPerformanceShadersGraph

// Run an exp() kernel over `nelem` floats, optionally with fast math enabled
func calculateExpMetal(device: MTLDevice, ibuf: MTLBuffer, obuf: MTLBuffer, nelem: Int, fastMathEnabled: Bool = false) {
    let shader_source = """
#include <metal_stdlib>
using namespace metal;
kernel void do_exp(constant float *input [[buffer(0)]],
```
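A quick way to sanity-check what such a shader measures, sketched in PyTorch: compare `exp` on the MPS backend against the CPU reference (assumes a PyTorch build with MPS support):

```python
import torch

# Rough accuracy probe in the spirit of the shader: exp on MPS vs CPU
x = torch.linspace(-10, 10, 1024)
diff = (torch.exp(x.to("mps")).cpu() - torch.exp(x)).abs().max()
print(diff)
```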
```swift
import Metal

let shader_source = """
#include <metal_stdlib>
using namespace metal;
// Fill `data` with nextafter() results and record a per-element predicate
kernel void nextafter_pred(device float *data [[buffer(0)]],
                           device bool *pred [[buffer(1)]],
                           uint thread_index [[thread_position_in_grid]]) {
    data[thread_index] = nextafter(float(thread_index) - 8.0, 1e4);
```
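For comparison, the same computation has a CPU reference in PyTorch, since `torch.nextafter` mirrors the Metal builtin (a small sketch):

```python
import torch

# CPU reference for the shader above: nextafter(i - 8, 1e4) per index
idx = torch.arange(16, dtype=torch.float32)
print(torch.nextafter(idx - 8.0, torch.full_like(idx, 1e4)))
```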
```swift
import Metal
import MetalPerformanceShadersGraph

// Build a batched [32, 4096, 40] x [32, 40, 4096] matmul graph and the
// Metal objects needed to run it
let graph = MPSGraph()
let x = graph.constant(1, shape: [32, 4096, 40], dataType: .float32)
let y = graph.constant(1, shape: [32, 40, 4096], dataType: .float32)
let z = graph.matrixMultiplication(primary: x, secondary: y, name: nil)
let device = MTLCreateSystemDefaultDevice()!
let buf = device.makeBuffer(length: 16384)!
let td = MPSGraphTensorData(buf, shape: [64, 64], dataType: .int32)
let cmdBuf = MPSCommandBuffer(from: device.makeCommandQueue()!)
```
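For comparison, a rough PyTorch-on-MPS equivalent of the batched matmul this graph builds (a sketch; buffers and timing omitted):

```python
import torch

x = torch.ones(32, 4096, 40, device="mps")
y = torch.ones(32, 40, 4096, device="mps")
z = torch.bmm(x, y)      # same batched matmul the MPSGraph builds
torch.mps.synchronize()  # wait for the GPU before inspecting results
print(z.shape)           # torch.Size([32, 4096, 4096])
```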
```python
# Benchmark relative performance of torch.mm and torch.bmm with single batch
import torch
import time

def benchmark_fn(fn, args, warmup=5, cycles=300, use_kineto=False) -> float:
    if use_kineto:
        # Kineto path: accumulated CUDA kernel time (in microseconds)
        with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as p:
            fn(*args)
        return sum([e.cuda_time for e in p.key_averages()])
    # Assumed completion: wall-clock average over `cycles` runs after warmup
    for _ in range(warmup):
        fn(*args)
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(cycles):
        fn(*args)
    torch.cuda.synchronize()
    return (time.time() - start) / cycles
```
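With the function completed as above, usage matching the header comment might look like this (sizes are illustrative):

```python
a = torch.rand(1024, 1024, device="cuda")
b = torch.rand(1024, 1024, device="cuda")
mm_time = benchmark_fn(torch.mm, (a, b))
bmm_time = benchmark_fn(torch.bmm, (a.unsqueeze(0), b.unsqueeze(0)))
print(f"mm: {mm_time:.3e}s  bmm (batch=1): {bmm_time:.3e}s")
```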
```python
import triton
import triton.language as tl

@triton.jit
def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK: tl.constexpr):
    xnumel = 10  # element count is baked in, shadowing the runtime argument
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    # Assumed completion: copy input to output so the truncated kernel runs
    tmp0 = tl.load(in_ptr0 + x0, xmask)
    tl.store(out_ptr0 + x0, tmp0, xmask)
```
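With the copy body filled in as above, the kernel can be launched from PyTorch like so (grid and `XBLOCK` are illustrative):

```python
import torch

inp = torch.arange(10, device="cuda", dtype=torch.float32)
out = torch.empty_like(inp)
# One program instance covers all 10 elements; XBLOCK must be a power of two
kernel[(1,)](inp, out, 10, XBLOCK=16)
print(out)
```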
```python
import torch
import torch.nn.functional as F

def to_float8(x, dtype=torch.float8_e4m3fn):
    finfo = torch.finfo(dtype)
    # Calculate the scale as dtype max divided by absmax
    scale = finfo.max / x.abs().max().clamp(min=1e-12)
    # Scale and clamp the tensor to bring it into
    # the representable range of the float8 dtype
    # (as the default cast is unsaturated)
    x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max)
    # Assumed completion: return the float8 payload and the inverse scale
    return x_scl_sat.to(dtype), scale.float().reciprocal()
```
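With the return statement completed as above, a round trip shows the quantization error (a small sketch; assumes a PyTorch build with `float8_e4m3fn` support):

```python
x = torch.randn(16, 16)
x_f8, inv_scale = to_float8(x)
x_roundtrip = x_f8.to(torch.float32) * inv_scale  # dequantize
print((x - x_roundtrip).abs().max())  # small quantization error
```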