💻

Work

Keren Zhou Jokeren

💻

Work

530 followers · 25 following

George Mason University
Fairfax
11:20 (UTC -12:00)
jokeren.tech

View GitHub Profile

Recently created

Least recently created

Recently updated

Least recently updated

Jokeren / overhead.py

Last active June 7, 2024 15:01

Proton overhead

	import torch
	import time
	import sys


	def run(nelems, iters):
	# Check if CUDA is available
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	tensor_a = torch.randn(nelems, dtype=torch.float32, device=device)

Jokeren / new.mlir

Created August 9, 2024 02:39

mlirs

	#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
	#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
	#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
	#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [16, 8]}>
	#shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1], hasLeadingOffset = false}>
	#shared1 = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], hasLeadingOffset = false}>
	module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} {
	tt.func public @hoist_convert_above_extf_and_remat(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.pt

Jokeren / wrong.llir

Created August 17, 2024 16:22

AMD vec problem

	; ModuleID = 'LLVMDialectModule'
	source_filename = "LLVMDialectModule"
	target triple = "amdgcn-amd-amdhsa"

	@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16

	; Function Attrs: mustprogress nofree norecurse nounwind willreturn
	define amdgpu_kernel void @flip_kernel(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 !dbg !4 {
	%3 = tail call i32 @llvm.amdgcn.workitem.id.x(), !dbg !7
	%4 = shl i32 %3, 2, !dbg !8

Older · Newer