@shunting314 · Created November 24, 2025 07:20
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "Sym(s36)", arg1_1: "bf16[s36, 128256][128256, 1]cuda:0", arg2_1: "f32[s36][1]cuda:0"):
        # File: /home/shunting/ws/vllm/vllm/v1/sample/sampler.py:87 in forward, code: logits = logits.to(torch.float32)
        convert_element_type: "f32[s36, 128256][128256, 1]cuda:0" = torch.ops.prims.convert_element_type.default(arg1_1, torch.float32); arg1_1 = None

        # File: /home/shunting/ws/vllm/vllm/v1/sample/sampler.py:138 in apply_temperature, code: return logits.div_(temp.unsqueeze(dim=1))
        unsqueeze: "f32[s36, 1][1, 1]cuda:0" = torch.ops.aten.unsqueeze.default(arg2_1, 1); arg2_1 = None

        # No stacktrace found for following nodes
        ge_scalar: "b8[s36, 1][1, 1]cuda:0" = torch.ops.aten.ge.Scalar(unsqueeze, 0)
        scalar_tensor_default: "f32[][]cuda:0" = torch.ops.aten.scalar_tensor.default(1, dtype = torch.float32, device = device(type='cuda', index=0), pin_memory = False)
        neg_default: "f32[][]cuda:0" = torch.ops.aten.neg.default(scalar_tensor_default)
        where_self: "f32[s36, 1][1, 1]cuda:0" = torch.ops.aten.where.self(ge_scalar, scalar_tensor_default, neg_default); ge_scalar = scalar_tensor_default = neg_default = None
        mul_tensor: "f32[s36, 128256][128256, 1]cuda:0" = torch.ops.aten.mul.Tensor(convert_element_type, where_self); convert_element_type = None
        amax_default: "f32[s36, 1][1, 1]cuda:0" = torch.ops.aten.amax.default(mul_tensor, [-1], True)
        sub_tensor: "f32[s36, 128256][128256, 1]cuda:0" = torch.ops.aten.sub.Tensor(mul_tensor, amax_default); mul_tensor = amax_default = None
        mul_tensor_1: "f32[s36, 1][1, 1]cuda:0" = torch.ops.aten.mul.Tensor(where_self, unsqueeze); where_self = unsqueeze = None
        div_tensor: "f32[s36, 128256][128256, 1]cuda:0" = torch.ops.aten.div.Tensor(sub_tensor, mul_tensor_1); sub_tensor = mul_tensor_1 = None
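        # The ge/where/mul/amax sequence above appears to be Inductor's numerically
        # stable rewrite of logits.div_(temp) feeding into a softmax: with
        # s = sign(temp), it computes (logits*s - rowmax(logits*s)) / (s*temp),
        # which differs from logits/temp only by a per-row constant that softmax
        # cancels, while keeping the later exp() from overflowing.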

        # File: /home/shunting/ws/vllm/vllm/v1/sample/ops/topk_topp_sampler.py:100 in forward_native, code: probs = logits.softmax(dim=-1, dtype=torch.float32)
        exp: "f32[s36, 128256][128256, 1]cuda:0" = torch.ops.aten.exp.default(div_tensor); div_tensor = None
        sum_1: "f32[s36, 1][1, 1]cuda:0" = torch.ops.aten.sum.dim_IntList(exp, [-1], True)
        div_1: "f32[s36, 128256][128256, 1]cuda:0" = torch.ops.aten.div.Tensor(exp, sum_1); exp = sum_1 = None

        # No stacktrace found for following nodes
        inductor_seeds_default: "i64[1][1]cuda:0" = torch.ops.prims.inductor_seeds.default(1, device(type='cuda', index=0))

        # File: /home/shunting/ws/vllm/vllm/v1/sample/ops/topk_topp_sampler.py:312 in random_sample, code: q.exponential_()
        inductor_lookup_seed_default: "i64[][]cuda:0" = torch.ops.prims.inductor_lookup_seed.default(inductor_seeds_default, 0); inductor_seeds_default = None
        inductor_random_default: "f32[s36, 128256][128256, 1]cuda:0" = torch.ops.prims.inductor_random.default([arg0_1, 128256], inductor_lookup_seed_default, 'rand'); arg0_1 = inductor_lookup_seed_default = None
        ge_10: "b8[s36, 128256][128256, 1]cuda:0" = torch.ops.aten.ge.Scalar(inductor_random_default, 0.9999999403953552)
        full_default: "f32[][]cuda:0" = torch.ops.aten.full.default([], -5.960464477539063e-08, dtype = torch.float32, layout = torch.strided, device = device(type='cuda', index=0), pin_memory = False)
        log: "f32[s36, 128256][128256, 1]cuda:0" = torch.ops.aten.log.default(inductor_random_default); inductor_random_default = None
        where: "f32[s36, 128256][128256, 1]cuda:0" = torch.ops.aten.where.self(ge_10, full_default, log); ge_10 = full_default = log = None
        mul_53: "f32[s36, 128256][128256, 1]cuda:0" = torch.ops.aten.mul.Tensor(where, -1.0); where = None
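        # The nodes above lower q.exponential_(): draw u ~ Uniform[0, 1), then
        # q = -log(u). The where() appears to guard the largest float32 below 1.0
        # (0.9999999403953552 == 1 - 2**-24), substituting log(1 - 2**-24) ≈
        # -5.96e-08 so that q stays strictly positive.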

        # File: /home/shunting/ws/vllm/vllm/v1/sample/ops/topk_topp_sampler.py:318 in random_sample, code: return probs.div_(q).argmax(dim=-1).view(-1)
        div_2: "f32[s36, 128256][128256, 1]cuda:0" = torch.ops.aten.div.Tensor(div_1, mul_53); div_1 = mul_53 = None
        argmax: "i64[s36][1]cuda:0" = torch.ops.aten.argmax.default(div_2, -1); div_2 = None
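        # probs.div_(q).argmax(dim=-1) is the exponential-race sampling trick: with
        # q_i ~ Exp(1) i.i.d., argmax_i(p_i / q_i) selects index i with probability
        # p_i (equivalent to Gumbel-max, since -log(q_i) is standard Gumbel), so no
        # cumsum-and-search over the 128256-wide vocab is needed.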

        # File: /home/shunting/ws/vllm/vllm/v1/sample/sampler.py:116 in forward, code: sampled = sampled.to(torch.int32)
        convert_element_type_1: "i32[s36][1]cuda:0" = torch.ops.prims.convert_element_type.default(argmax, torch.int32); argmax = None

        # File: /home/shunting/ws/vllm/vllm/v1/sample/sampler.py:123 in forward, code: sampled_token_ids=sampled.unsqueeze(-1),
        unsqueeze_1: "i32[s36, 1][1, 1]cuda:0" = torch.ops.aten.unsqueeze.default(convert_element_type_1, -1); convert_element_type_1 = None
        return (unsqueeze_1,)
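
For reference, a minimal eager-mode sketch of what this graph computes, reconstructed from the File:/code: stack-trace comments above. It is an illustration under those comments' assumptions (bf16 logits of shape [num_tokens, 128256], float32 per-token temperatures), not vLLM's actual sampler API; the function name sample_tokens is made up.

import torch

def sample_tokens(logits: torch.Tensor, temp: torch.Tensor) -> torch.Tensor:
    # sampler.py:87 -- upcast bf16 logits to float32
    logits = logits.to(torch.float32)
    # sampler.py:138 (apply_temperature) -- divide each row by its temperature
    logits = logits.div_(temp.unsqueeze(dim=1))
    # topk_topp_sampler.py:100 -- probabilities via softmax
    probs = logits.softmax(dim=-1, dtype=torch.float32)
    # topk_topp_sampler.py:312 -- q ~ Exp(1), same shape as probs
    q = torch.empty_like(probs).exponential_()
    # topk_topp_sampler.py:318 -- exponential-race argmax draws from Categorical(probs)
    sampled = probs.div_(q).argmax(dim=-1).view(-1)
    # sampler.py:116/123 -- int32 token ids, one column per request
    return sampled.to(torch.int32).unsqueeze(-1)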