HDCharles · March 20, 2025 18:03 · HDCharles · Mar 20, 2025
diff --git a/small_moe_repro.py b/small_moe_repro.py
 import torch
 import torch.nn as nn
 from torch import Tensor
 from torch.nn import functional as F
 from dataclasses import dataclass
 torch.manual_seed(0)



 # T tokens
 # E experts
 # D dim
 # I intermediate dim
 # A activated experts
 # T'(e) tokens for expert e

 class MOEFeedForward(nn.Module):
     """Mixture-of-experts feed-forward layer with top-k routing.

     A bias-free linear gate maps every 4-wide token to 8 expert scores.  The
     scores are soft-maxed, the 2 largest are kept per token and re-normalised
     to sum to one, and the weighted expert mix is delegated to
     ``ConditionalFeedForward``.
     """

     def __init__(self) -> None:
         super().__init__()
         # Plain (non-parameter) configuration.
         self.dim = 4
         self.num_activated_experts = 2
         # Sub-modules, registered in the same order as before: gate, experts.
         self.gate = nn.Linear(4, 8, bias=False)
         self.cond_ffn = ConditionalFeedForward()

     def forward(self, x: Tensor) -> Tensor:
         """Route the tokens in ``x`` through their top experts.

         Args:
             x: Tensor whose trailing dimension is ``self.dim``; all leading
                dimensions are flattened into a single token axis.

         Returns:
             Tensor reshaped to ``(x.shape[0], -1, self.dim)``.
         """
         leading = x.shape[0]
         tokens = x.view(-1, self.dim)  # [T, D]
         probs = F.softmax(self.gate(tokens), dim=-1)  # [T, E]
         top_weights, top_experts = torch.topk(
             probs, self.num_activated_experts, dim=-1
         )  # [T, A], [T, A]
         # Re-normalise in place so the kept weights sum to 1 per token.
         top_weights.div_(top_weights.sum(dim=-1, keepdim=True))  # [T, A]
         routed = self.cond_ffn(
             tokens, top_experts, top_weights, self.num_activated_experts
         )
         return routed.reshape(leading, -1, self.dim)

 class ConditionalFeedForward(nn.Module):
     """Bank of per-expert gated MLPs evaluated only for the routed tokens.

     Each expert ``e`` computes ``w2[e] @ (silu(w1[e] @ x) * (w3[e] @ x))``.

     Args:
         num_experts: Number of experts E.
         dim: Token width D.
         intermediate_dim: Hidden width I of each expert.
         device: Device the weights are created on.
         dtype: Dtype the weights are created with.

     The defaults reproduce the original hard-coded configuration
     (E=8, D=4, I=16, CUDA, bfloat16).
     """

     def __init__(
         self,
         num_experts: int = 8,
         dim: int = 4,
         intermediate_dim: int = 16,
         device: str = "cuda",
         dtype: torch.dtype = torch.bfloat16,
     ) -> None:
         super().__init__()
         # Creation order (w1, w2, w3) is kept so seeded runs draw the same values.
         self.w1 = nn.Parameter(torch.randn((num_experts, intermediate_dim, dim), device=device, dtype=dtype))  # E, I, D
         self.w2 = nn.Parameter(torch.randn((num_experts, dim, intermediate_dim), device=device, dtype=dtype))  # E, D, I
         self.w3 = nn.Parameter(torch.randn((num_experts, intermediate_dim, dim), device=device, dtype=dtype))  # E, I, D
         self.num_experts = num_experts

     @staticmethod
     def _expert_mlp(x: Tensor, w1: Tensor, w3: Tensor, w2: Tensor) -> Tensor:
         """Apply one expert: ``linear(silu(linear(x, w1)) * linear(x, w3), w2)``."""
         return F.linear(F.silu(F.linear(x, w1)) * F.linear(x, w3), w2)

     def forward(
         self, x: Tensor,        # T, D
         expert_indices: Tensor, # T, A
         expert_weights: Tensor,  # T, A
         num_activated_experts: int,
         ) -> Tensor:
         """Return the weighted sum of the selected experts for every token.

         Args:
             x: Tokens, ``[T, D]``.
             expert_indices: Integer expert ids chosen per token, ``[T, A]``.
             expert_weights: Mixing weight of each chosen expert, ``[T, A]``.
             num_activated_experts: A, the number of experts per token.

         Returns:
             ``[T, D]`` tensor (``[1, D]`` on the single-token path).
         """
         if x.shape[0] == 1:
             # Single token: gather just the A selected experts' weights.
             # reshape(-1) (rather than squeeze()) keeps a 1-D index even when
             # A == 1, so w1[i] below is always one expert's [I, D] matrix.
             chosen = expert_indices.reshape(-1)
             w1 = self.w1[chosen]
             w2 = self.w2[chosen]
             w3 = self.w3[chosen]
             outs = [
                 self._expert_mlp(x, w1[i], w3[i], w2[i])
                 for i in range(num_activated_experts)
             ]
             mixed_outs = torch.cat(outs, dim=0)  # A, D
             # Return [1, D] so both branches agree on the [T, D] layout.
             return (mixed_outs * expert_weights.reshape(-1, 1)).sum(dim=0).unsqueeze(0)

         _, dim = x.shape
         flat_experts = expert_indices.reshape(-1)  # T*A
         # Stable sort groups activation slots by expert while keeping token
         # order inside each expert's group.
         ordered_token_activations = flat_experts.argsort(stable=True)
         # Slot s of the flattened [T, A] layout belongs to token s // A.
         ordered_tokens = ordered_token_activations.div(
             num_activated_experts, rounding_mode="floor"
         )

         # Tokens per expert as Python ints: one host transfer instead of one
         # per slice bound.  bincount accepts integer ids on any device.
         tokens_per_expert = torch.bincount(
             flat_experts, minlength=self.num_experts
         ).tolist()
         inputs = x[ordered_tokens].split(tokens_per_expert)  # E chunks of [T'(e), D]

         outs = [
             self._expert_mlp(cur_x, self.w1[expert], self.w3[expert], self.w2[expert])  # T', D
             for expert, cur_x in enumerate(inputs)
         ]

         ordered_outs = torch.cat(outs, dim=0)  # T*A, D
         weight_order = expert_weights.reshape(-1, 1)[ordered_token_activations]  # T*A, 1
         weighted_ordered_outs = ordered_outs * weight_order

         # Sum every token's A weighted expert outputs back into its row.
         scatter_index = ordered_tokens.unsqueeze(-1).expand(-1, dim)
         return torch.zeros_like(x).scatter_add(dim=0, index=scatter_index, src=weighted_ordered_outs)


 # Driver: run the MoE layer eagerly in float64, eagerly in bfloat16, and then
 # through torch.compile, printing the output sum each time for comparison.
 moe = MOEFeedForward().to("cuda").to(torch.float64)

 # Two independent random inputs of shape (1, 6, 4), moved to CUDA as float64.
 input1 = torch.randn(1, 6, 4).to("cuda").to(torch.float64)
 input2 = torch.randn(1, 6, 4).to("cuda").to(torch.float64)

 with torch.no_grad():
    # Eager float64 run.
    out1 = moe(input1)
    print(out1.sum())
    out2 = moe(input2)
    print(out2.sum())

    # Cast the module and both inputs to bfloat16 and repeat the eager run.
    moe = moe.to(torch.bfloat16)
    input1 = input1.to(torch.bfloat16)
    input2 = input2.to(torch.bfloat16)

    out1 = moe(input1)
    print(out1.sum())
    out2 = moe(input2)
    print(out2.sum())

    # Compiled bfloat16 run.  The commented-out variant below is the failing
    # configuration this script was written to demonstrate.
    moe_c = torch.compile(moe, mode="reduce-overhead") # working
    # moe_c = torch.compile(moe, mode="reduce-overhead", fullgraph=True) #this fails on token shuffle part

    # The results of these two calls are discarded; presumably they are
    # warm-up runs so compilation happens before the printed calls.
    moe_c(input1)
    moe_c(input2)
    out1c = moe_c(input1)
    print(out1c.sum())
    out2c = moe_c(input2)
    print(out2c.sum())
	import torch
	import torch.nn as nn
	from torch import Tensor
	from torch.nn import functional as F
	from dataclasses import dataclass
	torch.manual_seed(0)



	# T tokens
	# E experts
	# D dim
	# I intermediate dim
	# A activated experts
	# T'(e) tokens for expert e

	class MOEFeedForward(nn.Module):
		"""Mixture-of-experts feed-forward layer with top-k routing.

		A bias-free linear gate maps every 4-wide token to 8 expert scores.
		The scores are soft-maxed, the 2 largest are kept per token and
		re-normalised to sum to one, and the weighted expert mix is delegated
		to ``ConditionalFeedForward``.
		"""

		def __init__(self) -> None:
			super().__init__()
			self.gate = nn.Linear(4, 8, bias=False)
			self.cond_ffn = ConditionalFeedForward()
			self.dim = 4
			self.num_activated_experts = 2

		def forward(self, x: Tensor) -> Tensor:
			"""Route the tokens in ``x`` through their top experts.

			Args:
				x: Tensor whose trailing dimension is ``self.dim``; all
					leading dimensions are flattened into one token axis.

			Returns:
				Tensor reshaped to ``(x.shape[0], -1, self.dim)``.
			"""
			batch_size = x.shape[0]
			x = x.view(-1, self.dim)  # x: [T, D]
			scores = self.gate(x)  # [T, E]
			expert_weights = F.softmax(scores, dim=-1)
			expert_weights, expert_indices = torch.topk(expert_weights, self.num_activated_experts, dim=-1)  # [T, A], [T, A]
			# Re-normalise so the kept weights sum to 1 per token.
			expert_weights /= expert_weights.sum(dim=-1, keepdim=True)  # [T, A]
			out = self.cond_ffn(x, expert_indices, expert_weights, self.num_activated_experts)
			return out.reshape(batch_size, -1, self.dim)

	class ConditionalFeedForward(nn.Module):
		"""Bank of per-expert gated MLPs evaluated only for the routed tokens.

		Each expert ``e`` computes ``w2[e] @ (silu(w1[e] @ x) * (w3[e] @ x))``.

		Args:
			num_experts: Number of experts E.
			dim: Token width D.
			intermediate_dim: Hidden width I of each expert.
			device: Device the weights are created on.
			dtype: Dtype the weights are created with.

		The defaults reproduce the original hard-coded configuration
		(E=8, D=4, I=16, CUDA, bfloat16).
		"""

		def __init__(
			self,
			num_experts: int = 8,
			dim: int = 4,
			intermediate_dim: int = 16,
			device: str = "cuda",
			dtype: torch.dtype = torch.bfloat16,
		) -> None:
			super().__init__()
			# Creation order (w1, w2, w3) is kept so seeded runs draw the same values.
			self.w1 = nn.Parameter(torch.randn((num_experts, intermediate_dim, dim), device=device, dtype=dtype))  # E, I, D
			self.w2 = nn.Parameter(torch.randn((num_experts, dim, intermediate_dim), device=device, dtype=dtype))  # E, D, I
			self.w3 = nn.Parameter(torch.randn((num_experts, intermediate_dim, dim), device=device, dtype=dtype))  # E, I, D
			self.num_experts = num_experts

		@staticmethod
		def _expert_mlp(x: Tensor, w1: Tensor, w3: Tensor, w2: Tensor) -> Tensor:
			"""Apply one expert: ``linear(silu(linear(x, w1)) * linear(x, w3), w2)``."""
			return F.linear(F.silu(F.linear(x, w1)) * F.linear(x, w3), w2)

		def forward(
			self, x: Tensor,  # T, D
			expert_indices: Tensor,  # T, A
			expert_weights: Tensor,  # T, A
			num_activated_experts: int,
		) -> Tensor:
			"""Return the weighted sum of the selected experts for every token.

			Args:
				x: Tokens, ``[T, D]``.
				expert_indices: Integer expert ids chosen per token, ``[T, A]``.
				expert_weights: Mixing weight of each chosen expert, ``[T, A]``.
				num_activated_experts: A, the number of experts per token.

			Returns:
				``[T, D]`` tensor (``[1, D]`` on the single-token path).
			"""
			if x.shape[0] == 1:
				# Single token: gather just the A selected experts' weights.
				# reshape(-1) (rather than squeeze()) keeps a 1-D index even
				# when A == 1, so w1[i] is always one expert's [I, D] matrix.
				chosen = expert_indices.reshape(-1)
				w1 = self.w1[chosen]
				w2 = self.w2[chosen]
				w3 = self.w3[chosen]
				outs = [
					self._expert_mlp(x, w1[i], w3[i], w2[i])
					for i in range(num_activated_experts)
				]
				mixed_outs = torch.cat(outs, dim=0)  # A, D
				# Return [1, D] so both branches agree on the [T, D] layout.
				return (mixed_outs * expert_weights.reshape(-1, 1)).sum(dim=0).unsqueeze(0)

			_, dim = x.shape
			flat_experts = expert_indices.reshape(-1)  # T*A
			# Stable sort groups activation slots by expert while keeping
			# token order inside each expert's group.
			ordered_token_activations = flat_experts.argsort(stable=True)
			# Slot s of the flattened [T, A] layout belongs to token s // A.
			ordered_tokens = ordered_token_activations.div(
				num_activated_experts, rounding_mode="floor"
			)

			# Tokens per expert as Python ints: one host transfer instead of
			# one per slice bound.  bincount accepts integer ids on any device.
			tokens_per_expert = torch.bincount(
				flat_experts, minlength=self.num_experts
			).tolist()
			inputs = x[ordered_tokens].split(tokens_per_expert)  # E chunks of [T'(e), D]

			outs = [
				self._expert_mlp(cur_x, self.w1[expert], self.w3[expert], self.w2[expert])  # T', D
				for expert, cur_x in enumerate(inputs)
			]

			ordered_outs = torch.cat(outs, dim=0)  # T*A, D
			weight_order = expert_weights.reshape(-1, 1)[ordered_token_activations]  # T*A, 1
			weighted_ordered_outs = ordered_outs * weight_order

			# Sum every token's A weighted expert outputs back into its row.
			scatter_index = ordered_tokens.unsqueeze(-1).expand(-1, dim)
			return torch.zeros_like(x).scatter_add(dim=0, index=scatter_index, src=weighted_ordered_outs)


	# Driver: run the MoE layer eagerly in float64, eagerly in bfloat16, and
	# then through torch.compile, printing the output sum each time.
	moe = MOEFeedForward().to("cuda").to(torch.float64)

	# Two independent random inputs of shape (1, 6, 4), moved to CUDA as float64.
	input1 = torch.randn(1, 6, 4).to("cuda").to(torch.float64)
	input2 = torch.randn(1, 6, 4).to("cuda").to(torch.float64)

	with torch.no_grad():
		# Eager float64 run.
		out1 = moe(input1)
		print(out1.sum())
		out2 = moe(input2)
		print(out2.sum())

		# Cast the module and both inputs to bfloat16 and repeat the eager run.
		moe = moe.to(torch.bfloat16)
		input1 = input1.to(torch.bfloat16)
		input2 = input2.to(torch.bfloat16)

		out1 = moe(input1)
		print(out1.sum())
		out2 = moe(input2)
		print(out2.sum())

		# Compiled bfloat16 run.  The commented-out variant below is the
		# failing configuration this script was written to demonstrate.
		moe_c = torch.compile(moe, mode="reduce-overhead")  # working
		# moe_c = torch.compile(moe, mode="reduce-overhead", fullgraph=True) #this fails on token shuffle part

		# The results of these two calls are discarded; presumably they are
		# warm-up runs so compilation happens before the printed calls.
		moe_c(input1)
		moe_c(input2)
		out1c = moe_c(input1)
		print(out1c.sum())
		out2c = moe_c(input2)
		print(out2c.sum())