Skip to content

Instantly share code, notes, and snippets.

View TeaPoly's full-sized avatar

Lucky Wong TeaPoly

View GitHub Profile
#!/usr/bin/python
# -*- coding: utf-8 -*-
import torch
def round_ste(x: torch.Tensor) -> torch.Tensor:
    """Round ``x`` to the nearest integer with a straight-through gradient.

    The forward value is ``floor(x + 0.5)``, so exact halves are rounded
    upward (e.g. ``0.5 -> 1`` and ``-0.5 -> 0``). The rounded term is
    detached from the autograd graph; the gradient flows only through the
    ``x - x.detach()`` term, which is zero in value but has derivative 1
    with respect to ``x``.

    Args:
        x: Input tensor.

    Returns:
        A tensor holding the rounded values of ``x`` whose gradient with
        respect to ``x`` is the identity.
    """
    # Detached so the zero-almost-everywhere derivative of floor is not used.
    rounded = torch.floor(x + 0.5).detach()
    # Contributes nothing to the value, but carries the gradient back to x.
    passthrough = x - x.detach()
    return rounded + passthrough
#!/usr/bin/python
# -*- coding: utf-8 -*-
import torch
def absmean_binarize(x, contract_dims, centralize=False, eps=1e-8):
if centralize:
mean = torch.mean(x, dim=contract_dims, keepdim=True)
x = x - mean
@TeaPoly
TeaPoly / deepseek_v3_moe.py
Last active April 8, 2025 10:30
DeepSeek V3 MoE with auxiliary-loss-free load balancing and a sequence-wise auxiliary loss.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright 2024 Lucky Wong
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#