Skip to content

Instantly share code, notes, and snippets.

View dhbrojas's full-sized avatar
🐉
Code LLMs @ Zhipu.AI, THU

罗杰斯 dhbrojas

🐉
Code LLMs @ Zhipu.AI, THU
View GitHub Profile
@dhbrojas
dhbrojas / config.json
Last active July 19, 2025 16:18
Minitron, LLM Training
{
"architectures": ["Qwen3ForCausalLM"],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 1024,
"initializer_range": 0.02,
@dhbrojas
dhbrojas / acc.py
Created July 21, 2025 11:05
Linear Gradient Accumulation Schedule
import torch
class GradientAccumulationSchedule:
"""
A schedule that linearly increases the number of gradient accumulation
steps throughout training to converge faster.
"""
def __init__(self, *, min: int, max: int, steps: int, factor: int | None = None):
import torch
from tqdm import tqdm
from torch.nn import Module
from torch.nn.functional import cross_entropy
from transformers import (
AutoConfig,
AutoModelForCausalLM,
)
BATCH = 16
from typing import Callable, Protocol
import torch
from torch import Tensor
from torch.nn import Linear, Module
from torch.nn.functional import silu
def compute_frequencies(
*,
@dhbrojas
dhbrojas / wsd.py
Created July 28, 2025 12:24
Warmup Stable Decay LR
import math
def warmup_stable_decay(*, W: int, S: int, D: int, min_lr_scale_factor: float = 0.1):
"""
Returns a lambda function for PyTorch's LambdaLR scheduler implementing the
WSD learning rate schedule.
Parameters:
- W: The last step of the warmup phase.