Skip to content

Instantly share code, notes, and snippets.

View ashvinnihalani's full-sized avatar

Ashvin Nihalani ashvinnihalani

View GitHub Profile
------------------------ arguments ------------------------
account_for_embedding_in_pipeline_split ......... False
account_for_loss_in_pipeline_split .............. False
accumulate_allreduce_grads_in_fp32 .............. True # Double check Good
adam_beta1 ...................................... 0.9
adam_beta2 ...................................... 0.95
adam_eps ........................................ 1e-08
add_bias_linear ................................. False
add_position_embedding .......................... True # Double Check
add_qkv_bias .................................... False
from kubernetes import client, config
def terminate_vcjob(vcjob: dict, namespace):
vcjob_name = vcjob["metadata"]["name"]
api_instance = client.CustomObjectsApi()
try:
vcjob["status"]["state"]["phase"] = "Aborted"
api_instance.replace_namespaced_custom_object_status(
group="batch.volcano.sh",
min_lr=config.min_lr,
decoupled_lr=config.decoupled_lr,
decoupled_min_lr=config.decoupled_min_lr,
)
param_groups = list(filter(filter_fn, param_groups))
param_groups.sort(key=lambda g: g.get("wd_mult", 1.0), reverse=True)
buffers = {}
for model_chunk_idx, model_chunk in enumerate(model_chunks):
if hasattr(model_chunk, buffer_name):
@ashvinnihalani
ashvinnihalani / lovely_tensors.py
Created May 29, 2025 01:51
Checkpoint Statistics
"""
Launch like:
OMP_NUM_THREADS=1 LOGLEVEL=WARNING torchrun --rdzv-backend static --master-addr ${VC_TRAIN_0_HOSTS:-$(hostname --fqdn)} \
--node-rank $NODE_INDEX --nnodes $NNODES --nproc-per-node 8 /shared/workspace/rynli/check_nans_ckpt.py
"""
import os
{'Overall-Art and Design': {'num': 120, 'mmmu_acc': 0.25}, 'Art': {'num': 30, 'mmmu_acc': 0.333}, 'Art_Theory': {'num': 30, 'mmmu_acc': 0.233}, 'Design': {'num': 30, 'mmmu_acc': 0.167}, 'Music': {'num': 30, 'mmmu_acc': 0.267}, 'Overall-Business': {'num': 150, 'mmmu_acc': 0.233}, 'Accounting': {'num': 30, 'mmmu_acc': 0.4}, 'Economics': {'num': 30, 'mmmu_acc': 0.233}, 'Finance': {'num': 30, 'mmmu_acc': 0.167}, 'Manage': {'num': 30, 'mmmu_acc': 0.133}, 'Marketing': {'num': 30, 'mmmu_acc': 0.233}, 'Overall-Science': {'num': 150, 'mmmu_acc': 0.193}, 'Biology': {'num': 30, 'mmmu_acc': 0.267}, 'Chemistry': {'num': 30, 'mmmu_acc': 0.1}, 'Geography': {'num': 30, 'mmmu_acc': 0.2}, 'Math': {'num': 30, 'mmmu_acc': 0.233}, 'Physics': {'num': 30, 'mmmu_acc': 0.167}, 'Overall-Health and Medicine': {'num': 150, 'mmmu_acc': 0.227}, 'Basic_Medical_Science': {'num': 30, 'mmmu_acc': 0.3}, 'Clinical_Medicine': {'num': 30, 'mmmu_acc': 0.2}, 'Diagnostics_and_Laboratory_Medicine': {'num': 30, 'mmmu_acc': 0.133}, 'Pharmacy': {'num':
@ashvinnihalani
ashvinnihalani / Readme.md
Last active November 19, 2021 21:27
Podman Rootless GPU Accelerated Training