
# zip_fit/train/train.py
from typing import List, Optional, Dict

from transformers import (
    AutoTokenizer,
)


def seed_everything(seed: int = 42):
    """
    Seed the Python, NumPy, and torch RNGs so runs are reproducible.
    (The preview is truncated here; this docstring and body are an assumed completion.)
    """
    import random, numpy as np, torch  # local imports keep this truncated preview self-contained
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


# tfa.py
import os
import random

from tqdm import tqdm
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    PreTrainedModel,
)
brando90 / multigpu.md
Created February 5, 2025 18:36
train.py

Suhas Kotha (Monday at 5:54 PM): I've found this code to be a super simple and functioning multi-GPU training script: https://github.com/ZitongYang/Synthetic_Continued_Pretraining/blob/main/train.py. scripts/train.sh calls train.py; the number of GPUs is pulled from the number of available GPUs, and it uses the FSDP config specified in scripts/config/fsdp_config.json. train.py starts with:

from dataclasses import dataclass, field, asdict
from typing import Optional
import transformers
import os
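A minimal sketch of that pattern, assuming the Hugging Face Trainer's fsdp/fsdp_config arguments and a torchrun-style launch; the model name, toy dataset, and output directory below are placeholders, not taken from the linked repo, and the FSDP config path is only echoed from the quote above.

# Hypothetical sketch of the multi-GPU pattern described above.
# Launch idea: torchrun --nproc_per_node=<number of available GPUs> this_file.py
import torch
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments


class ToyDataset(Dataset):
    """A couple of tokenized examples so Trainer has something to iterate over (placeholder data)."""

    def __init__(self, tokenizer):
        enc = tokenizer(["hello world", "fsdp sketch"], padding="max_length",
                        max_length=16, truncation=True, return_tensors="pt")
        self.input_ids, self.attention_mask = enc["input_ids"], enc["attention_mask"]

    def __len__(self):
        return self.input_ids.size(0)

    def __getitem__(self, i):
        return {"input_ids": self.input_ids[i],
                "attention_mask": self.attention_mask[i],
                "labels": self.input_ids[i].clone()}


def main():
    print(f"available GPUs: {torch.cuda.device_count()}")  # torchrun uses this count for --nproc_per_node
    model_name = "gpt2"  # placeholder; substitute the model actually being trained
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_name)

    args = TrainingArguments(
        output_dir="out_fsdp_sketch",
        per_device_train_batch_size=1,
        num_train_epochs=1,
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
        fsdp="full_shard auto_wrap",                    # shard parameters/gradients across the GPUs
        fsdp_config="scripts/config/fsdp_config.json",  # path mentioned in the quote; must exist locally
    )
    Trainer(model=model, args=args, train_dataset=ToyDataset(tokenizer)).train()


if __name__ == "__main__":
    main()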

brando90 / method1_replace_0s_1s_with_rounding_err.py
Last active December 23, 2024 03:22
import numpy as np
from scipy.stats import beta
import matplotlib.pyplot as plt
from scipy.optimize import minimize
# Generate synthetic data from a latent Beta distribution
np.random.seed(42)
alpha_true, beta_true = 2, 5 # True Beta distribution parameters
n_samples = 1000
resolution = 1e-4
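The preview cuts off here. A minimal sketch, assuming the method the filename describes: quantize the Beta samples to the stated resolution, push exact 0s and 1s back inside (0, 1) by that same rounding error, then fit the Beta parameters by maximum likelihood with scipy.optimize.minimize. It reuses the imports and variables defined just above and is an assumed continuation, not the gist's actual code.

# Assumed continuation (not from the gist): replace exact 0s/1s with a rounding-error offset, then fit Beta by MLE.
samples = np.random.beta(alpha_true, beta_true, size=n_samples)
observed = np.round(samples / resolution) * resolution      # quantize to the measurement resolution
observed = np.clip(observed, resolution, 1.0 - resolution)  # "method 1": nudge exact 0s and 1s off the boundary

def neg_log_likelihood(params):
    a, b = params
    if a <= 0 or b <= 0:
        return np.inf
    return -np.sum(beta.logpdf(observed, a, b))

fit = minimize(neg_log_likelihood, x0=[1.0, 1.0], method="Nelder-Mead")
alpha_hat, beta_hat = fit.x
print(f"true=({alpha_true}, {beta_true})  fitted=({alpha_hat:.3f}, {beta_hat:.3f})")

# Visual check of the fit against the observed data.
xs = np.linspace(resolution, 1 - resolution, 500)
plt.hist(observed, bins=50, density=True, alpha=0.5, label="observed")
plt.plot(xs, beta.pdf(xs, alpha_hat, beta_hat), label="fitted Beta")
plt.legend()
plt.show()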
brando90 / multiple_gpus_1_file.py
Created December 16, 2024 21:44
def main():
    import os
    import sys
    import socket
    print(sys.executable)
    if socket.gethostname() == 'skampere1':
        print('Hardcoding the path since we are in skampere')
        sys.path = ['', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python311.zip', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python3.11', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python3.11/lib-dynload', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python3.11/site-packages', '/afs/cs.stanford.edu/u/brando9/beyond-scale-2-alignment-coeff/py_src', '/afs/cs.stanford.edu/u/brando9/ultimate-utils/py_src']
        print(f'{sys.path=}')
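The preview above only shows the host-specific sys.path setup. For context, here is a minimal single-file multi-GPU pattern, offered as an illustration under the assumption of a DDP-style setup (it is not the rest of this gist): spawn one process per visible GPU with torch.multiprocessing and wrap the model in DistributedDataParallel.

# Illustrative only: one process per GPU from a single file, using torch.distributed + DDP.
# Assumes the machine has at least one CUDA GPU and NCCL available.
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP


def worker(rank: int, world_size: int):
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

    model = DDP(torch.nn.Linear(10, 10).to(rank), device_ids=[rank])
    opt = torch.optim.SGD(model.parameters(), lr=0.1)

    # One toy step per process; DDP all-reduces the gradients across GPUs.
    loss = model(torch.randn(8, 10, device=rank)).pow(2).mean()
    loss.backward()
    opt.step()

    dist.destroy_process_group()


if __name__ == "__main__":
    world_size = torch.cuda.device_count()  # number of GPUs pulled from the machine, as in the gist title
    mp.spawn(worker, args=(world_size,), nprocs=world_size, join=True)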
brando90 / training_guidelines.md
Created November 29, 2024 21:59
nothing below 16 bits for training

Training Guidelines Summary

  • SFT: Use bf16 or fp32 for training; avoid 8-bit. For evaluation, fp16, bf16, or fp32 is fine. Follow established scripts for reliability (a minimal bf16 loading sketch follows this list).
  • Unsloth: Train LoRA with fp16, bf16, or fp32. Avoid 8-bit or lower unless validated by replicating the original experiments. No QLoRA unless the core setups are stable and everything before this has worked.
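A minimal sketch of the "nothing below 16 bits for training" rule, assuming a Hugging Face causal LM; the model name is a placeholder and bf16 availability depends on the GPU.

# Hedged illustration: load a causal LM in bf16 for SFT, falling back to fp32; never 8-bit for training.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # placeholder; substitute the model actually being fine-tuned
dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
print(f"training dtype: {next(model.parameters()).dtype}")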
import torch

# Create two matrices on the GPU
matrix_a = torch.rand((1000, 1000), device='cuda')
matrix_b = torch.rand((1000, 1000), device='cuda')

# Perform matrix sum
result = matrix_a + matrix_b

# Verify and print the device of the result (the preview cuts off here; this print is an assumed completion)
print(result.device)
brando90 / gemma_tok_how_does_mask_look_if_eos_pad_both_present_in_tok.py
Last active November 21, 2024 04:43
Gemma 2 2B tokenizer: properly adding EOS, padding, and masking
# ref: https://chatgpt.com/c/673e8232-0a18-8001-9fb5-ed1262bf267f
# ref: https://gist.github.com/brando90/4cd94ad3730218dca75dba779f770c9d
from transformers import AutoTokenizer


def analyze_tokenizer_output(model_name, text, pad_token="<pad>", eos_token="</s>", max_length=20):
    """
    Analyzes the tokenizer output, including the attention mask and labels,
    when eos_token and pad_token are present.
    """
    # Load the tokenizer
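    # --- Assumed continuation; the gist preview is truncated above this line. ---
    # Per the gist title, the point is to give the tokenizer a distinct pad token, append EOS,
    # and then inspect how the attention mask and labels treat pad vs. EOS positions.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": pad_token})

    enc = tokenizer(
        text + (tokenizer.eos_token or eos_token),
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )
    # Labels copy input_ids but mask pad positions with -100 so the loss ignores them,
    # while the EOS position keeps its label and is still trained on.
    labels = enc["input_ids"].clone()
    labels[enc["attention_mask"] == 0] = -100
    print(f"input_ids:      {enc['input_ids']}")
    print(f"attention_mask: {enc['attention_mask']}")
    print(f"labels:         {labels}")
    return enc, labels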
brando90 / teacher_forced_accuracy.py
Created November 21, 2024 00:29
# ref: https://chatgpt.com/share/673e7ef2-23cc-8001-b682-3ff4b66c797a
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


def compute_tfa(model, tokenizer, input_texts):
    """
    Computes Teacher-Forced Accuracy (TFA), rewarding the model for correctly predicting
    the first EOS token while ignoring predictions for padding tokens.

    Parameters:
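        model: a causal language model in eval mode.
        tokenizer: the matching tokenizer (a pad token must be set, or is set below).
        input_texts: list of strings to score.

    (The gist preview is truncated at "Parameters:"; the parameter notes and the body below
    are an assumed, simplified reconstruction, not the gist's actual code.)
    """
    # Teacher forcing: feed the full sequences and compare next-token argmax predictions
    # against the shifted inputs, ignoring padding positions.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    enc = tokenizer(input_texts, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**enc).logits  # (batch, seq_len, vocab)

    preds = logits[:, :-1, :].argmax(dim=-1)    # prediction for token t comes from position t-1
    targets = enc["input_ids"][:, 1:]
    mask = enc["attention_mask"][:, 1:].bool()  # drop padding positions (EOS itself keeps its reward)

    correct = (preds == targets) & mask
    return correct.sum().item() / mask.sum().item()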
{
    "source": "...",
    "id": "...",
    "attributes": {
        "compression_ratio_zstd": 0.7
    }
}
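The record above looks like a per-document quality annotation; the sketch below shows one plausible way such a compression_ratio_zstd attribute could be computed with the zstandard package. The field names are copied from the record; the "text" field, the helper name, and the compression level are assumptions.

# Hedged sketch: annotate a record with its zstd compression ratio (compressed bytes / raw bytes).
import json

import zstandard as zstd


def compression_ratio_zstd(text: str, level: int = 3) -> float:
    raw = text.encode("utf-8")
    compressed = zstd.ZstdCompressor(level=level).compress(raw)
    return len(compressed) / len(raw)


record = {"source": "...", "id": "...", "text": "some document text goes here"}  # "text" field is assumed
record["attributes"] = {"compression_ratio_zstd": round(compression_ratio_zstd(record["text"]), 4)}
print(json.dumps(record, indent=2))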