Suhas Kotha  Monday at 5:54 PM
I've found this code to be a super simple, working multi-GPU training script: https://github.com/ZitongYang/Synthetic_Continued_Pretraining/blob/main/train.py
scripts/train.sh calls train.py. The number of GPUs is pulled from the number of available GPUs, and it uses the FSDP config specified in scripts/config/fsdp_config.json.

train.py
from dataclasses import dataclass, field, asdict
from typing import Optional
import transformers
import os
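For context, a minimal sketch of how a launcher like scripts/train.sh can pick up the available GPU count and hand the FSDP config to train.py. This assumes train.py parses transformers.TrainingArguments (so --fsdp and --fsdp_config are accepted); the actual flags in the repo's script may differ, and only the paths come from the message above.

# Sketch: detect GPUs and launch train.py with torchrun, passing the FSDP config.
import subprocess
import torch

num_gpus = torch.cuda.device_count()  # "number of available gpus"
subprocess.run(
    [
        "torchrun",
        f"--nproc_per_node={num_gpus}",
        "train.py",
        "--fsdp", "full_shard auto_wrap",
        "--fsdp_config", "scripts/config/fsdp_config.json",
    ],
    check=True,
)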
# zip_fit/train/train.py
from typing import List, Optional, Dict
from transformers import (
    AutoTokenizer,
)

def seed_everything(seed: int = 42):
    """
# tfa.py
import os
import random
from tqdm import tqdm
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    PreTrainedModel,
import numpy as np
from scipy.stats import beta
import matplotlib.pyplot as plt
from scipy.optimize import minimize

# Generate synthetic data with latent Beta distribution
np.random.seed(42)
alpha_true, beta_true = 2, 5  # True Beta distribution parameters
n_samples = 1000
resolution = 1e-4
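A minimal sketch of the kind of fit this sets up: draw samples from the true Beta and recover (alpha, beta) by minimizing the negative log-likelihood with scipy.optimize.minimize. The snippet above is truncated, so the original's objective may differ (e.g. it may model data quantized at the given resolution).

import numpy as np
from scipy.stats import beta
from scipy.optimize import minimize

np.random.seed(42)
samples = beta.rvs(2, 5, size=1000)  # synthetic data from the true Beta(2, 5)

def neg_log_likelihood(params):
    a, b = params
    if a <= 0 or b <= 0:  # keep the optimizer inside the valid parameter region
        return np.inf
    return -np.sum(beta.logpdf(samples, a, b))

result = minimize(neg_log_likelihood, x0=[1.0, 1.0], method="Nelder-Mead")
print(result.x)  # should land near (2, 5)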
def main():
    import os
    import sys
    import socket
    print(sys.executable)
    if socket.gethostname() == 'skampere1':
        # Hardcode sys.path on the skampere1 machine so the intended conda env
        # and project source trees are the ones that get imported.
        print('Hardcoding the path since we are in skampere')
        sys.path = ['', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python311.zip', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python3.11', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python3.11/lib-dynload', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python3.11/site-packages', '/afs/cs.stanford.edu/u/brando9/beyond-scale-2-alignment-coeff/py_src', '/afs/cs.stanford.edu/u/brando9/ultimate-utils/py_src']
        print(f'{sys.path=}')
- SFT: Use bf16 or fp32 for training; avoid 8-bit. For evaluation, fp16, bf16, or fp32 is fine. Follow established scripts for reliability (see the sketch after this list).
- Unsloth: Train LoRA with fp16, bf16, or fp32. Avoid 8-bit or lower unless you have validated it by replicating the original experiments. No QLoRA until the core setups are stable and everything before this has worked.
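As a concrete illustration of the bf16 recommendation, here is how the precision would typically be pinned with standard transformers TrainingArguments; the values are hypothetical and not taken from any specific script above.

from transformers import TrainingArguments

# bf16 for SFT training, as recommended above; fall back to fp32 if the GPU
# has no bf16 support (e.g. pre-Ampere cards).
training_args = TrainingArguments(
    output_dir="outputs/sft_run",        # hypothetical path
    bf16=True,                           # train in bfloat16
    fp16=False,                          # do not mix with fp16
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    num_train_epochs=1,
)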
import torch

# Create two matrices on the GPU
matrix_a = torch.rand((1000, 1000), device='cuda')
matrix_b = torch.rand((1000, 1000), device='cuda')

# Perform matrix sum
result = matrix_a + matrix_b

# Verify and print device of the result
print(result.device)  # expected: cuda:0
# ref: https://chatgpt.com/c/673e8232-0a18-8001-9fb5-ed1262bf267f
# ref: https://gist.github.com/brando90/4cd94ad3730218dca75dba779f770c9d
from transformers import AutoTokenizer

def analyze_tokenizer_output(model_name, text, pad_token="<pad>", eos_token="</s>", max_length=20):
    """
    Analyzes the tokenizer output, including the attention mask and labels,
    when eos_token and pad_token are present.
    """
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
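Since the gist preview stops here, a rough sketch of the rest of such an analysis (my reconstruction under the docstring above, not the gist's exact code): register the pad token, tokenize with padding and an explicit EOS, and build labels that mask padding with -100 so only real tokens are scored.

import torch
from transformers import AutoTokenizer

def analyze_tokenizer_output_sketch(model_name, text, pad_token="<pad>", eos_token="</s>", max_length=20):
    # Load the tokenizer and make sure a pad token exists.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": pad_token})

    # Append EOS explicitly, then pad/truncate to a fixed length.
    enc = tokenizer(
        text + eos_token,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # Labels: copy of input_ids with padding positions set to -100, which the
    # cross-entropy loss in transformers ignores.
    labels = enc["input_ids"].clone()
    labels[enc["attention_mask"] == 0] = -100

    print("input_ids:     ", enc["input_ids"])
    print("attention_mask:", enc["attention_mask"])
    print("labels:        ", labels)

# Example usage (hypothetical model choice):
# analyze_tokenizer_output_sketch("gpt2", "Hello world", eos_token="<|endoftext|>")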
# ref: https://chatgpt.com/share/673e7ef2-23cc-8001-b682-3ff4b66c797a
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def compute_tfa(model, tokenizer, input_texts):
    """
    Computes Teacher-Forced Accuracy (TFA), rewarding the model for correctly predicting
    the first EOS token while ignoring predictions for padding tokens.

    Parameters:
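The preview is cut off at the docstring; a minimal sketch of one way to compute teacher-forced accuracy (my reconstruction from the description above, not necessarily the gist's implementation): run a forward pass on the padded batch, take the argmax next-token predictions, and score them against the shifted inputs while masking padding.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def compute_tfa_sketch(model, tokenizer, input_texts):
    # Teacher forcing: feed the ground-truth tokens and check whether the model
    # predicts each next token correctly; padding positions are excluded.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    enc = tokenizer(input_texts, return_tensors="pt", padding=True)
    input_ids = enc["input_ids"]
    attention_mask = enc["attention_mask"]

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    # Predictions at position t are for token t+1, so shift by one.
    preds = logits[:, :-1, :].argmax(dim=-1)
    targets = input_ids[:, 1:]
    valid = attention_mask[:, 1:].bool()  # ignore padding targets

    correct = (preds == targets) & valid
    return correct.sum().item() / valid.sum().item()

# Example usage (hypothetical small model):
# tok = AutoTokenizer.from_pretrained("gpt2")
# mdl = AutoModelForCausalLM.from_pretrained("gpt2")
# print(compute_tfa_sketch(mdl, tok, ["The capital of France is Paris."]))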
{
"source": "...",
"id": "...",
"attributes": {
"compression_ratio_zstd": 0.7
}
}
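For the compression_ratio_zstd attribute, one plausible way to compute it with the zstandard package; the convention here (compressed size divided by raw size, consistent with a value like 0.7) is an assumption, and the record fields are hypothetical.

import json
import zstandard as zstd

def zstd_compression_ratio(text: str, level: int = 3) -> float:
    # Ratio of compressed byte length to raw byte length; lower means the text
    # is more compressible (more redundant).
    raw = text.encode("utf-8")
    compressed = zstd.ZstdCompressor(level=level).compress(raw)
    return len(compressed) / len(raw)

record = {
    "source": "example_source",  # hypothetical values
    "id": "doc-000001",
    "attributes": {},
}
record["attributes"]["compression_ratio_zstd"] = zstd_compression_ratio("some document text " * 50)
print(json.dumps(record, indent=2))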