
# zip_fit/train/train.py
from typing import List, Optional, Dict

from transformers import (
    AutoTokenizer,
)


def seed_everything(seed: int = 42):
    """
    Seed the Python, NumPy, and torch RNGs so runs are reproducible.
    (The preview is truncated here; this docstring and body are an assumed completion.)
    """
    import random, numpy as np, torch  # local imports keep this truncated preview self-contained
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


# tfa.py
import os
import random

from tqdm import tqdm
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    PreTrainedModel,
)
brando90 / multigpu.md
Created February 5, 2025 18:36
train.py

Suhas Kotha (Monday at 5:54 PM): I've found this code to be a super simple and functioning multi-GPU training script: https://github.com/ZitongYang/Synthetic_Continued_Pretraining/blob/main/train.py. scripts/train.sh calls train.py; the number of GPUs is pulled from the number of available GPUs, and it uses the FSDP config specified in scripts/config/fsdp_config.json. train.py starts with:

from dataclasses import dataclass, field, asdict
from typing import Optional
import transformers
import os
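A minimal sketch of that pattern, assuming the Hugging Face Trainer's fsdp/fsdp_config arguments and a torchrun-style launch; the model name, toy dataset, and output directory below are placeholders, not taken from the linked repo, and the FSDP config path is only echoed from the quote above.

# Hypothetical sketch of the multi-GPU pattern described above.
# Launch idea: torchrun --nproc_per_node=<number of available GPUs> this_file.py
import torch
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments


class ToyDataset(Dataset):
    """A couple of tokenized examples so Trainer has something to iterate over (placeholder data)."""

    def __init__(self, tokenizer):
        enc = tokenizer(["hello world", "fsdp sketch"], padding="max_length",
                        max_length=16, truncation=True, return_tensors="pt")
        self.input_ids, self.attention_mask = enc["input_ids"], enc["attention_mask"]

    def __len__(self):
        return self.input_ids.size(0)

    def __getitem__(self, i):
        return {"input_ids": self.input_ids[i],
                "attention_mask": self.attention_mask[i],
                "labels": self.input_ids[i].clone()}


def main():
    print(f"available GPUs: {torch.cuda.device_count()}")  # torchrun uses this count for --nproc_per_node
    model_name = "gpt2"  # placeholder; substitute the model actually being trained
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_name)

    args = TrainingArguments(
        output_dir="out_fsdp_sketch",
        per_device_train_batch_size=1,
        num_train_epochs=1,
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
        fsdp="full_shard auto_wrap",                    # shard parameters/gradients across the GPUs
        fsdp_config="scripts/config/fsdp_config.json",  # path mentioned in the quote; must exist locally
    )
    Trainer(model=model, args=args, train_dataset=ToyDataset(tokenizer)).train()


if __name__ == "__main__":
    main()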

brando90 / method1_replace_0s_1s_with_rounding_err.py
Last active December 23, 2024 03:22
import numpy as np
from scipy.stats import beta
import matplotlib.pyplot as plt
from scipy.optimize import minimize
# Generate synthetic data from a latent Beta distribution
np.random.seed(42)
alpha_true, beta_true = 2, 5 # True Beta distribution parameters
n_samples = 1000
resolution = 1e-4
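The preview cuts off here. A minimal sketch, assuming the method the filename describes: quantize the Beta samples to the stated resolution, push exact 0s and 1s back inside (0, 1) by that same rounding error, then fit the Beta parameters by maximum likelihood with scipy.optimize.minimize. It reuses the imports and variables defined just above and is an assumed continuation, not the gist's actual code.

# Assumed continuation (not from the gist): replace exact 0s/1s with a rounding-error offset, then fit Beta by MLE.
samples = np.random.beta(alpha_true, beta_true, size=n_samples)
observed = np.round(samples / resolution) * resolution      # quantize to the measurement resolution
observed = np.clip(observed, resolution, 1.0 - resolution)  # "method 1": nudge exact 0s and 1s off the boundary

def neg_log_likelihood(params):
    a, b = params
    if a <= 0 or b <= 0:
        return np.inf
    return -np.sum(beta.logpdf(observed, a, b))

fit = minimize(neg_log_likelihood, x0=[1.0, 1.0], method="Nelder-Mead")
alpha_hat, beta_hat = fit.x
print(f"true=({alpha_true}, {beta_true})  fitted=({alpha_hat:.3f}, {beta_hat:.3f})")

# Visual check of the fit against the observed data.
xs = np.linspace(resolution, 1 - resolution, 500)
plt.hist(observed, bins=50, density=True, alpha=0.5, label="observed")
plt.plot(xs, beta.pdf(xs, alpha_hat, beta_hat), label="fitted Beta")
plt.legend()
plt.show()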
brando90 / multiple_gpus_1_file.py
Created December 16, 2024 21:44
def main():
    import os
    import sys
    import socket
    print(sys.executable)
    if socket.gethostname() == 'skampere1':
        print('Hardcoding the path since we are in skampere')
        sys.path = ['', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python311.zip', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python3.11', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python3.11/lib-dynload', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python3.11/site-packages', '/afs/cs.stanford.edu/u/brando9/beyond-scale-2-alignment-coeff/py_src', '/afs/cs.stanford.edu/u/brando9/ultimate-utils/py_src']
        print(f'{sys.path=}')
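The preview above only shows the host-specific sys.path setup. For context, here is a minimal single-file multi-GPU pattern, offered as an illustration under the assumption of a DDP-style setup (it is not the rest of this gist): spawn one process per visible GPU with torch.multiprocessing and wrap the model in DistributedDataParallel.

# Illustrative only: one process per GPU from a single file, using torch.distributed + DDP.
# Assumes the machine has at least one CUDA GPU and NCCL available.
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP


def worker(rank: int, world_size: int):
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

    model = DDP(torch.nn.Linear(10, 10).to(rank), device_ids=[rank])
    opt = torch.optim.SGD(model.parameters(), lr=0.1)

    # One toy step per process; DDP all-reduces the gradients across GPUs.
    loss = model(torch.randn(8, 10, device=rank)).pow(2).mean()
    loss.backward()
    opt.step()

    dist.destroy_process_group()


if __name__ == "__main__":
    world_size = torch.cuda.device_count()  # number of GPUs pulled from the machine, as in the gist title
    mp.spawn(worker, args=(world_size,), nprocs=world_size, join=True)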
brando90 / training_guidelines.md
Created November 29, 2024 21:59
nothing below 16 bits for training

Training Guidelines Summary

  • SFT: Use bf16 or fp32 for training; avoid 8-bit. For evaluation, fp16, bf16, or fp32 is fine. Follow established scripts for reliability (a minimal bf16 loading sketch follows this list).
  • Unsloth: Train LoRA with fp16, bf16, or fp32. Avoid 8-bit or lower unless validated by replicating the original experiments. No QLoRA unless the core setups are stable and everything before this has worked.
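A minimal sketch of the "nothing below 16 bits for training" rule, assuming a Hugging Face causal LM; the model name is a placeholder and bf16 availability depends on the GPU.

# Hedged illustration: load a causal LM in bf16 for SFT, falling back to fp32; never 8-bit for training.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # placeholder; substitute the model actually being fine-tuned
dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
print(f"training dtype: {next(model.parameters()).dtype}")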
import torch

# Create two matrices on the GPU
matrix_a = torch.rand((1000, 1000), device='cuda')
matrix_b = torch.rand((1000, 1000), device='cuda')

# Perform matrix sum
result = matrix_a + matrix_b

# Verify and print the device of the result (the preview cuts off here; this print is an assumed completion)
print(result.device)
brando90 / gemma_tok_how_does_mask_look_if_eos_pad_both_present_in_tok.py
Last active November 21, 2024 04:43
Gemma 2 2B tokenizer: properly adding EOS, padding, and masking
# ref: https://chatgpt.com/c/673e8232-0a18-8001-9fb5-ed1262bf267f
# ref: https://gist.github.com/brando90/4cd94ad3730218dca75dba779f770c9d
from transformers import AutoTokenizer


def analyze_tokenizer_output(model_name, text, pad_token="<pad>", eos_token="</s>", max_length=20):
    """
    Analyzes the tokenizer output, including the attention mask and labels,
    when eos_token and pad_token are present.
    """
    # Load the tokenizer
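    # --- Assumed continuation; the gist preview is truncated above this line. ---
    # Per the gist title, the point is to give the tokenizer a distinct pad token, append EOS,
    # and then inspect how the attention mask and labels treat pad vs. EOS positions.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": pad_token})

    enc = tokenizer(
        text + (tokenizer.eos_token or eos_token),
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )
    # Labels copy input_ids but mask pad positions with -100 so the loss ignores them,
    # while the EOS position keeps its label and is still trained on.
    labels = enc["input_ids"].clone()
    labels[enc["attention_mask"] == 0] = -100
    print(f"input_ids:      {enc['input_ids']}")
    print(f"attention_mask: {enc['attention_mask']}")
    print(f"labels:         {labels}")
    return enc, labels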
brando90 / teacher_forced_accuracy.py
Created November 21, 2024 00:29
# ref: https://chatgpt.com/share/673e7ef2-23cc-8001-b682-3ff4b66c797a
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


def compute_tfa(model, tokenizer, input_texts):
    """
    Computes Teacher-Forced Accuracy (TFA), rewarding the model for correctly predicting
    the first EOS token while ignoring predictions for padding tokens.

    Parameters:
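        model: a causal language model in eval mode.
        tokenizer: the matching tokenizer (a pad token must be set, or is set below).
        input_texts: list of strings to score.

    (The gist preview is truncated at "Parameters:"; the parameter notes and the body below
    are an assumed, simplified reconstruction, not the gist's actual code.)
    """
    # Teacher forcing: feed the full sequences and compare next-token argmax predictions
    # against the shifted inputs, ignoring padding positions.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    enc = tokenizer(input_texts, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**enc).logits  # (batch, seq_len, vocab)

    preds = logits[:, :-1, :].argmax(dim=-1)    # prediction for token t comes from position t-1
    targets = enc["input_ids"][:, 1:]
    mask = enc["attention_mask"][:, 1:].bool()  # drop padding positions (EOS itself keeps its reward)

    correct = (preds == targets) & mask
    return correct.sum().item() / mask.sum().item()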
{
    "source": "...",
    "id": "...",
    "attributes": {
        "compression_ratio_zstd": 0.7
    }
}
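The record above looks like a per-document quality annotation; the sketch below shows one plausible way such a compression_ratio_zstd attribute could be computed with the zstandard package. The field names are copied from the record; the "text" field, the helper name, and the compression level are assumptions.

# Hedged sketch: annotate a record with its zstd compression ratio (compressed bytes / raw bytes).
import json

import zstandard as zstd


def compression_ratio_zstd(text: str, level: int = 3) -> float:
    raw = text.encode("utf-8")
    compressed = zstd.ZstdCompressor(level=level).compress(raw)
    return len(compressed) / len(raw)


record = {"source": "...", "id": "...", "text": "some document text goes here"}  # "text" field is assumed
record["attributes"] = {"compression_ratio_zstd": round(compression_ratio_zstd(record["text"]), 4)}
print(json.dumps(record, indent=2))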