@brando90
Created February 7, 2025 15:35
train.py
# zip_fit/train/train.py
from typing import List, Optional, Dict
from transformers import (
AutoTokenizer,
)
def seed_everything(seed: int = 42):
"""
Seed Python, NumPy, and PyTorch for reproducibility.
"""
import random
import numpy as np
from transformers import set_seed as hf_set_seed
import torch
print(f"Setting random seed = {seed}")
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
if torch.cuda.is_available():
hf_set_seed(seed)
else:
print("Warning: Transformers is only fully deterministic on GPU")
try:
from vllm import set_seed as vllm_set_seed
vllm_set_seed(seed)
except ImportError:
print("vLLM not installed or vllm set seed has a bug, skipping vLLM seed setting.")
def tokenize_and_group_texts_via_blocks(
    examples: Dict[str, List[str]],  # batched=True passes a list of strings under 'text'; each output block is one sequence of block_size tokens formed by concatenating tokenized rows from the dataset.
tokenizer: AutoTokenizer,
block_size: int,
) -> Dict[str, "torch.Tensor"]:
"""
Tokenizes a batch of raw text examples and groups the tokens into fixed-size blocks.
This function is designed for use with Hugging Face datasets (with batched=True). It processes
a batch of examples (where each example is a string under the 'text' key) by performing the following steps:
1. **Retrieve Special Token IDs:**
It retrieves the beginning-of-sequence (BOS) token ID and the end-of-sequence (EOS) token ID from the tokenizer.
- The BOS token ID is obtained from `tokenizer.bos_token_id` (if available).
- The EOS token ID is obtained from `tokenizer.eos_token_id`.
2. **Per-Text Tokenization with Special Tokens:**
For each text in the input list:
- It prepends the BOS token (if available).
- Tokenizes the text using the tokenizer (without adding any special tokens automatically).
- Appends the EOS token to the token list.
This is done individually for each text so that we can explicitly control the addition of special tokens.
3. **Concatenation:**
All token lists are concatenated into one long list of tokens. This is achieved using the `chain` function
to flatten the list of lists.
4. **Truncation:**
The total token list is truncated so that its length is an exact multiple of `block_size`.
This is done by discarding any tokens at the end that would not complete a full block.
5. **Blocking:**
The truncated token list is split into contiguous blocks, each of length `block_size`.
6. **Label Creation:**
For language modeling tasks, the function creates a 'labels' field that is an exact copy of the 'input_ids'
blocks. Both fields are converted into PyTorch tensors with dtype torch.long.
**Note on the attention mask field:**
The returned dictionary does not include an `attention_mask` field. Including an attention mask with a value of `None`
may cause issues with the Hugging Face Trainer, which expects either a valid tensor or for the key to be omitted entirely.
Args:
examples (Dict[str, List[str]]):
A dictionary representing a batch of examples from the dataset. It must contain a key 'text'
whose value is a list of strings (one per example).
tokenizer (AutoTokenizer):
A tokenizer instance (e.g., from Hugging Face's transformers) that can process a single text string.
The tokenizer should return a dictionary containing at least the key 'input_ids' when called with a text,
and it must have attributes `bos_token_id` (which may be None) and `eos_token_id`.
block_size (int):
The desired number of tokens per block (for example, 1024 or 4096).
Returns:
Dict[str, torch.Tensor]:
A dictionary where:
- 'input_ids' maps to a PyTorch tensor of token blocks, each block being a list of token IDs of length `block_size`
and of dtype torch.long.
- 'labels' maps to a copy of the 'input_ids' tensor.
**Note:** No `attention_mask` is included in the returned dictionary.
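    Example (illustrative, with made-up token IDs, BOS=1, EOS=2, block_size=4):
        Two texts tokenizing to [7, 8] and [9, 10, 11] become [1, 7, 8, 2] and [1, 9, 10, 11, 2];
        concatenating gives 9 tokens, which truncates to 8 and yields the blocks
        [[1, 7, 8, 2], [1, 9, 10, 11]]. 'labels' is an identical copy of 'input_ids'.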
"""
from itertools import chain
# -------------------------------------------------------------------------
# Step 1: Retrieve Special Token IDs.
# Retrieve the end-of-sequence (EOS) token ID from the tokenizer.
eos_token_id: int = tokenizer.eos_token_id
# Retrieve the beginning-of-sequence (BOS) token ID from the tokenizer.
# If not used, bos_token_id will be None.
bos_token_id: Optional[int] = tokenizer.bos_token_id
# pad_token: Optional[int] = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_token_id
# -------------------------------------------------------------------------
# Step 2: Per-Text Tokenization with Special Tokens.
# For each text in the input list, perform the following:
# - Prepend the BOS token if it exists.
# - Tokenize the text to get a dictionary; extract the token IDs from 'input_ids'.
# - Append the EOS token to the token list.
# The list comprehension processes each text individually.
token_lists: List[List[int]] = [
# If a BOS token exists, prepend it; otherwise, use an empty list.
([bos_token_id] if bos_token_id is not None else []) +
tokenizer(text)['input_ids'] + [eos_token_id] # Tokenize text and append EOS token.
for text in examples["text"]
]
# -------------------------------------------------------------------------
# Step 3: Concatenate tokenized outputs across the batch.
# Flatten the list of token lists into a single long list using chain.
concatenated_tokens: List[int] = list(chain(*token_lists))
# -------------------------------------------------------------------------
# Step 4: Compute the total length and adjust to an exact multiple of block_size.
total_length: int = len(concatenated_tokens)
# Calculate the number of complete blocks that can be formed.
num_blocks: int = total_length // block_size
# Adjust total_length to discard any extra tokens that do not form a complete block.
total_length = num_blocks * block_size
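    # e.g. (illustrative): with block_size=1024 and total_length=2500, num_blocks=2,
    # total_length becomes 2048, and the trailing 452 tokens are discarded.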
# -------------------------------------------------------------------------
# Step 5: Split the concatenated token list into blocks of fixed size.
# The list comprehension iterates over the concatenated token list in steps of block_size,
# slicing out a block each time.
all_token_blocks: List[List[int]] = [
concatenated_tokens[i: i + block_size] for i in range(0, total_length, block_size)
]
# -------------------------------------------------------------------------
# Step 6: Create labels for language modeling tasks.
# It is common for language models to use the input_ids as labels.
# Convert the list of token blocks into PyTorch tensors with dtype=torch.long.
import torch
result: Dict[str, torch.Tensor] = {
"input_ids": torch.tensor(all_token_blocks, dtype=torch.long),
"labels": torch.tensor(all_token_blocks.copy(), dtype=torch.long),
}
return result
def main_train(config: dict = {}) -> dict:
"""
Trains the model using the provided configuration, saves the final checkpoint (model + tokenizer)
in a dedicated subdirectory, and pushes the final model to the Hugging Face Hub.
Args:
config (dict): Configuration dictionary for training parameters.
    Returns:
        dict: A dictionary with keys 'final_model_url' and 'final_model_dir' for the trained model.
"""
from datetime import datetime
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForCausalLM
from pathlib import Path
import os
import torch
from datasets import load_dataset, Dataset
from tfa import TfaCallback
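    # tfa is assumed to be a local module providing TfaCallback, which runs teacher-forced
    # accuracy evaluation over the raw (prompt, gold_response) pairs in ds_tf_eval (see below).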
# ------------------------------
# Set device and seed.
# ------------------------------
# export CUDA_VISIBLE_DEVICES=5; python ~/ZIP-FIT/zip_fit/train/train.py
# export CUDA_VISIBLE_DEVICES=7; python ~/ZIP-FIT/zip_fit/train/train.py
# os.environ['CUDA_VISIBLE_DEVICES'] = config.get('cuda_visible_devices', '1') # choose GPU
# os.environ['CUDA_VISIBLE_DEVICES'] = config.get('cuda_visible_devices', '3') # choose GPU
seed: int = config.get('seed', 42)
seed_everything(seed)
# ------------------------------
# Log in to Hugging Face Hub.
# ------------------------------
from huggingface_hub import login, whoami
key_file_path: str = os.path.abspath(os.path.expanduser(config.get('key_file_path', "~/keys/master_hf_token.txt")))
with open(key_file_path, "r", encoding="utf-8") as f:
token: str = f.read().strip()
login(token=token)
os.environ['HUGGINGFACE_TOKEN'] = token
user_info = whoami()
print(f"Currently logged in as: {user_info['name']}\n")
# ------------------------------
# Load model and tokenizer.
# ------------------------------
# model_name: str = config.get('model_name', 'gpt2')
# model_name: str = config.get('model_name', 'Qwen/Qwen2.5-0.5B')
# model_name: str = config.get('model_name', 'google/gemma-2-2b')
# model_name: str = config.get('model_name', 'google/internlm2-math-plus-1_8b')
model_name: str = config.get('model_name', 'meta-llama/Meta-Llama-3-8B')
# model_name: str = config.get('model_name', 'google/codegemma-2b')
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
    # Gemma-2 models are loaded with eager attention; all other models use the detected dtype.
    if 'gemma-2' in model_name:
        model: AutoModelForCausalLM = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation='eager').to(device)
    else:
        model: AutoModelForCausalLM = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch_dtype).to(device)
tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model_name)
    # Some tokenizers (e.g. Llama 3) do not define a pad token; fall back to EOS so padding works.
    tokenizer.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
today: str = config.get('today', datetime.now().strftime('%Y_m%m_d%d_t%Hh_%Mm_%Ss'))
# Use a custom final model name if provided; otherwise use a default.
    # final_model_name: str = config.get('final_model_name', f'UDACA/{model_name.replace("/", "-")}-pn-v3-lean4-train-on-validation-{today}')
    # final_model_name: str = config.get('final_model_name', f'UDACA/{model_name.replace("/", "-")}-pn-v3-lean4-train-on-test-{today}')
    final_model_name: str = config.get('final_model_name', f'AI4M/{model_name.replace("/", "-")}-pn-v3-lean4-train-on-validation-{today}')
block_size: int = config.get('block_size', 1024)
print(f'{block_size=}')
# ------------------------------
# Prepare datasets.
# Three dataset views:
# - ds_train and ds_eval: tokenized for CE loss.
# - ds_tf_eval: raw strings for teacher-forced evaluation.
# ------------------------------
    def my_prompt_format(nl_stmt: str) -> str:
        # Format used for less (see provided URL).
        # NOTE: this definition is shadowed by the GSM8K-style my_prompt_format defined further below.
        return f'informal statement {nl_stmt}'
# Load datasets with torch format.
# Sanity Checks
"""
export CUDA_VISIBLE_DEVICES=4; python ~/ZIP-FIT/zip_fit/train/train.py --mode dryrun --project zip-fit-pn-sanity --num_train_epochs 3 --model_name meta-llama/Meta-Llama-3-8B
"""
# ds_train: Dataset = load_dataset("UDACA/proofnet-v3-lean4", split="test").with_format('torch')
# ds_train: Dataset = load_dataset("UDACA/proofnet-v3-lean4", split="validation").with_format('torch')
# ds_eval: Dataset = load_dataset("UDACA/proofnet-v3-lean4", split="test").with_format('torch')
# ds_tf_eval: Dataset = load_dataset("UDACA/proofnet-v3-lean4", split="test").with_format('torch')
# Lean4AI
"""
export CUDA_VISIBLE_DEVICES=4; python ~/ZIP-FIT/zip_fit/train/train.py --mode dryrun --project lean4ai-llama3-8b-runs --num_train_epochs 3 --model_name meta-llama/Meta-Llama-3-8B
"""
# ds_train: Dataset = load_dataset("AI4M/mma-dataset", split="train").with_format('torch')
# ds_train: Dataset = load_dataset("AI4M/stateInfoInformalizationBig", split="train").with_format('torch')
# ds_train: Dataset = load_dataset("AI4M/regexInformalizationData", split="train").with_format('torch')
# ds_train: Dataset = load_dataset("AI4M/gpt4-more", split="train").with_format('torch')
# ds_train: Dataset = load_dataset("AI4M/gpt4-more", split="train").with_format('torch')
"""
    conda activate zip_fit
export CUDA_VISIBLE_DEVICES=0
# python ~/ZIP-FIT/zip_fit/train/train.py --mode online --project self-opt-train-uncompiled-py-2-gsm8k --num_train_epochs 1 --model_name gpt2
# python ~/ZIP-FIT/zip_fit/train/train.py --mode dryrun --project self-opt-train-uncompiled-py-2-gsm8k --num_train_epochs 1 --model_name google/gemma-2-2b
python ~/ZIP-FIT/zip_fit/train/train.py --mode dryrun --project self-opt-train-uncompiled-py-2-gsm8k --num_train_epochs 1 --model_name meta-llama/Meta-Llama-3-8B
"""
    def my_prompt_format(question: str, answer: str, final_answer: str) -> str:
return f'question: {question}\nanswer: {answer}\n### {final_answer}'
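    # Example (illustrative): my_prompt_format('What is 2+2?', '2+2=4', '4')
    #   -> 'question: What is 2+2?\nanswer: 2+2=4\n### 4'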
# Path to your saved JSON file
# ds_train = load_dataset("json", data_files=os.path.expanduser("~/data/synthetic_data/uncompiled_dspy/syndata_100_2025_m02_d05_t18h_29m_01s.json"))
ds_train = load_dataset("json", split='train', data_files=os.path.expanduser("~/data/synthetic_data/uncompiled_dspy/syndata_18612_2025_m02_d05_t18h_38m_11s.json"))
# Evals
ds_eval: Dataset = load_dataset("openai/gsm8k", 'main', split="test").with_format('torch')
ds_tf_eval: Dataset = load_dataset("openai/gsm8k", 'main', split="test").with_format('torch')
# Create a new "text" field for tokenized datasets by concatenating formatted prompt and formal statement.
ds_train = ds_train.map(
# lambda batch: {"text": [my_prompt_format(nl) + formal for nl, formal in zip(batch["y_grade_school_math_question"], batch["y_grade_school_math_solution"], batch["y_grade_school_math_final_answer"])]},
lambda batch: {"text": [my_prompt_format(q, a, fa) for q, a, fa in zip(batch["y_grade_school_math_question"], batch["y_grade_school_math_solution"], batch["y_grade_school_math_final_answer"])]},
batched=True,
remove_columns=ds_train.column_names, # Remove all original columns.
num_proc=48,
)
print(f'{len(ds_train)=}')
# Tokenize and group text for ds_train.
ds_train = ds_train.map(
lambda batch: tokenize_and_group_texts_via_blocks(batch, tokenizer=tokenizer, block_size=block_size),
batched=True,
remove_columns=ds_train.column_names,
# remove_columns=['text'],
num_proc=48,
)
print(f'{len(ds_train)=}')
# Do the same for ds_eval.
ds_eval = ds_eval.map(
# lambda batch: {"text": [my_prompt_format(nl) + formal for nl, formal in zip(batch["nl_statement"], batch["formal_statement"])]},
lambda batch: {"text": [f'question: {q}\nanswer: {a}' for q, a in zip(batch["question"], batch["answer"])]},
batched=True,
remove_columns=ds_eval.column_names, # Remove all original columns.
num_proc=48,
)
ds_eval = ds_eval.map(
lambda batch: tokenize_and_group_texts_via_blocks(batch, tokenizer=tokenizer, block_size=block_size),
batched=True,
remove_columns=ds_eval.column_names,
num_proc=48,
)
# For teacher-forced evaluation, retain raw strings by creating 'prompt' and 'gold_response' fields.
ds_tf_eval = ds_tf_eval.map(
lambda batch: {
# 'prompt': [my_prompt_format(nl) for nl in batch['nl_statement']],
# 'gold_response': [formal for formal in batch['formal_statement']]
'prompt': [f'question: {q}\nanswer: ' for q in batch['question']],
'gold_response': [answer for answer in batch['answer']]
},
batched=True,
num_proc=48
)
# ------------------------------
# Define training arguments.
# ------------------------------
# Create the main output directory.
output_dir: Path = Path(f'~/data/zipfit_less_runs/tfa_output_{today}').expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
training_args = TrainingArguments(
# max_steps=2, # for debugging
output_dir=str(output_dir), # Main output directory.
do_train=True,
# num_train_epochs=config.get('num_train_epochs', 3), # Lean4AI Total training epochs.
num_train_epochs=config.get('num_train_epochs', 1), # Total training epochs.
do_eval=True,
eval_on_start=config.get('eval_on_start', True), # Evaluate before training starts.
evaluation_strategy=config.get('evaluation_strategy', "steps"), # Evaluate every few steps.
        eval_steps=config.get('eval_steps', 1), # Evaluate every step by default (override via config).
        logging_steps=config.get('logging_steps', 1), # Log metrics every step by default.
per_device_train_batch_size=config.get('per_device_train_batch_size', 2),
gradient_accumulation_steps=config.get('gradient_accumulation_steps', 4),
        save_steps=config.get('save_steps', 25), # Save a checkpoint every 25 steps.
save_total_limit=config.get('save_total_limit', 1), # Keep only the most recent checkpoint.
save_strategy=config.get('save_strategy', 'steps'),
bf16=torch.cuda.is_bf16_supported(), # Use bf16 if supported.
fp16=not torch.cuda.is_bf16_supported(), # Otherwise, use fp16.
# optim=config.get('optim', 'adamw_torch'),
optim=config.get('optim', 'paged_adamw_32bit'),
learning_rate=config.get('learning_rate', 1e-6),
weight_decay=config.get('weight_decay', 1e-4),
        gradient_checkpointing=config.get('gradient_checkpointing', True), # Careful: this can cause issues depending on hardware.
lr_scheduler_type=config.get('lr_scheduler_type', 'constant_with_warmup'),
warmup_ratio=config.get('warmup_ratio', 0.05),
seed=seed,
data_seed=config.get('data_seed', seed),
# torch_compile=True,
# Hub push parameters.
# push_to_hub=True,
# hub_model_id=final_model_name,
)
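    # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps (per GPU);
    # with the defaults above, 2 * 4 = 8 blocks of block_size tokens per optimizer step on a single GPU.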
# ------------------------------
# Attach teacher-forced evaluation callback.
# ------------------------------
tfa_callback = TfaCallback(
tfa_dataset=ds_tf_eval,
repo=model_name,
        n_begin=config.get('n_begin', 184),  # Number of teacher-forced eval examples at the start of training.
        n_during=config.get('n_during', 4),  # Smaller partial eval during training to save time.
        n_end=config.get('n_end', 184)       # Number of teacher-forced eval examples at the end of training.
)
# ------------------------------
# Build the Trainer.
# ------------------------------
trainer = Trainer(
model=model,
args=training_args,
train_dataset=ds_train,
eval_dataset=ds_eval,
callbacks=[tfa_callback],
tokenizer=tokenizer, # Ensure tokenizer is saved and pushed.
)
# ------------------------------
# Run training.
# ------------------------------
trainer.train()
# ------------------------------
# Save final model and tokenizer in a dedicated subdirectory.
# ------------------------------
final_model_dir: Path = output_dir / final_model_name
final_model_dir.mkdir(parents=True, exist_ok=True)
trainer.save_model(str(final_model_dir)) # Saves both model and tokenizer.
tokenizer.save_pretrained(str(final_model_dir)) # Extra precaution.
# ------------------------------
# Push the final model (and tokenizer) to the Hub.
# The push will be blocking by default unless 'blocking' is overridden in config.
# ------------------------------
trainer.push_to_hub(commit_message="Final model checkpoint", blocking=config.get('blocking', True))
# Construct the final model URL and print it.
final_model_url: str = f"https://huggingface.co/{final_model_name}"
print(f"\nFinal model URL: {final_model_url}\n\n")
# Clean
del trainer
import gc
del model
gc.collect()
    # Return the final model URL and the final model directory path (as a string).
    results = {'final_model_url': final_model_url, 'final_model_dir': str(final_model_dir)}
    return results
def main_full_run(config: dict = {}):
    from lean_pass_k_unbiased import run_pass_k_eval
    import os
    import time  # used to time the pass@k evaluations below
    import wandb
seed: int = config.get('seed', 42)
seed_everything(seed)
os.environ['CUDA_VISIBLE_DEVICES'] = config.get('cuda_visible_devices', '1') # choose GPU
# Log In
from huggingface_hub import login, whoami
key_file_path = "~/keys/master_hf_token.txt"
key_file_path = os.path.abspath(os.path.expanduser(key_file_path))
with open(key_file_path, "r", encoding="utf-8") as f:
token = f.read().strip()
login(token=token)
os.environ['HUGGINGFACE_TOKEN'] = token
user_info = whoami()
print(f"Currently logged in as: {user_info['name']}\n")
# Run params
new_seed = config.get('seed', 42)
k, N = 5, 6
model_name: str = config.get('model_name', 'gpt2')
# model_name: str = config.get('model_name', 'Qwen/Qwen2.5-0.5B')
# model_name: str = config.get('model_name', 'google/gemma-2-2b')
# model_name: str = config.get('model_name', 'google/internlm2-math-plus-1_8b')
# model_name: str = config.get('model_name', 'Meta-Llama-3-8B')
# model_name: str = config.get('model_name', 'google/codegemma-2b')
# ---------------------------------------------------------------------
# Initialize the Lean 4 server via PyPantograph.
from pantograph import Server
server = Server(imports=["Mathlib", "Init"], project_path=os.path.expanduser("~/mathlib4"))
# ---------------------------------------------------------------------
# Load prompts and gold headers from a dataset.
# We assume that our prompt formatting function is defined below.
def my_prompt_format(nl_stmt: str) -> str:
return (
"Your task is translate the natural language version of the mathematical statement "
"to a formal Lean statement version, using the following format:\n"
"natural language statement:\nLet $z=\\frac{1+i}{\\sqrt{2}}.$What is $\\left(z^{1^2}+z^{2^2}+z^{3^2}+\\dots+z^{{12}^2}\\right) \\cdot "
"\\left(\\frac{1}{z^{1^2}}+\\frac{1}{z^{2^2}}+\\frac{1}{z^{3^2}}+\\dots+\\frac{1}{z^{{12}^2}}\\right)?$ "
"$\\textbf{(A)}\\ 18 \\qquad \\textbf{(B)}\\ 72-36\\sqrt2 \\qquad \\textbf{(C)}\\ 36 \\qquad \\textbf{(D)}\\ 72 \\qquad \\textbf{(E)}\\ 72+36\\sqrt2$ "
"Show that it is \\textbf{(C)}\\ 36.\n"
"formal Lean language statement:##\ntheorem amc12a_2019_p21 (z : ℂ) (h₀ : z = (1 + Complex.I) / Real.sqrt 2) : "
"(∑ k in Finset.Icc 1 12, (z^(k^2))) * (∑ k in Finset.Icc 1 12, (1 / z^(k^2))) = 36 := sorry\n##"
"natural language statement:\nIntegers $x$ and $y$ with $x>y>0$ satisfy $x+y+xy=80$. What is $x$? "
"$\\textbf{(A)}\\ 8 \\qquad\\textbf{(B)}\\ 10 \\qquad\\textbf{(C)}\\ 15 \\qquad\\textbf{(D)}\\ 18 \\qquad\\textbf{(E)}\\ 26$ Show that it is \\textbf{(E)}\\ 26.\n"
"formal Lean language statement:##\ntheorem amc12a_2015_p10 (x y : ℤ) (h₀ : 0 < y) (h₁ : y < x) "
"(h₂ : x + y + (x * y) = 80) : x = 26 := sorry\n##"
"natural language statement:\nWhat is the [[volume]] of a [[cube]] whose [[surface area]] is twice that of a cube with volume 1? "
"$\\mathrm{(A)}\\ \\sqrt{2}\\qquad\\mathrm{(B)}\\ 2\\qquad\\mathrm{(C)}\\ 2\\sqrt{2}\\qquad\\mathrm{(D)}\\ 4\\qquad\\mathrm{(E)}\\ 8$ Show that it is \\mathrm{(C)}.\n"
"formal Lean language statement:##\ntheorem amc12a_2008_p8 (x y : ℝ) (h₀ : 0 < x ∧ 0 < y) "
"(h₁ : y^3 = 1) (h₂ : 6 * x^2 = 2 * (6 * y^2)) : x^3 = 2 * Real.sqrt 2 := sorry\n##"
f"natural language statement:\n{nl_stmt}\n"
"formal Lean language statement:"
)
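    # The prompt above is a 3-shot prompt: three (natural language, formal Lean) pairs delimited by '##',
    # followed by the target natural-language statement and the cue "formal Lean language statement:".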
from datasets import load_dataset
ds_test = load_dataset('UDACA/proofnet-v3-lean4', split='test')
ds_test = ds_test.select(list(range(2))) # optionally select a subset
prompts = [my_prompt_format(row['nl_statement']) for row in ds_test]
gold_headers = [row['header_no_import'] for row in ds_test]
# Record the start time for this repetition.
start_time = time.time()
# Call run_pass_k_eval which:
# - Generates 'N' completions per prompt,
# - Checks each for correctness (e.g., compilation),
# - Computes and returns the overall pass@k score.
pass_k_before_train = run_pass_k_eval(
prompts=prompts,
model_name=model_name,
server=server,
headers=gold_headers,
k=k,
num_samples=N,
eval_batch_size=32,
seed=new_seed,
debug=False
)
# Record the end time and compute the elapsed time for this repetition.
end_time = time.time()
elapsed_time = end_time - start_time
print(f'{pass_k_before_train=}')
print(f'{elapsed_time=}')
    wandb.log({'pass_k_before_train': pass_k_before_train})
# Train
config = config | {'model_name': model_name}
results = main_train(config)
model_name = results['final_model_dir']
# Record the start time for this repetition.
start_time = time.time()
# Call run_pass_k_eval which:
# - Generates 'N' completions per prompt,
# - Checks each for correctness (e.g., compilation),
# - Computes and returns the overall pass@k score.
    pass_k_after_train = run_pass_k_eval(
prompts=prompts,
model_name=model_name,
server=server,
headers=gold_headers,
k=k,
num_samples=N,
eval_batch_size=32,
seed=new_seed,
debug=False
)
# Record the end time and compute the elapsed time for this repetition.
end_time = time.time()
elapsed_time = end_time - start_time
    print(f'{pass_k_after_train=}')
    print(f'{elapsed_time=}')
    wandb.log({'pass_k_after_train': pass_k_after_train})
def get_current_tmux_session_number() -> str:
import os
# Executes "tmux display-message -p '#S'" to retrieve the current tmux session name/number,
# reads the output from the command, strips any surrounding whitespace, and returns it.
return os.popen("tmux display-message -p '#S'").read().strip()
def _main(**kwargs):
from datetime import datetime
from socket import gethostname
import wandb
today = datetime.now().strftime('%Y_m%m_d%d_t%Hh_%Mm_%Ss') # eg '2024_m01_d22_t13h_00m_30s'
# os.environ['CUDA_VISIBLE_DEVICES'] = str(kwargs.get('CUDA_VISIBLE_DEVICES', '7'))
tmux_sess_num = None
kwargs = kwargs | {'today': today, 'tmux_sess_num': tmux_sess_num, 'hostname': gethostname()}
run_name = f'{kwargs}'
project: str = kwargs.get('project', 'zip-fit-train')
# run = wandb.init(mode=kwargs.get('mode', 'online'), project="zip-fit-train", name=run_name, save_code=True, config=kwargs)
run = wandb.init(mode=kwargs.get('mode', 'dryrun'), project=project, name=run_name, save_code=True, config=kwargs)
wandb.save(__file__) # save current code now, don't wait to wandb.finish, also useful: wandb.save("*.py") # upload all .py files in current directory
print(f'Kwargs to run:\n{kwargs}')
# main(kwargs)
main_train(kwargs)
# main_full_run(kwargs)
run.alert(title="Run Completed", text=f"Run finished, run url: {run.get_url()}")
print(f'{run.get_url()=}')
wandb.finish()
if __name__ == "__main__":
import fire
import time
start_time = time.time()
fire.Fire(_main)
print(f"\aTime taken: {time.time() - start_time:.2f} seconds, or {(time.time() - start_time) / 60:.2f} minutes, or {(time.time() - start_time) / 3600:.2f} hours.\a")