Skip to content

Instantly share code, notes, and snippets.

View pszemraj's full-sized avatar

Peter pszemraj

View GitHub Profile
@pszemraj
pszemraj / datasets_split.py
Created March 12, 2024 07:03
Hugging Face datasets: train_test_split with stratify_by_column for a column of any type (by tricking it)
import os
import numpy as np
from datasets import ClassLabel, Dataset, DatasetDict
def split_dataset(
dataset: Dataset,
test_size=0.025,
@pszemraj
pszemraj / enable_tf32.py
Last active September 5, 2025 04:55
A modern way to automatically enable TF32
import logging
import torch
def configure_tf32():
"""
Enable TF32 precision for GPUs with compute capability >= 8.0 (Ampere+).
"""
if not torch.cuda.is_available():
@pszemraj
pszemraj / hf_repofolder_watchdog.py
Created January 16, 2024 02:53
upload a folder to Hugging Face Hub and other utils
import argparse
import logging
import time
from datetime import datetime
from pathlib import Path
from typing import Optional
from huggingface_hub import upload_folder
from watchdog.events import PatternMatchingEventHandler
from watchdog.observers import Observer
@pszemraj
pszemraj / calculate_code_readability.py
Created November 6, 2023 17:01
Heuristics for a language-agnostic code readability index
import re
from itertools import chain
def calculate_readability(code_string:str) -> float:
code = code_string.splitlines()
# Heuristic 1: Line length
max_line_length = 80
long_lines = sum(1 for line in code if len(line) > max_line_length)
@pszemraj
pszemraj / generic_embedder.py
Last active December 12, 2023 05:18
generic & basic sbert-like embedder class for the jina-bert model
"""
generic & basic sbert-like embedder class for the jina-bert model
Usage:
model = EmbeddingModel("jinaai/jina-embeddings-v2-base-en")
embeddings = model.encode(
["How is the weather today?", "What is the current weather like today?"]
)
print(model.cos_sim(embeddings[0], embeddings[1]))
@pszemraj
pszemraj / download_URLs_in_file.py
Last active January 23, 2024 07:10
pdf downloading utils
import os
import argparse
import requests
from urllib.parse import urlparse
from tqdm import tqdm
from joblib import Parallel, delayed
from tenacity import retry, stop_after_attempt, wait_fixed
@retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
@pszemraj
pszemraj / train_tokenizer.py
Created September 6, 2023 02:31
Train a tokenizer with Hugging Face tokenizers (WIP script)
import logging
import gzip
from pathlib import Path
import fire
from tqdm import tqdm
from tokenizers import (
Tokenizer,
decoders,
models,
normalizers,
@pszemraj
pszemraj / nougat_em.sh
Last active January 23, 2024 07:12
bash script to apply facebookresearch/nougat on a directory of PDFs
#!/bin/bash
# pip install nougat-ocr
# see https://github.com/facebookresearch/nougat for details and license
DEFAULT_BATCHSIZE=4
usage() {
echo "Usage: $0 <path_to_directory> [--batchsize BATCHSIZE]"
exit 1
@mlabonne
mlabonne / merge_peft.py
Last active May 29, 2025 13:58
Merge base model and peft adapter and push it to HF hub
# Example usage:
# python merge_peft.py --base_model=meta-llama/Llama-2-7b-hf --peft_model=./qlora-out --hub_id=alpaca-qlora
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
import argparse
def get_args():
@xenova
xenova / tiktoken-to-hf.ipynb
Last active October 13, 2025 17:58
Convert tiktoken tokenizers to the Hugging Face tokenizers format
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.