Peter pszemraj

@pszemraj
pszemraj / calculate_code_readability.py
Created November 6, 2023 17:01
heuristics for a language-agnostic code readability index
import re
from itertools import chain


def calculate_readability(code_string: str) -> float:
    code = code_string.splitlines()
    # Heuristic 1: Line length
    max_line_length = 80
    long_lines = sum(1 for line in code if len(line) > max_line_length)
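    # (preview truncated above; what follows is a hedged sketch of how such
    #  heuristics might be combined; the second heuristic and the weighting
    #  are illustrative, not the gist's actual logic)
    # Heuristic 2: comment density (rough and language-agnostic)
    comment_lines = sum(
        1 for line in code if line.lstrip().startswith(("#", "//", "--"))
    )
    n_lines = max(len(code), 1)
    score = 1.0 - (long_lines / n_lines) + 0.5 * (comment_lines / n_lines)
    return max(0.0, min(1.0, score))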
@pszemraj
pszemraj / generic_embedder.py
Last active December 12, 2023 05:18
generic & basic sbert-like embedder class for the jina-bert model
"""
generic & basic sbert-like embedder class for the jina-bert model
Usage:
model = EmbeddingModel("jinaai/jina-embeddings-v2-base-en")
embeddings = model.encode(
["How is the weather today?", "What is the current weather like today?"]
)
print(model.cos_sim(embeddings[0], embeddings[1]))
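The preview stops at the docstring. A minimal sketch of what such a class could look like, assuming mean pooling over the last hidden state and a cosine-similarity helper (the implementation below is a guess, not the gist's code; jina-bert needs trust_remote_code=True for its custom modeling code):

import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer


class EmbeddingModel:
    def __init__(self, model_name: str):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        self.model.eval()

    @torch.no_grad()
    def encode(self, sentences: list) -> torch.Tensor:
        batch = self.tokenizer(
            sentences, padding=True, truncation=True, return_tensors="pt"
        )
        hidden = self.model(**batch).last_hidden_state
        mask = batch["attention_mask"].unsqueeze(-1).float()
        return (hidden * mask).sum(dim=1) / mask.sum(dim=1)  # mean pooling

    @staticmethod
    def cos_sim(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        return F.cosine_similarity(a.unsqueeze(0), b.unsqueeze(0))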
@pszemraj
pszemraj / download_URLs_in_file.py
Last active January 23, 2024 07:10
pdf downloading utils
import os
import argparse
import requests
from urllib.parse import urlparse
from tqdm import tqdm
from joblib import Parallel, delayed
from tenacity import retry, stop_after_attempt, wait_fixed
@retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
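# (preview truncated above; a hedged guess at the decorated helper follows;
#  the function name and signature are assumptions, not the gist's code)
def download_pdf(url: str, out_dir: str = "pdfs") -> str:
    """Download one URL into out_dir, retrying via tenacity on failure."""
    os.makedirs(out_dir, exist_ok=True)
    fname = os.path.basename(urlparse(url).path) or "download.pdf"
    path = os.path.join(out_dir, fname)
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    with open(path, "wb") as f:
        f.write(resp.content)
    return path


# fan out over a URL list with joblib, one job per URL
# urls = [...]  # read from the input file
# results = Parallel(n_jobs=8)(delayed(download_pdf)(u) for u in tqdm(urls))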
@pszemraj
pszemraj / train_tokenizer.py
Created September 6, 2023 02:31
train a tokenizer with hf tokenizers - WIP script
import logging
import gzip
from pathlib import Path

import fire
from tqdm import tqdm
from tokenizers import (
    Tokenizer,
    decoders,
    models,
    normalizers,
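    pre_tokenizers,
    trainers,
)


# (preview truncated above; the two extra imports and the function below are
#  a hedged sketch of the core training call; the model choice, vocab size,
#  and special tokens are illustrative, not the script's defaults)
def train_tokenizer(files: list, vocab_size: int = 32_000) -> Tokenizer:
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    tokenizer.normalizer = normalizers.NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size, special_tokens=["[UNK]", "[PAD]"]
    )
    tokenizer.train(files, trainer)
    return tokenizer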
@pszemraj
pszemraj / nougat_em.sh
Last active January 23, 2024 07:12
bash script to apply facebookresearch/nougat to a directory of PDFs
#!/bin/bash
# pip install nougat-ocr
# see https://github.com/facebookresearch/nougat for details and license

DEFAULT_BATCHSIZE=4

usage() {
    echo "Usage: $0 <path_to_directory> [--batchsize BATCHSIZE]"
    exit 1
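}

# (preview truncated above; a hedged sketch of the invocation the script
#  builds up to; the real gist's flag parsing is likely more thorough)
PDF_DIR="$1"
BATCHSIZE="${3:-$DEFAULT_BATCHSIZE}"  # value after an optional --batchsize flag
[ -d "$PDF_DIR" ] || usage

# nougat accepts a directory of PDFs directly
nougat "$PDF_DIR" --out "$PDF_DIR" --batchsize "$BATCHSIZE"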
@mlabonne
mlabonne / merge_peft.py
Last active May 29, 2025 13:58
Merge a base model with a PEFT adapter and push the result to the HF Hub
# Example usage:
# python merge_peft.py --base_model=meta-llama/Llama-2-7b-hf --peft_model=./qlora-out --hub_id=alpaca-qlora
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
import argparse
def get_args():
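    # (preview truncated above; a plausible completion based on the usage
    #  line, followed by a hedged sketch of the merge itself)
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model", type=str, required=True)
    parser.add_argument("--peft_model", type=str, required=True)
    parser.add_argument("--hub_id", type=str, required=True)
    return parser.parse_args()


if __name__ == "__main__":
    args = get_args()
    base = AutoModelForCausalLM.from_pretrained(
        args.base_model, torch_dtype=torch.float16, device_map="auto"
    )
    model = PeftModel.from_pretrained(base, args.peft_model)
    model = model.merge_and_unload()  # fold LoRA weights into the base model
    tokenizer = AutoTokenizer.from_pretrained(args.base_model)
    model.push_to_hub(args.hub_id)
    tokenizer.push_to_hub(args.hub_id)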
@xenova
xenova / tiktoken-to-hf.ipynb
Last active April 7, 2025 20:13
Convert tiktoken tokenizers to the Hugging Face tokenizers format
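A quick way to sanity-check such a conversion is a round-trip encode; a minimal sketch, assuming the converted tokenizer was saved as tokenizer.json (the filename is an assumption) and using cl100k_base as the example encoding:

import tiktoken
from tokenizers import Tokenizer

enc = tiktoken.get_encoding("cl100k_base")
hf_tok = Tokenizer.from_file("tokenizer.json")  # output of the conversion

text = "Hello, world!"
assert enc.encode(text) == hf_tok.encode(text).ids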
@pszemraj
pszemraj / lfs_checkpoint_uploader.sh
Last active July 22, 2023 05:43
for when the huggingface trainer decides your files are too big to track intermediate checkpoints during training
#!/bin/bash
# install: sudo apt-get install inotify-tools
# Usage: ./scriptname.sh /path/to/monitor/directory /path/to/repo/directory
# If no monitor directory is passed, monitor directory = repo directory
# put & at the end of the command to run in background
# Define your monitor directory
MONITOR_DIR="${1:-$2}"
if [ -z "$MONITOR_DIR" ]; then
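    # (preview truncated above; a plausible continuation inferred from the
    #  usage comment)
    echo "Usage: $0 /path/to/monitor/directory /path/to/repo/directory"
    exit 1
fi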
@younesbelkada
younesbelkada / finetune_llama_v2.py
Last active July 1, 2025 23:14
Fine-tune Llama v2 models on the Guanaco dataset
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
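# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# (license header and script truncated in the preview; below is a hedged
#  sketch of the recipe the script implements, QLoRA fine-tuning with trl's
#  SFTTrainer using its 2023-era API; hyperparameters are illustrative, not
#  the script's exact defaults)
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

model_name = "meta-llama/Llama-2-7b-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset("timdettmers/openassistant-guanaco", split="train")
peft_config = LoraConfig(r=64, lora_alpha=16, lora_dropout=0.1, task_type="CAUSAL_LM")

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=4,
        max_steps=500,
    ),
)
trainer.train()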
import logging
import warnings
from typing import List, Optional, Union
import numpy as np
import torch
from torch.nn import functional as F
from tqdm.auto import trange
from transformers import AutoTokenizer, PreTrainedModel, PreTrainedTokenizer, RwkvModel