Skip to content

Instantly share code, notes, and snippets.

View ParagEkbote's full-sized avatar
🪧
https://paragekbote.github.io/

Parag Ekbote ParagEkbote

🪧
https://paragekbote.github.io/
View GitHub Profile
@ParagEkbote
ParagEkbote / pr_insights.py
Created March 9, 2026 09:57
Open-Source Contributions Insight
import os
import json
import subprocess
import requests
from datetime import datetime
from statistics import mean, median, stdev
from collections import defaultdict
from pathlib import Path
import os
import time
import numpy as np
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
@ParagEkbote
ParagEkbote / compare_tokenizer_perf.py
Last active November 17, 2025 15:25
FineWiki Multilingual Tokenizer Analysis based on the Smol Training Playbook: https://huggingface.co/spaces/HuggingFaceTB/smol-training-playbook#the-tokenizer
#!/usr/bin/env python3
"""
Tokenizer Performance Comparison Script
A comprehensive tool for evaluating tokenizer efficiency across multiple languages.
Supports HuggingFace datasets, local files, and built-in samples.
Author: https://github.com/ParagEkbote
"""
from typing import Tuple, List, Dict, Optional, Callable, Literal
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from pruna import smash, SmashConfig
# Define per-model Smash + torch.compile configs
models = {
"HuggingFaceTB/SmolLM2-360M": {
"bits": 4, "group_size": 64,
"compiler": "torch_compile",
"torch_compile_mode": "max-autotune",
import os
import torch
import argparse
from datetime import datetime
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from pruna import smash, SmashConfig
from pruna.data.pruna_datamodule import PrunaDataModule
from pruna.evaluation.evaluation_agent import EvaluationAgent
from pruna.evaluation.task import Task