Skip to content

Instantly share code, notes, and snippets.

View BramVanroy's full-sized avatar

Bram Vanroy BramVanroy

View GitHub Profile
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
@BramVanroy
BramVanroy / benchmark.py
Last active May 29, 2024 11:33
Fast method of "first-fit-decreasing" packing benchmark. Around 5x faster than baseline. Baseline taken from https://huggingface.co/DiscoResearch/Llama3-German-8B#document-packing. Note that memory usage will be higher in the optimized version.
import gc
import numpy as np
import time
import pandas as pd
from tqdm import tqdm
def pack_documents_original(tokenized_documents, block_size: int = 8192, use_tqdm=True):
@BramVanroy
BramVanroy / embed.py
Last active October 9, 2024 13:54
Getting word embeddings
from dataclasses import dataclass, field
import torch
from torch import LongTensor, Tensor
from transformers import (
AutoTokenizer,
AutoModel,
PreTrainedModel,
PreTrainedTokenizer,
BatchEncoding,
@BramVanroy
BramVanroy / fw2-not-unique-ids.py
Created January 18, 2025 20:16
FineWeb-2 IDs are not unique
# Check whether document IDs in FineWeb-2 (Dutch split) are unique by
# counting occurrences of every "id" value in the train split.
#
# Fix: import Counter from `collections`, not `typing` — `typing.Counter`
# is a deprecated generic alias (deprecated since Python 3.9) and should
# not be used as a runtime constructor.
from collections import Counter

from datasets import load_dataset
from tqdm import tqdm

# NOTE(review): this streams/downloads the full split — expect significant
# time and disk usage on first run.
ds = load_dataset("HuggingFaceFW/fineweb-2", "nld_Latn", split="train")
ds_size = len(ds)
print(f"Dataset size: {ds_size:,}")
# Frequency of each document ID; any count > 1 indicates a duplicate.
counts = Counter(ds["id"])