Kasper Junge kasperjunge

@kasperjunge
kasperjunge / download_n_eb_articles.py
Created July 26, 2022 07:19
Download n Ekstra Bladet news articles from the Danish mC4 dataset.
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
def download_n_eb_articles(n: int) -> pd.DataFrame:
    """Extract n Ekstra Bladet articles from the Danish subset
    of the mC4 dataset.
    Args:
        n (int): Number of articles to extract.
    Returns:
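
The listing cuts the snippet off inside the docstring. A complete version might look like the sketch below; streaming the "mc4" dataset with the "da" config and filtering on ekstrabladet.dk URLs is an assumption, since the preview only shows the signature.

import pandas as pd
from tqdm import tqdm
from datasets import load_dataset

def download_n_eb_articles(n: int) -> pd.DataFrame:
    """Extract n Ekstra Bladet articles from the Danish subset of the mC4 dataset."""
    # Stream the Danish split so the full corpus never has to be downloaded (assumed approach).
    ds = load_dataset("mc4", "da", split="train", streaming=True)
    articles = []
    with tqdm(total=n, desc="Collecting Ekstra Bladet articles") as pbar:
        for example in ds:
            # Keep only documents scraped from ekstrabladet.dk.
            if "ekstrabladet.dk" in example["url"]:
                articles.append(example)
                pbar.update(1)
                if len(articles) == n:
                    break
    return pd.DataFrame(articles)
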
@kasperjunge
kasperjunge / flatten_list.py
Last active July 29, 2022 11:04
Flatten list
from typing import List, Any
def flatten_list(list_of_lists: List[List[Any]]) -> List[Any]:
    """Merge/flatten a list of lists into one single list.
    Example: [[1, 2, 3], [4, 5, 6]] --> [1, 2, 3, 4, 5, 6]
    Args:
        list_of_lists (List[List[Any]]): List of lists to be merged/flattened.
    Returns:
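
The preview stops at the Returns: line; a complete version of the helper could look like this (the body is an assumption, though any order-preserving flatten will do).

from typing import List, Any

def flatten_list(list_of_lists: List[List[Any]]) -> List[Any]:
    """Merge/flatten a list of lists into one single list."""
    # A nested comprehension keeps the original element order.
    return [item for sublist in list_of_lists for item in sublist]

flatten_list([[1, 2, 3], [4, 5, 6]])  # [1, 2, 3, 4, 5, 6]
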
@kasperjunge
kasperjunge / danish_sota_ner.py
Last active August 16, 2022 06:45
Danish SOTA NER
import pandas as pd
from transformers import pipeline
text = """
Dan Saattrup Nielsen arbejder som AI Specialist hos Alexandra Instituttet
og han er næstformand i Dansk Data Science Community.
"""
ner = pipeline(
    task="ner",
@kasperjunge
kasperjunge / dataframe_to_huggingface_dataset.py
Created July 29, 2022 08:33
Convert a dataset stored as a pandas dataframe to a Hugging Face DatasetDict.
import pandas as pd
from datasets import Dataset, DatasetDict
def dataframe_to_huggingface_dataset(df: pd.DataFrame) -> DatasetDict:
    """Convert a dataframe with a split column into a Hugging Face DatasetDict.
    Args:
        df (pd.DataFrame): Dataset stored in a pandas dataframe with a
            column named "split" that tells which split a datapoint belongs to.
@kasperjunge
kasperjunge / tokenizer_huggingface_dataset.py
Last active July 29, 2022 09:32
Tokenize Hugging Face Dataset or DatasetDict.
from typing import Union
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
def tokenize_huggingface_dataset(
    ds: Union[Dataset, DatasetDict],
    tokenizer: AutoTokenizer,
    max_length: int = 512,
    truncation: bool = True,
) -> Union[Dataset, DatasetDict]:
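
The preview ends at the signature. A plausible body, assuming the examples store their raw text in a "text" column, is:

from typing import Union
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict

def tokenize_huggingface_dataset(
    ds: Union[Dataset, DatasetDict],
    tokenizer: AutoTokenizer,
    max_length: int = 512,
    truncation: bool = True,
) -> Union[Dataset, DatasetDict]:
    """Tokenize the "text" column of a Dataset or of every split in a DatasetDict."""
    def tokenize(batch):
        # Assumes the raw text lives in a "text" column.
        return tokenizer(batch["text"], max_length=max_length, truncation=truncation)
    return ds.map(tokenize, batched=True)
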
@kasperjunge
kasperjunge / print_wordpiece_tokens.py
Last active August 21, 2022 14:08
Tokenize text and print encoded + decoded wordpiece tokens.
from transformers import AutoTokenizer
# define sample text
text = "Rødgrød med fløde."
# init tokenizer
model_id = "Maltehb/danish-bert-botxo"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# encode text
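
The snippet is cut off right after the "# encode text" comment, so the encode/decode loop below is an assumption that matches the gist description (print encoded + decoded wordpiece tokens).

from transformers import AutoTokenizer

# define sample text
text = "Rødgrød med fløde."

# init tokenizer
model_id = "Maltehb/danish-bert-botxo"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# encode text
input_ids = tokenizer(text)["input_ids"]

# print each id next to the wordpiece it decodes back to
for input_id in input_ids:
    print(input_id, tokenizer.decode([input_id]))
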
@kasperjunge
kasperjunge / measure_function_speed.py
Created July 29, 2022 14:35
Measure function speed 🏃‍♂️
import time
import statistics
from typing import Callable, Any, Tuple
def time_function(func: Callable, func_input: Any, n_runs: int) -> Tuple[float]:
    times = []
    for _ in range(n_runs):
        start = time.perf_counter()
        func(func_input)
        times.append(time.perf_counter() - start)
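
The function is cut off after the timing loop; given the statistics import and the Tuple[float] return annotation, returning the mean and standard deviation is a reasonable guess, not the gist's confirmed behaviour.

import time
import statistics
from typing import Callable, Any, Tuple

def time_function(func: Callable, func_input: Any, n_runs: int) -> Tuple[float, float]:
    """Call func(func_input) n_runs times and summarise the runtimes."""
    times = []
    for _ in range(n_runs):
        start = time.perf_counter()
        func(func_input)
        times.append(time.perf_counter() - start)
    # Returning (mean, stdev) is an assumption; stdev needs n_runs >= 2.
    return statistics.mean(times), statistics.stdev(times)
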
@kasperjunge
kasperjunge / gist:4ec27724272109b9a413e4d1477ef7cd
Created December 8, 2022 07:36
ChatGPT answer to: Implement me a hyper-optimized service for serving predictions by transformer models
from fastapi import FastAPI
import transformers
import torch
app = FastAPI()
# Load the tokenizer and model
tokenizer = transformers.AutoTokenizer.from_pretrained("model_name")
model = transformers.AutoModelForTokenClassification.from_pretrained("model_name")
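
The gist preview stops after the model is loaded. A minimal serving endpoint in the same style might look like this; the /predict route, its request shape, and "model_name" are placeholders and assumptions rather than what the ChatGPT answer actually contained.

from fastapi import FastAPI
import transformers
import torch

app = FastAPI()

# Load the tokenizer and model once at startup; "model_name" is a placeholder.
tokenizer = transformers.AutoTokenizer.from_pretrained("model_name")
model = transformers.AutoModelForTokenClassification.from_pretrained("model_name")
model.eval()

@app.post("/predict")
def predict(text: str):
    # Tokenize the request text and run the model without tracking gradients.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Return the predicted label id for every token.
    return {"predictions": logits.argmax(dim=-1).squeeze(0).tolist()}
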
@kasperjunge
kasperjunge / chunk_df.py
Last active April 18, 2023 15:32
Chunk dataframe generator
import pandas as pd
from typing import Iterator
def chunk_dataframe(df: pd.DataFrame, chunk_size: int) -> Iterator[pd.DataFrame]:
    """Yield successive chunk_size-row chunks of df."""
    for i in range(0, len(df), chunk_size):
        yield df[i:i + chunk_size]
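
A small usage example (the dataframe and chunk size are made up for illustration):

df = pd.DataFrame({"x": range(10)})
for chunk in chunk_dataframe(df, chunk_size=4):
    print(len(chunk))  # 4, 4, 2
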
@kasperjunge
kasperjunge / pydantic_to_openai_function_calling.py
Created January 25, 2024 20:57
Pydantic to OpenAI Function Calling
from pydantic import BaseModel, Field
import json
class ReceiptDataExtractor(BaseModel):
    date: str = Field(description="The date.")
    amount: float = Field(description="The total amount.")
    supplier: str = Field(description="The supplier.")

    class Config:
        function_meta = {
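
The preview ends inside the Config class, so the contents of function_meta and the conversion helper below are assumptions; the sketch assumes a pydantic v1-style .schema() call and shows one way to turn the model into an OpenAI function-calling definition.

from pydantic import BaseModel, Field
import json

class ReceiptDataExtractor(BaseModel):
    date: str = Field(description="The date.")
    amount: float = Field(description="The total amount.")
    supplier: str = Field(description="The supplier.")

    class Config:
        # The keys below are assumptions; the preview cuts off before they are shown.
        function_meta = {
            "name": "extract_receipt_data",
            "description": "Extract structured data from a receipt.",
        }

def pydantic_to_openai_function(model: type) -> dict:
    """Turn a pydantic model into an OpenAI function-calling definition."""
    schema = model.schema()  # pydantic v1-style JSON schema
    return {
        "name": model.Config.function_meta["name"],
        "description": model.Config.function_meta["description"],
        "parameters": {
            "type": "object",
            "properties": schema["properties"],
            "required": schema.get("required", []),
        },
    }

print(json.dumps(pydantic_to_openai_function(ReceiptDataExtractor), indent=2))
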