import pandas as pd
from tqdm import tqdm
from datasets import load_dataset

def download_n_eb_articles(n: int) -> pd.DataFrame:
    """Extract n Ekstra Bladet articles from the Danish subset of the mC4 dataset.

    Args:
        n (int): Number of articles to extract.

    Returns:
        pd.DataFrame: Dataframe containing the n extracted articles.
    """
    # NOTE: the body below is an assumed completion, not from the original snippet:
    # stream the Danish mC4 split and keep samples from ekstrabladet.dk until n are found.
    ds = load_dataset("mc4", "da", split="train", streaming=True)
    eb_samples = (s for s in ds if "ekstrabladet.dk" in s["url"])
    return pd.DataFrame([next(eb_samples) for _ in tqdm(range(n))])
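A small usage sketch; the article count of 1,000 is arbitrary and not from the original.

eb_df = download_n_eb_articles(1_000)
print(eb_df.head())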
from typing import List, Any

def flatten_list(list_of_lists: List[List[Any]]) -> List[Any]:
    """Merge/flatten a list of lists into one single list.

    Example: [[1, 2, 3], [4, 5, 6]] --> [1, 2, 3, 4, 5, 6]

    Args:
        list_of_lists (List[list]): List of lists to be merged/flattened.

    Returns:
        List[Any]: The merged/flattened list.
    """
    return [element for sublist in list_of_lists for element in sublist]
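A tiny usage sketch with made-up data, for instance flattening per-sentence NER labels into one list:

nested_labels = [["B-PER", "I-PER"], ["O", "B-ORG"]]
print(flatten_list(nested_labels))  # ['B-PER', 'I-PER', 'O', 'B-ORG']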
import pandas as pd
from transformers import pipeline

text = """
Dan Saattrup Nielsen arbejder som AI Specialist hos Alexandra Instituttet
og han er næstformand i Dansk Data Science Community.
"""

# Assumed completion below: the model id and aggregation strategy are
# illustrative choices, since the original call is truncated here.
ner = pipeline(
    task="ner",
    model="saattrupdan/nbailab-base-ner-scandi",
    aggregation_strategy="first",
)
import pandas as pd
from datasets import Dataset, DatasetDict

def dataframe_to_huggingface_dataset(df: pd.DataFrame) -> DatasetDict:
    """Convert a dataframe with a split column into a Hugging Face DatasetDict.

    Args:
        df (pd.DataFrame): Dataset stored in a pandas dataframe with a
            column named "split" that tells what split a datapoint belongs to.

    Returns:
        DatasetDict: The dataset, with one Dataset per split.
    """
    # Assumed completion: build one Dataset per unique value in the "split" column.
    splits = {s: Dataset.from_pandas(df[df.split == s], preserve_index=False)
              for s in df.split.unique()}
    return DatasetDict(splits)
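A hypothetical usage sketch with a toy dataframe; the column values are made up.

toy_df = pd.DataFrame({
    "text": ["en artikel", "endnu en artikel", "en testartikel"],
    "split": ["train", "train", "test"],
})
dataset_dict = dataframe_to_huggingface_dataset(toy_df)
print(dataset_dict)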
from typing import Union
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict

def tokenize_huggingface_dataset(
    ds: Union[Dataset, DatasetDict],
    tokenizer: AutoTokenizer,
    max_length: int = 512,
    truncation: bool = True,
) -> Union[Dataset, DatasetDict]:
    """Tokenize a Hugging Face Dataset or DatasetDict."""
    # Assumed completion: batch-tokenize the "text" column with the given settings.
    def tokenize(examples: dict) -> dict:
        return tokenizer(examples["text"], max_length=max_length, truncation=truncation)
    return ds.map(tokenize, batched=True)
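A usage sketch, assuming a DatasetDict with a "text" column (such as the toy one above) and the Danish BERT tokenizer that also appears later in this post:

tokenizer = AutoTokenizer.from_pretrained("Maltehb/danish-bert-botxo")
tokenized = tokenize_huggingface_dataset(dataset_dict, tokenizer)
print(tokenized)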
from transformers import AutoTokenizer

# define sample text
text = "Rødgrød med fløde."

# init tokenizer
model_id = "Maltehb/danish-bert-botxo"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# encode text (assumed completion; the original snippet is truncated here)
token_ids = tokenizer(text)["input_ids"]
print(tokenizer.convert_ids_to_tokens(token_ids))
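As a complementary check (not in the original snippet), the ids can be decoded back to text:

print(tokenizer.decode(token_ids, skip_special_tokens=True))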
import time
import statistics
from typing import Callable, Any, Tuple

def time_function(func: Callable, func_input: Any, n_runs: int) -> Tuple[float, float]:
    """Time a function over n_runs and return the mean and standard deviation."""
    times = []
    for _ in range(n_runs):
        start = time.perf_counter()
        func(func_input)
        times.append(time.perf_counter() - start)
    # Assumed completion: summarise the timings as (mean, standard deviation).
    return statistics.mean(times), statistics.stdev(times)
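A usage sketch with an arbitrary workload; the function and input are just for illustration.

mean_time, std_time = time_function(sorted, list(range(100_000)), n_runs=10)
print(f"{mean_time:.4f}s ± {std_time:.4f}s")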
from fastapi import FastAPI
import transformers
import torch

app = FastAPI()

# Load the tokenizer and model ("model_name" is a placeholder for a real model id)
tokenizer = transformers.AutoTokenizer.from_pretrained("model_name")
model = transformers.AutoModelForTokenClassification.from_pretrained("model_name")
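A minimal sketch of how an inference endpoint might continue the module above, assuming a token-classification model; the route name and response format are illustrative choices.

@app.post("/predict")
def predict(text: str) -> list:
    # Tokenize the input, run the model without gradients and map label ids to names.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    label_ids = logits.argmax(dim=-1)[0].tolist()
    return [model.config.id2label[label_id] for label_id in label_ids]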
from typing import Iterator
import pandas as pd

def chunk_dataframe(df: pd.DataFrame, chunk_size: int) -> Iterator[pd.DataFrame]:
    """Yield successive chunks of chunk_size rows from the dataframe."""
    for i in range(0, len(df), chunk_size):
        yield df[i:i + chunk_size]
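A usage sketch for processing a large dataframe in bounded chunks; `big_df` and the chunk size are placeholders.

big_df = pd.DataFrame({"text": ["..."] * 10_000})
for chunk in chunk_dataframe(big_df, chunk_size=1_000):
    print(len(chunk))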
from pydantic import BaseModel, Field
import json

class ReceiptDataExtractor(BaseModel):
    date: str = Field(description="The date.")
    amount: float = Field(description="The total amount.")
    supplier: str = Field(description="The supplier.")

    class Config:
        # Assumed completion: the keys below are illustrative metadata for a
        # function-calling style API, since the original snippet is truncated here.
        function_meta = {
            "name": "extract_receipt_data",
            "description": "Extract the date, total amount and supplier from a receipt.",
        }
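A hedged sketch of how the model above might be serialised into a function-calling specification; the payload shape is an assumption, not from the original.

function_spec = {
    **ReceiptDataExtractor.Config.function_meta,
    "parameters": ReceiptDataExtractor.schema(),
}
print(json.dumps(function_spec, indent=2))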